Preparing the data for analysis

Setting up

> library(limma)
> library(edgeR)
> library(DESeq2)
> library(baySeq)
> library(RColorBrewer)
> library(VennDiagram)
> dir.create("Output_using_conservative_filter", showWarnings=FALSE)

Load data

> load("RSCE.mRNA.unstranded.rda") 
> RSCE.mRNA.unstranded$samples[,1:6]

                                  group lib.size norm.factors  Sample Replicate Mixture
R1.000_C5N4YANXX_GATCAGAT_R1.bam      0 25600282            1  R1.000        R1       0
R1.025_C5N4YANXX_ACTTGAAT_R1.bam     25 60222782            1  R1.025        R1      25
R1.050_C5N4YANXX_CAGATCAT_R1.bam     50 54554929            1  R1.050        R1      50
R1.075_C5N4YANXX_GCCAATAT_R1.bam     75 33094969            1  R1.075        R1      75
R1.100_C5N4YANXX_ACAGTGAT_R1.bam    100 27694769            1  R1.100        R1     100
R2.000_C5N4YANXX_AGTTCCGT_R1.bam      0 27873127            1  R2.000        R2       0
R2.025_C5N4YANXX_AGTCAACA_R1.bam     25 36136617            1  R2.025        R2      25
R2.050_C5N4YANXX_CTTGTAAT_R1.bam     50 40846361            1  R2.050        R2      50
R2.075_C5N4YANXX_GGCTACAT_R1.bam     75 54493527            1  R2.075        R2      75
R2.100_C5N4YANXX_TAGCTTAT_R1.bam    100 25496530            1  R2.100        R2     100
R2D.000_C5N4YANXX_ATTCCTTT_R1.bam     0 26148635            1 R2D.000       R2D       0
R2D.025_C5N4YANXX_ACTGATAT_R1.bam    25 25894620            1 R2D.025       R2D      25
R2D.050_C5N4YANXX_GAGTGGAT_R1.bam    50 28414709            1 R2D.050       R2D      50
R2D.075_C5N4YANXX_CGTACGTA_R1.bam    75 26381574            1 R2D.075       R2D      75
R2D.100_C5N4YANXX_GTTTCGGA_R1.bam   100 21204180            1 R2D.100       R2D     100
R3.000_C5N4YANXX_GTGGCCTT_R1.bam      0 26702927            1  R3.000        R3       0
R3.025_C5N4YANXX_GTGAAACG_R1.bam     25 43051311            1  R3.025        R3      25
R3.050_C5N4YANXX_GTCCGCAC_R1.bam     50 37562487            1  R3.050        R3      50
R3.075_C5N4YANXX_CCGTCCCG_R1.bam     75 34298214            1  R3.075        R3      75
R3.100_C5N4YANXX_ATGTCAGA_R1.bam    100 28498549            1  R3.100        R3     100

Multi-dimensional scaling plot

> lcpm <- cpm(RSCE.mRNA.unstranded, log=TRUE)
> pchby <- as.numeric(grepl("D", RSCE.mRNA.unstranded$samples$Sample))+1
> colby <- as.factor(RSCE.mRNA.unstranded$samples$group)
> labelby.group <- paste0(as.numeric(levels(colby)), ":", 100-as.numeric(levels(colby)))
> palette(rainbow(5))
> plotMDS(lcpm, pch=as.numeric(pchby)+15, col=as.numeric(colby), main="Poly-A")
> legend("bottomleft", text.col=unique(as.numeric(colby)), legend=labelby.group, bty="n")

Clean up gene annotations

> RSCE.mRNA.unstranded$genes <- RSCE.mRNA.unstranded$genes[,c("GeneID", "Symbols")]

Filter out lowly expressed genes

> keep.exprs <- rowSums(cpm(RSCE.mRNA.unstranded)>0.5)>=3
> table(keep.exprs)

keep.exprs
FALSE  TRUE 
 7538 16131

> genes.removed <- RSCE.mRNA.unstranded$genes$GeneID[!keep.exprs]
> save(genes.removed, file="./Output_using_conservative_filter/genes_removed_from_analysis.RData")
> RSCE.mRNA.unstranded <- RSCE.mRNA.unstranded[keep.exprs,]

Normalisation

> RSCE.mRNA.unstranded <- calcNormFactors(RSCE.mRNA.unstranded)

Set comparisons to be carried out downstream

> comparisons <- c("100vs000", "075vs025", "050vs025", "075vs050", "100vs000.D", "075vs025.D", "050vs025.D", "075vs050.D")
> sample <- strsplit2(RSCE.mRNA.unstranded$samples$Sample, split="\\.")
> colnames(sample) <- c("Batch", "Mixture")
> sample <- as.data.frame(sample)
> sample

   Batch Mixture
1     R1     000
2     R1     025
3     R1     050
4     R1     075
5     R1     100
6     R2     000
7     R2     025
8     R2     050
9     R2     075
10    R2     100
11   R2D     000
12   R2D     025
13   R2D     050
14   R2D     075
15   R2D     100
16    R3     000
17    R3     025
18    R3     050
19    R3     075
20    R3     100

DE methods

voom

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   v <- voom(data, design)
+   fit <- lmFit(v, design)
+   fit <- eBayes(fit)
+   tt <- topTable(fit, coef=ncol(design), n=Inf)
+   colnames(tt)[colnames(tt)=="adj.P.Val"] <- "FDR"
+   table[[c]] <- tt
+   save(table, file="./Output_using_conservative_filter/voom.RData")
+ }

== Running comparison 100vs000 == 
== Running comparison 075vs025 == 
== Running comparison 050vs025 == 
== Running comparison 075vs050 == 
== Running comparison 100vs000.D == 
== Running comparison 075vs025.D == 
== Running comparison 050vs025.D == 
== Running comparison 075vs050.D ==

voom with quality weights (voom-qw)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   v <- voomWithQualityWeights(data, design)
+   fit <- lmFit(v, design)
+   fit <- eBayes(fit)
+   tt <- topTable(fit, coef=ncol(design), n=Inf)
+   colnames(tt)[colnames(tt)=="adj.P.Val"] <- "FDR"
+   table[[c]] <- tt
+   save(table, file="./Output_using_conservative_filter/voom_qw.RData")
+ }

== Running comparison 100vs000 == 
== Running comparison 075vs025 == 
== Running comparison 050vs025 == 
== Running comparison 075vs050 == 
== Running comparison 100vs000.D == 
== Running comparison 075vs025.D == 
== Running comparison 050vs025.D == 
== Running comparison 075vs050.D ==

edgeR with generalised linear models (edgeR-glm)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   data <- estimateGLMCommonDisp(data, design)
+   data <- estimateGLMTrendedDisp(data, design)
+   data <- estimateGLMTagwiseDisp(data, design)
+   fit <- glmFit(data, design)
+   lrt <- glmLRT(fit)
+   tt <- topTags(lrt, n=Inf)
+   tt <- as.data.frame(tt)
+   table[[c]] <- tt
+   save(table, file="./Output_using_conservative_filter/edger_glm.RData")
+ }

== Running comparison 100vs000 == 
== Running comparison 075vs025 == 
== Running comparison 050vs025 == 
== Running comparison 075vs050 == 
== Running comparison 100vs000.D == 
== Running comparison 075vs025.D == 
== Running comparison 050vs025.D == 
== Running comparison 075vs050.D ==

edgeR using exact tests (edgeR-classic)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   data <- estimateCommonDisp(data)
+   data <- estimateTagwiseDisp(data)
+   et <- exactTest(data)
+   tt <- topTags(et, n=Inf)
+   tt <- as.data.frame(tt)
+   table[[c]] <- tt
+   save(table, file="./Output_using_conservative_filter/edger_classic.RData")
+ }

== Running comparison 100vs000 == 
== Running comparison 075vs025 == 
== Running comparison 050vs025 == 
== Running comparison 075vs050 == 
== Running comparison 100vs000.D == 
== Running comparison 075vs025.D == 
== Running comparison 050vs025.D == 
== Running comparison 075vs050.D ==

DESeq2

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   levels(data$samples$group) <- c("low", "high")
+   dds <- DESeqDataSetFromMatrix(data$counts, colData=data$samples, design=~group)
+   mcols(dds) <- DataFrame(mcols(dds), data$genes)
+   dds <- DESeq(dds)
+   res <- results(dds, contrast=c("group", "high", "low"))
+   tt <- as.data.frame(res)
+   tt$GeneID <- rownames(tt)
+   colnames(tt)[colnames(tt)=="padj"] <- "FDR"
+   table[[c]] <- tt
+   save(table, file="./Output_using_conservative_filter/deseq2.RData")
+ }

== Running comparison 100vs000 == 
== Running comparison 075vs025 == 
== Running comparison 050vs025 == 
== Running comparison 075vs050 == 
== Running comparison 100vs000.D == 
== Running comparison 075vs025.D == 
== Running comparison 050vs025.D == 
== Running comparison 075vs050.D ==

baySeq on a normal distribution (bayseq-norm)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   CD <- new("countData", data=data$counts, 
+             replicates=group, groups=list(NDE=rep(1, 6), DE=as.numeric(group)))
+   CD@annotation <- data$genes
+   libsizes(CD) <- getLibsizes(CD)
+   densityFunction(CD) <- normDensity
+   cl <- makeCluster(8)
+   normCD <- getPriors(CD, cl=cl)
+   normCD <- getLikelihoods(normCD, cl=cl)
+   tt <- topCounts(normCD, group="DE", number=nrow(data))
+   colnames(tt)[colnames(tt)=="FDR.DE"] <- "FDR"
+   table[[c]] <- tt
+   save(table, file="./Output_using_conservative_filter/bayseq_norm.RData")
+ }

== Running comparison 100vs000 == 
.== Running comparison 075vs025 == 
.== Running comparison 050vs025 == 
.== Running comparison 075vs050 == 
.== Running comparison 100vs000.D == 
.== Running comparison 075vs025.D == 
.== Running comparison 050vs025.D == 
.== Running comparison 075vs050.D == 
.

baySeq on a negative binomial distribution (bayseq-nb)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   CD <- new("countData", data=data$counts, 
+             replicates=group, groups=list(NDE=rep(1, 6), DE=as.numeric(group)))
+   CD@annotation <- data$genes
+   libsizes(CD) <- getLibsizes(CD)
+   cl <- makeCluster(8)
+   CD <- getPriors.NB(CD, cl=cl)
+   CD <- getLikelihoods(CD, cl=cl)
+   tt <- topCounts(CD, group="DE", number=nrow(data))
+   colnames(tt)[colnames(tt)=="FDR.DE"] <- "FDR"
+   table[[c]] <- tt
+   save(table, file="./Output_using_conservative_filter/bayseq_nb.RData")
+ }

== Running comparison 100vs000 == 
.== Running comparison 075vs025 == 
.== Running comparison 050vs025 == 
.== Running comparison 075vs050 == 
.== Running comparison 100vs000.D == 
.== Running comparison 075vs025.D == 
.== Running comparison 050vs025.D == 
.== Running comparison 075vs050.D == 
.

Session time and information

Total run time for this report: 332 minutes

> sessionInfo()

R version 3.3.1 (2016-06-21)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.3 (El Capitan)

locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8

attached base packages:
 [1] grid      parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] VennDiagram_1.6.17         futile.logger_1.4.1        RColorBrewer_1.1-2        
 [4] baySeq_2.6.0               perm_1.0-0.0               abind_1.4-5               
 [7] DESeq2_1.12.3              SummarizedExperiment_1.2.3 Biobase_2.32.0            
[10] GenomicRanges_1.24.2       GenomeInfoDb_1.8.1         IRanges_2.6.1             
[13] S4Vectors_0.10.1           BiocGenerics_0.18.0        edgeR_3.14.0              
[16] limma_3.28.14             

loaded via a namespace (and not attached):
 [1] genefilter_1.54.2    locfit_1.5-9.1       splines_3.3.1        lattice_0.20-33     
 [5] colorspace_1.2-6     htmltools_0.3.5      yaml_2.1.13          chron_2.3-47        
 [9] survival_2.39-5      XML_3.98-1.4         foreign_0.8-66       DBI_0.4-1           
[13] BiocParallel_1.6.2   lambda.r_1.1.7       plyr_1.8.4           stringr_1.0.0       
[17] zlibbioc_1.18.0      munsell_0.4.3        gtable_0.2.0         evaluate_0.9        
[21] latticeExtra_0.6-28  knitr_1.13           geneplotter_1.50.0   AnnotationDbi_1.34.3
[25] Rcpp_0.12.5          acepack_1.3-3.3      xtable_1.8-2         scales_0.4.0        
[29] formatR_1.4          Hmisc_3.17-4         annotate_1.50.0      XVector_0.12.0      
[33] gridExtra_2.2.1      ggplot2_2.1.0        digest_0.6.9         stringi_1.1.1       
[37] tools_3.3.1          magrittr_1.5         RSQLite_1.0.0        Formula_1.2-1       
[41] cluster_2.0.4        futile.options_1.0.0 Matrix_1.2-6         data.table_1.9.6    
[45] rmarkdown_1.0        rpart_4.1-10         nnet_7.3-12

Differential expression analyses on mixology data (using conservative filter)

Charity Law

7 October 2016

Preparing the data for analysis

Setting up

Load data

Multi-dimensional scaling plot

Clean up gene annotations

Filter out lowly expressed genes

Normalisation

Set comparisons to be carried out downstream

DE methods

voom

voom with quality weights (voom-qw)

edgeR with generalised linear models (edgeR-glm)

edgeR using exact tests (edgeR-classic)

DESeq2

baySeq on a normal distribution (bayseq-norm)

baySeq on a negative binomial distribution (bayseq-nb)

Session time and information