Preparing the data for analysis
Setting up
> library(limma)
> library(edgeR)
> library(DESeq2)
> library(baySeq)
> library(RColorBrewer)
> library(VennDiagram)
> dir.create("Output", showWarnings=FALSE)
Load data
> load("RSCE.mRNA.unstranded.rda")
> RSCE.mRNA.unstranded$samples[,1:6]
group lib.size norm.factors Sample Replicate Mixture
R1.000_C5N4YANXX_GATCAGAT_R1.bam 0 25600282 1 R1.000 R1 0
R1.025_C5N4YANXX_ACTTGAAT_R1.bam 25 60222782 1 R1.025 R1 25
R1.050_C5N4YANXX_CAGATCAT_R1.bam 50 54554929 1 R1.050 R1 50
R1.075_C5N4YANXX_GCCAATAT_R1.bam 75 33094969 1 R1.075 R1 75
R1.100_C5N4YANXX_ACAGTGAT_R1.bam 100 27694769 1 R1.100 R1 100
R2.000_C5N4YANXX_AGTTCCGT_R1.bam 0 27873127 1 R2.000 R2 0
R2.025_C5N4YANXX_AGTCAACA_R1.bam 25 36136617 1 R2.025 R2 25
R2.050_C5N4YANXX_CTTGTAAT_R1.bam 50 40846361 1 R2.050 R2 50
R2.075_C5N4YANXX_GGCTACAT_R1.bam 75 54493527 1 R2.075 R2 75
R2.100_C5N4YANXX_TAGCTTAT_R1.bam 100 25496530 1 R2.100 R2 100
R2D.000_C5N4YANXX_ATTCCTTT_R1.bam 0 26148635 1 R2D.000 R2D 0
R2D.025_C5N4YANXX_ACTGATAT_R1.bam 25 25894620 1 R2D.025 R2D 25
R2D.050_C5N4YANXX_GAGTGGAT_R1.bam 50 28414709 1 R2D.050 R2D 50
R2D.075_C5N4YANXX_CGTACGTA_R1.bam 75 26381574 1 R2D.075 R2D 75
R2D.100_C5N4YANXX_GTTTCGGA_R1.bam 100 21204180 1 R2D.100 R2D 100
R3.000_C5N4YANXX_GTGGCCTT_R1.bam 0 26702927 1 R3.000 R3 0
R3.025_C5N4YANXX_GTGAAACG_R1.bam 25 43051311 1 R3.025 R3 25
R3.050_C5N4YANXX_GTCCGCAC_R1.bam 50 37562487 1 R3.050 R3 50
R3.075_C5N4YANXX_CCGTCCCG_R1.bam 75 34298214 1 R3.075 R3 75
R3.100_C5N4YANXX_ATGTCAGA_R1.bam 100 28498549 1 R3.100 R3 100
Multi-dimensional scaling plot
> lcpm <- cpm(RSCE.mRNA.unstranded, log=TRUE)
> pchby <- as.numeric(grepl("D", RSCE.mRNA.unstranded$samples$Sample))+1
> colby <- as.factor(RSCE.mRNA.unstranded$samples$group)
> labelby.group <- paste0(as.numeric(levels(colby)), ":", 100-as.numeric(levels(colby)))
> palette(rainbow(5))
> plotMDS(lcpm, pch=as.numeric(pchby)+15, col=as.numeric(colby), main="Poly-A")
> legend("bottomleft", text.col=unique(as.numeric(colby)), legend=labelby.group, bty="n")
Clean up gene annotations
> RSCE.mRNA.unstranded$genes <- RSCE.mRNA.unstranded$genes[,c("GeneID", "Symbols")]
Filter out lowly expressed genes
> keep.exprs <- rowSums(cpm(RSCE.mRNA.unstranded)>1)>=3
> table(keep.exprs)
keep.exprs
FALSE TRUE
8688 14981
> genes.removed <- RSCE.mRNA.unstranded$genes$GeneID[!keep.exprs]
> save(genes.removed, file="./Output/genes_removed_from_analysis.RData")
> RSCE.mRNA.unstranded <- RSCE.mRNA.unstranded[keep.exprs,]
Normalisation
> RSCE.mRNA.unstranded <- calcNormFactors(RSCE.mRNA.unstranded)
Set comparisons to be carried out downstream
> comparisons <- c("100vs000", "075vs025", "050vs025", "075vs050", "100vs000.D", "075vs025.D", "050vs025.D", "075vs050.D")
> sample <- strsplit2(RSCE.mRNA.unstranded$samples$Sample, split="\\.")
> colnames(sample) <- c("Batch", "Mixture")
> sample <- as.data.frame(sample)
> sample
Batch Mixture
1 R1 000
2 R1 025
3 R1 050
4 R1 075
5 R1 100
6 R2 000
7 R2 025
8 R2 050
9 R2 075
10 R2 100
11 R2D 000
12 R2D 025
13 R2D 050
14 R2D 075
15 R2D 100
16 R3 000
17 R3 025
18 R3 050
19 R3 075
20 R3 100
DE methods
voom
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ design <- model.matrix(~group)
+ v <- voom(data, design)
+ fit <- lmFit(v, design)
+ fit <- eBayes(fit)
+ tt <- topTable(fit, coef=ncol(design), n=Inf)
+ colnames(tt)[colnames(tt)=="adj.P.Val"] <- "FDR"
+ table[[c]] <- tt
+ save(table, file="./Output/voom.RData")
+ }
== Running comparison 100vs000 ==
== Running comparison 075vs025 ==
== Running comparison 050vs025 ==
== Running comparison 075vs050 ==
== Running comparison 100vs000.D ==
== Running comparison 075vs025.D ==
== Running comparison 050vs025.D ==
== Running comparison 075vs050.D ==
voom with quality weights (voom-qw)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ design <- model.matrix(~group)
+ v <- voomWithQualityWeights(data, design)
+ fit <- lmFit(v, design)
+ fit <- eBayes(fit)
+ tt <- topTable(fit, coef=ncol(design), n=Inf)
+ colnames(tt)[colnames(tt)=="adj.P.Val"] <- "FDR"
+ table[[c]] <- tt
+ save(table, file="./Output/voom_qw.RData")
+ }
== Running comparison 100vs000 ==
== Running comparison 075vs025 ==
== Running comparison 050vs025 ==
== Running comparison 075vs050 ==
== Running comparison 100vs000.D ==
== Running comparison 075vs025.D ==
== Running comparison 050vs025.D ==
== Running comparison 075vs050.D ==
edgeR with generalised linear models (edgeR-glm)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ design <- model.matrix(~group)
+ data <- estimateGLMCommonDisp(data, design)
+ data <- estimateGLMTrendedDisp(data, design)
+ data <- estimateGLMTagwiseDisp(data, design)
+ fit <- glmFit(data, design)
+ lrt <- glmLRT(fit)
+ tt <- topTags(lrt, n=Inf)
+ tt <- as.data.frame(tt)
+ table[[c]] <- tt
+ save(table, file="./Output/edger_glm.RData")
+ }
== Running comparison 100vs000 ==
== Running comparison 075vs025 ==
== Running comparison 050vs025 ==
== Running comparison 075vs050 ==
== Running comparison 100vs000.D ==
== Running comparison 075vs025.D ==
== Running comparison 050vs025.D ==
== Running comparison 075vs050.D ==
edgeR using exact tests (edgeR-classic)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ design <- model.matrix(~group)
+ data <- estimateCommonDisp(data)
+ data <- estimateTagwiseDisp(data)
+ et <- exactTest(data)
+ tt <- topTags(et, n=Inf)
+ tt <- as.data.frame(tt)
+ table[[c]] <- tt
+ save(table, file="./Output/edger_classic.RData")
+ }
== Running comparison 100vs000 ==
== Running comparison 075vs025 ==
== Running comparison 050vs025 ==
== Running comparison 075vs050 ==
== Running comparison 100vs000.D ==
== Running comparison 075vs025.D ==
== Running comparison 050vs025.D ==
== Running comparison 075vs050.D ==
DESeq2
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ levels(data$samples$group) <- c("low", "high")
+ dds <- DESeqDataSetFromMatrix(data$counts, colData=data$samples, design=~group)
+ mcols(dds) <- DataFrame(mcols(dds), data$genes)
+ dds <- DESeq(dds)
+ res <- results(dds, contrast=c("group", "high", "low"))
+ tt <- as.data.frame(res)
+ tt$GeneID <- rownames(tt)
+ colnames(tt)[colnames(tt)=="padj"] <- "FDR"
+ table[[c]] <- tt
+ save(table, file="./Output/deseq2.RData")
+ }
== Running comparison 100vs000 ==
== Running comparison 075vs025 ==
== Running comparison 050vs025 ==
== Running comparison 075vs050 ==
== Running comparison 100vs000.D ==
== Running comparison 075vs025.D ==
== Running comparison 050vs025.D ==
== Running comparison 075vs050.D ==
baySeq on a normal distribution (bayseq-norm)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ CD <- new("countData", data=data$counts,
+ replicates=group, groups=list(NDE=rep(1, 6), DE=as.numeric(group)))
+ CD@annotation <- data$genes
+ libsizes(CD) <- getLibsizes(CD)
+ densityFunction(CD) <- normDensity
+ cl <- makeCluster(8)
+ normCD <- getPriors(CD, cl=cl)
+ normCD <- getLikelihoods(normCD, cl=cl)
+ tt <- topCounts(normCD, group="DE", number=nrow(data))
+ colnames(tt)[colnames(tt)=="FDR.DE"] <- "FDR"
+ table[[c]] <- tt
+ save(table, file="./Output/bayseq_norm.RData")
+ }
== Running comparison 100vs000 ==
.== Running comparison 075vs025 ==
.== Running comparison 050vs025 ==
.== Running comparison 075vs050 ==
.== Running comparison 100vs000.D ==
.== Running comparison 075vs025.D ==
.== Running comparison 050vs025.D ==
.== Running comparison 075vs050.D ==
.
baySeq on a negative binomial distribution (bayseq-nb)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ CD <- new("countData", data=data$counts,
+ replicates=group, groups=list(NDE=rep(1, 6), DE=as.numeric(group)))
+ CD@annotation <- data$genes
+ libsizes(CD) <- getLibsizes(CD)
+ cl <- makeCluster(8)
+ CD <- getPriors.NB(CD, cl=cl)
+ CD <- getLikelihoods(CD, cl=cl)
+ tt <- topCounts(CD, group="DE", number=nrow(data))
+ colnames(tt)[colnames(tt)=="FDR.DE"] <- "FDR"
+ table[[c]] <- tt
+ save(table, file="./Output/bayseq_nb.RData")
+ }
== Running comparison 100vs000 ==
.== Running comparison 075vs025 ==
.== Running comparison 050vs025 ==
.== Running comparison 075vs050 ==
.== Running comparison 100vs000.D ==
.== Running comparison 075vs025.D ==
.== Running comparison 050vs025.D ==
.== Running comparison 075vs050.D ==
.