> library(limma)
> library(edgeR)
> library(DEXSeq)
> library(RColorBrewer)
> library(VennDiagram)
> library(BiocParallel)
> dir.create("Output", showWarnings=FALSE)
> load("RSCE.mRNA.unstranded.exon.rda")
> RSCE.mRNA.unstranded$samples[,1:6]
group lib.size norm.factors Sample Replicate Mixture
R1.000_C5N4YANXX_GATCAGAT_R1.bam 0 25596249 1 R1.000 R1 0
R1.025_C5N4YANXX_ACTTGAAT_R1.bam 25 60211551 1 R1.025 R1 25
R1.050_C5N4YANXX_CAGATCAT_R1.bam 50 54543829 1 R1.050 R1 50
R1.075_C5N4YANXX_GCCAATAT_R1.bam 75 33087367 1 R1.075 R1 75
R1.100_C5N4YANXX_ACAGTGAT_R1.bam 100 27687770 1 R1.100 R1 100
R2.000_C5N4YANXX_AGTTCCGT_R1.bam 0 27868893 1 R2.000 R2 0
R2.025_C5N4YANXX_AGTCAACA_R1.bam 25 36130329 1 R2.025 R2 25
R2.050_C5N4YANXX_CTTGTAAT_R1.bam 50 40838511 1 R2.050 R2 50
R2.075_C5N4YANXX_GGCTACAT_R1.bam 75 54481749 1 R2.075 R2 75
R2.100_C5N4YANXX_TAGCTTAT_R1.bam 100 25490533 1 R2.100 R2 100
R2D.000_C5N4YANXX_ATTCCTTT_R1.bam 0 26144678 1 R2D.000 R2D 0
R2D.025_C5N4YANXX_ACTGATAT_R1.bam 25 25890044 1 R2D.025 R2D 25
R2D.050_C5N4YANXX_GAGTGGAT_R1.bam 50 28409164 1 R2D.050 R2D 50
R2D.075_C5N4YANXX_CGTACGTA_R1.bam 75 26375545 1 R2D.075 R2D 75
R2D.100_C5N4YANXX_GTTTCGGA_R1.bam 100 21198960 1 R2D.100 R2D 100
R3.000_C5N4YANXX_GTGGCCTT_R1.bam 0 26699053 1 R3.000 R3 0
R3.025_C5N4YANXX_GTGAAACG_R1.bam 25 43044202 1 R3.025 R3 25
R3.050_C5N4YANXX_GTCCGCAC_R1.bam 50 37555387 1 R3.050 R3 50
R3.075_C5N4YANXX_CCGTCCCG_R1.bam 75 34291008 1 R3.075 R3 75
R3.100_C5N4YANXX_ATGTCAGA_R1.bam 100 28492313 1 R3.100 R3 100
> lcpm <- cpm(RSCE.mRNA.unstranded, log=TRUE)
> pchby <- as.numeric(grepl("D", RSCE.mRNA.unstranded$samples$Sample))+1
> colby <- as.factor(RSCE.mRNA.unstranded$samples$group)
> labelby.group <- paste0(as.numeric(levels(colby)), ":", 100-as.numeric(levels(colby)))
> palette(rainbow(5))
> plotMDS(lcpm, pch=as.numeric(pchby)+15, col=as.numeric(colby), main="Poly-A")
> legend("topleft", text.col=unique(as.numeric(colby)), legend=labelby.group, bty="n")
> keep.exprs <- rowSums(cpm(RSCE.mRNA.unstranded)>0.125)>=10
> table(keep.exprs)
keep.exprs
FALSE TRUE
61169 138042
> RSCE.mRNA.unstranded <- RSCE.mRNA.unstranded[keep.exprs,]
> RSCE.mRNA.unstranded <- calcNormFactors(RSCE.mRNA.unstranded)
> comparisons <- c("100vs000", "075vs025", "050vs025", "075vs050")
> sample <- strsplit2(RSCE.mRNA.unstranded$samples$Sample, split="\\.")
> colnames(sample) <- c("Batch", "Mixture")
> sample <- as.data.frame(sample)
> sample
Batch Mixture
1 R1 000
2 R1 025
3 R1 050
4 R1 075
5 R1 100
6 R2 000
7 R2 025
8 R2 050
9 R2 075
10 R2 100
11 R2D 000
12 R2D 025
13 R2D 050
14 R2D 075
15 R2D 100
16 R3 000
17 R3 025
18 R3 050
19 R3 075
20 R3 100
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ design <- model.matrix(~group)
+ v <- voom(data, design)
+ fit <- lmFit(v, design)
+ ds <- diffSplice(fit, geneid="GeneID")
+ tt <- topSplice(ds, test="F", n=Inf)
+ table[[c]] <- tt
+ save(comparisons, table, file="./Output/diffsplice.f.RData")
+ }
== Running comparison 100vs000 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
== Running comparison 075vs025 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
== Running comparison 050vs025 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
== Running comparison 075vs050 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ design <- model.matrix(~group)
+ y <- estimateDisp(data, design=design, robust=TRUE)
+ fit <- glmQLFit(y, design=design, robust=TRUE)
+ ds <- diffSpliceDGE(fit, geneid="GeneID")
+ tt <- topSpliceDGE(ds, test="gene", n=Inf)
+ table[[c]] <- tt
+ save(comparisons, table, file="./Output/edger.RData")
+ }
== Running comparison 100vs000 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
== Running comparison 075vs025 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
== Running comparison 050vs025 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
== Running comparison 075vs050 ==
Total number of exons: 138042
Total number of genes: 16500
Number of genes with 1 exon: 3113
Mean number of exons in a gene: 8
Max number of exons in a gene: 111
> ncores <- MulticoreParam(workers=4)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+ cat("== Running comparison", c, "== \n")
+ batch <- paste0("R2", substr(c, start=10, stop=10))
+ batch <- c(batch, "R1", "R3")
+ name <- gsub(".D", "", c)
+ select <- c(strsplit2(name, split="vs"))
+ sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+ data <- RSCE.mRNA.unstranded[,sample.select]
+ group <- data$samples$group
+ dex.samples <- data.frame(cbind(condition=group))
+ dex.samples$condition <- as.factor(dex.samples$condition)
+ dexds <- DEXSeqDataSet(countData=data$counts, sampleData=dex.samples,
+ design=~sample+exon+condition:exon, featureID=as.character(1:nrow(data)),
+ groupID=as.character(data$genes$GeneID))
+ dexds <- estimateSizeFactors(dexds)
+ dexds <- estimateDispersions(dexds, BPPARAM=ncores)
+ dexds <- testForDEU(dexds, BPPARAM=ncores)
+ dexds <- estimateExonFoldChanges(dexds, BPPARAM=ncores)
+ dex.exon <- DEXSeqResults(dexds)
+ dex.gene <- perGeneQValue(dex.exon, p="pvalue")
+ dex.gene <- as.data.frame(dex.gene)
+ dex.gene <- data.frame(rownames(dex.gene), dex.gene)
+ colnames(dex.gene) <- c("GeneID","FDR")
+ o <- order(dex.gene$FDR)
+ tt <- dex.gene[o,]
+ table[[c]] <- tt
+ save(comparisons, table, file="./Output/dexseq.RData")
+ }
== Running comparison 100vs000 ==
== Running comparison 075vs025 ==
== Running comparison 050vs025 ==
== Running comparison 075vs050 ==
> files <- list.files(path="./Output", full.name=TRUE)
> methods <- gsub(".RData", "", strsplit2(files, split="/")[,3])
> methods.label <- c("DEXSeq", "voom-ds", "edgeR-ds")
> names(methods.label) <- methods
> methods.label
dexseq diffsplice.f edger
"DEXSeq" "voom-ds" "edgeR-ds"
> for (i in 1:length(methods)) {
+ load(files[i])
+ assign(methods[i], table)
+ }
> cutoff <- 0.05
> reference <- as.list(rep(NA, length(methods)))
> names(reference) <- methods
> for (m in methods) {
+ this.reference <- get(m)[["100vs000"]]
+ this.reference <- this.reference$GeneID[this.reference$FDR<cutoff]
+ reference[[m]] <- as.character(this.reference)
+ }
> recovery <- matrix(NA, nrow=length(comparisons), ncol=length(methods))
> rownames(recovery) <- comparisons
> colnames(recovery) <- methods
> inconsistency <- truepositives <- falsediscoveries <- recovery
> for (m in methods) {
+ this.reference <- reference[[m]]
+ for (c in comparisons) {
+ results <- get(m)[[c]]
+ results.detected <- results$GeneID[results$FDR<cutoff]
+ tp <- sum(results.detected %in% this.reference)
+ recovery[c,m] <- tp/length(this.reference)
+ nd <- length(results.detected)
+ fd <- sum(!results.detected %in% this.reference)
+ inconsistency[c,m] <- fd/nd
+ truepositives[c,m] <- tp
+ falsediscoveries[c,m] <- fd
+ }
+ }
> recovery
dexseq diffsplice.f edger
100vs000 1.000 1.000 1.000
075vs025 0.353 0.221 0.286
050vs025 0.057 0.016 0.023
075vs050 0.053 0.017 0.022
> inconsistency
dexseq diffsplice.f edger
100vs000 0.00 0.000 0.000
075vs025 0.37 0.078 0.089
050vs025 0.39 0.000 0.053
075vs050 0.27 0.000 0.000
> falsediscoveries
dexseq diffsplice.f edger
100vs000 0 0 0
075vs025 291 14 22
050vs025 52 0 1
075vs050 28 0 0
> truepositives
dexseq diffsplice.f edger
100vs000 1428 750 788
075vs025 504 166 225
050vs025 81 12 18
075vs050 75 13 17
> totaldetected <- falsediscoveries + truepositives
> totaldetected
dexseq diffsplice.f edger
100vs000 1428 750 788
075vs025 795 180 247
050vs025 133 12 19
075vs050 103 13 17
> col <- brewer.pal(9, "Paired")[c(9,5,4)]
> names(col) <- methods
> lty <- c(1,1,3)
> par(mfrow=c(1,2))
> par(mar=c(6,4.5,2,0.5))
> matplot(recovery, type="l", lty=lty, lwd=2, col=col, ylim=c(0,1), ylab="", xaxt="n", yaxt="n")
> title(ylab="Recovery rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.25,0.5,0.75,1), paste0(100*c(0,0.25,0.5,0.75,1), "%"), las=1)
> legend("topright", text.col=col, lty=lty, col=col, legend=methods.label, cex=0.7, bty="n")
> par(mar=c(6,4.9,2,0.1))
> matplot(inconsistency, type="l", lty=lty, lwd=2, col=col, ylim=c(0,0.4), ylab="", xaxt="n", yaxt="n")
> title(ylab="Inconsistency rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.1,0.2,0.3,0.4), paste0(100*c(0,0.1,0.2,0.3,0.4), "%"), las=1)
> reference.all <- unique(unlist(reference))
> names <- methods
> venncounts <- matrix(0, nrow=length(reference.all), ncol=length(names))
> colnames(venncounts) <- names
> for (i in names) venncounts[,i] <- as.numeric(reference.all %in% reference[[i]])
> venn.plot <- draw.triple.venn(
+ area1 = sum(venncounts[,1]), area2 = sum(venncounts[,2]), area3 = sum(venncounts[,3]),
+ n12 = sum(venncounts[,1]==1 & venncounts[,2]==1),
+ n13 = sum(venncounts[,1]==1 & venncounts[,3]==1),
+ n23 = sum(venncounts[,2]==1 & venncounts[,3]==1),
+ n123 = sum(venncounts[,1]==1 & venncounts[,2]==1 & venncounts[,3]==1),
+ category = methods.label[names], fill = col[names],
+ lty = 1, cex = 2, cat.cex = 2, cat.col = "black"
+ )
Total run time for this report: 56 minutes
> sessionInfo()
R version 3.3.1 (2016-06-21)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.3 (El Capitan)
locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
attached base packages:
[1] grid stats4 parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] VennDiagram_1.6.17 futile.logger_1.4.1 DEXSeq_1.18.4
[4] RColorBrewer_1.1-2 AnnotationDbi_1.34.3 DESeq2_1.12.3
[7] SummarizedExperiment_1.2.3 GenomicRanges_1.24.2 GenomeInfoDb_1.8.1
[10] IRanges_2.6.1 S4Vectors_0.10.1 Biobase_2.32.0
[13] BiocGenerics_0.18.0 BiocParallel_1.6.2 edgeR_3.14.0
[16] limma_3.28.14
loaded via a namespace (and not attached):
[1] genefilter_1.54.2 statmod_1.4.24 locfit_1.5-9.1 splines_3.3.1
[5] lattice_0.20-33 colorspace_1.2-6 htmltools_0.3.5 yaml_2.1.13
[9] chron_2.3-47 survival_2.39-5 XML_3.98-1.4 foreign_0.8-66
[13] DBI_0.4-1 lambda.r_1.1.7 plyr_1.8.4 stringr_1.0.0
[17] zlibbioc_1.18.0 Biostrings_2.40.2 munsell_0.4.3 gtable_0.2.0
[21] hwriter_1.3.2 evaluate_0.9 latticeExtra_0.6-28 knitr_1.13
[25] geneplotter_1.50.0 biomaRt_2.28.0 Rcpp_0.12.5 acepack_1.3-3.3
[29] xtable_1.8-2 scales_0.4.0 formatR_1.4 Hmisc_3.17-4
[33] annotate_1.50.0 XVector_0.12.0 Rsamtools_1.24.0 gridExtra_2.2.1
[37] ggplot2_2.1.0 digest_0.6.9 stringi_1.1.1 bitops_1.0-6
[41] tools_3.3.1 magrittr_1.5 RCurl_1.95-4.8 RSQLite_1.0.0
[45] futile.options_1.0.0 Formula_1.2-1 cluster_2.0.4 Matrix_1.2-6
[49] data.table_1.9.6 rmarkdown_1.0 rpart_4.1-10 nnet_7.3-12