Preparing the data for analysis

Setting up

> library(limma)
> library(edgeR)
> library(DEXSeq)
> library(RColorBrewer)
> library(VennDiagram)
> library(BiocParallel)
> dir.create("Output_using_degraded_samples", showWarnings=FALSE)

Load data

> load("RSCE.mRNA.unstranded.exon.rda") 
> keep.exprs <- rowSums(cpm(RSCE.mRNA.unstranded)>0.125)>=10
> table(keep.exprs)
keep.exprs
 FALSE   TRUE 
 61169 138042 
> RSCE.mRNA.unstranded <- RSCE.mRNA.unstranded[keep.exprs,]
> RSCE.mRNA.unstranded <- calcNormFactors(RSCE.mRNA.unstranded)

Set comparisons to be carried out downstream

> comparisons <- c("100vs000.D", "075vs025.D", "050vs025.D", "075vs050.D")
> sample <- strsplit2(RSCE.mRNA.unstranded$samples$Sample, split="\\.")
> colnames(sample) <- c("Batch", "Mixture")
> sample <- as.data.frame(sample)

Differential splicing analysis

Voom-diffSplice using method of F-tests (voom-ds)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   v <- voom(data, design)
+   fit <- lmFit(v, design)
+   ds <- diffSplice(fit, geneid="GeneID")
+   tt <- topSplice(ds, test="F", n=Inf)
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output_using_degraded_samples/diffsplice.f.RData")
+ }
== Running comparison 100vs000.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 050vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs050.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 

EdgeR-diffSplice using method of F-tests (edgeR-ds)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   y <- estimateDisp(data, design=design, robust=TRUE)
+   fit <- glmQLFit(y, design=design, robust=TRUE)
+   ds <- diffSpliceDGE(fit, geneid="GeneID")
+   tt <- topSpliceDGE(ds, test="gene", n=Inf)
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output_using_degraded_samples/edger.RData")
+ }
== Running comparison 100vs000.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 050vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs050.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 

DEXSeq

> ncores <- MulticoreParam(workers=4)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   dex.samples <- data.frame(cbind(condition=group))
+   dex.samples$condition <- as.factor(dex.samples$condition)
+   dexds <- DEXSeqDataSet(countData=data$counts, sampleData=dex.samples, 
+     design=~sample+exon+condition:exon, featureID=as.character(1:nrow(data)), 
+     groupID=as.character(data$genes$GeneID))
+   dexds <- estimateSizeFactors(dexds) 
+   dexds <- estimateDispersions(dexds, BPPARAM=ncores) 
+   dexds <- testForDEU(dexds, BPPARAM=ncores) 
+   dexds <- estimateExonFoldChanges(dexds, BPPARAM=ncores) 
+   dex.exon <- DEXSeqResults(dexds)
+   dex.gene <- perGeneQValue(dex.exon, p="pvalue") 
+   dex.gene <- as.data.frame(dex.gene) 
+   dex.gene <- data.frame(rownames(dex.gene), dex.gene) 
+   colnames(dex.gene) <- c("GeneID","FDR") 
+   o <- order(dex.gene$FDR)
+   tt <- dex.gene[o,]
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output_using_degraded_samples/dexseq.RData")
+ }
== Running comparison 100vs000.D == 
== Running comparison 075vs025.D == 
== Running comparison 050vs025.D == 
== Running comparison 075vs050.D == 

Recovery and inconsistency rates

Load saved DE results without degraded samples

> files <- list.files(path="./Output", full.name=TRUE)
> methods <- gsub(".RData", "", strsplit2(files, split="/")[,3])
> methods.label <- c("DEXSeq", "voom-ds", "edgeR-ds")
> names(methods.label) <- methods
> methods.label
      dexseq diffsplice.f        edger 
    "DEXSeq"    "voom-ds"   "edgeR-ds" 
> for (i in 1:length(methods)) {
+   load(files[i])
+   assign(methods[i], table)
+ }

Create sets of reference genes

> cutoff <- 0.05
> reference <- as.list(rep(NA, length(methods)))
> names(reference) <- methods
> for (m in methods) {
+   this.reference <- get(m)[["100vs000"]]
+   this.reference <- this.reference$GeneID[this.reference$FDR<cutoff]
+   reference[[m]] <- as.character(this.reference)
+ }

Load saved DE results using degraded samples

> files <- list.files(path="./Output_using_degraded_samples", full.name=TRUE)
> methods <- gsub(".RData", "", strsplit2(files, split="/")[,3])
> name.comparisons <- gsub(".D", "d", comparisons)
> for (i in 1:length(methods)) {
+   load(files[i])
+   assign(methods[i], table)
+ }

Calculate recovery and inconsistency rates

> recovery <- matrix(NA, nrow=length(comparisons), ncol=length(methods))
> rownames(recovery) <- comparisons
> colnames(recovery) <- methods
> inconsistency <- truepositives <- falsediscoveries <- recovery
> for (m in methods) {
+   this.reference <- reference[[m]]
+   for (c in comparisons) {
+       results <- get(m)[[c]]
+       results.detected <- results$GeneID[results$FDR<cutoff]
+       tp <- sum(results.detected %in% this.reference)
+       recovery[c,m] <- tp/length(this.reference)
+       nd <- length(results.detected)
+       fd <- sum(!results.detected %in% this.reference)
+       inconsistency[c,m] <- fd/nd
+       truepositives[c,m] <- tp
+       falsediscoveries[c,m] <- fd
+   }
+ } 
> recovery
           dexseq diffsplice.f edger
100vs000.D 0.3669       0.3453  0.33
075vs025.D 0.0469       0.0067  0.01
050vs025.D 0.0063       0.0000  0.00
075vs050.D 0.0063       0.0000  0.00
> inconsistency
           dexseq diffsplice.f edger
100vs000.D  0.066        0.048 0.044
075vs025.D  0.000        0.000 0.000
050vs025.D  0.000          NaN   NaN
075vs050.D  0.000          NaN   NaN
> falsediscoveries
           dexseq diffsplice.f edger
100vs000.D     37           13    12
075vs025.D      0            0     0
050vs025.D      0            0     0
075vs050.D      0            0     0
> truepositives
           dexseq diffsplice.f edger
100vs000.D    524          259   260
075vs025.D     67            5     8
050vs025.D      9            0     0
075vs050.D      9            0     0
> totaldetected <- falsediscoveries + truepositives
> totaldetected
           dexseq diffsplice.f edger
100vs000.D    561          272   272
075vs025.D     67            5     8
050vs025.D      9            0     0
075vs050.D      9            0     0

Plots of recovery and inconsistency rates

> col <- brewer.pal(9, "Paired")[c(9,5,4)]
> names(col) <- methods
> lty <- c(1,1,3)
> par(mfrow=c(1,2))
> par(mar=c(6,4.5,2,0.5))
> matplot(recovery, type="l", lty=lty, lwd=2, col=col, ylim=c(0,1), ylab="", xaxt="n", yaxt="n")
> title(ylab="Recovery rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.25,0.5,0.75,1), paste0(100*c(0,0.25,0.5,0.75,1), "%"), las=1)
> legend("topright", text.col=col, lty=lty, col=col, legend=methods.label, cex=0.7, bty="n")
> par(mar=c(6,4.9,2,0.1))
> matplot(inconsistency, type="l", lty=lty, lwd=2, col=col, ylim=c(0,0.4), ylab="", xaxt="n", yaxt="n")
> title(ylab="Inconsistency rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.1,0.2,0.3,0.4), paste0(100*c(0,0.1,0.2,0.3,0.4), "%"), las=1)

Session time and information

Total run time for this report: 39 minutes 
> sessionInfo()
R version 3.3.1 (2016-06-21)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.3 (El Capitan)

locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8

attached base packages:
 [1] grid      stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] VennDiagram_1.6.17         futile.logger_1.4.1        DEXSeq_1.18.4             
 [4] RColorBrewer_1.1-2         AnnotationDbi_1.34.3       DESeq2_1.12.3             
 [7] SummarizedExperiment_1.2.3 GenomicRanges_1.24.2       GenomeInfoDb_1.8.1        
[10] IRanges_2.6.1              S4Vectors_0.10.1           Biobase_2.32.0            
[13] BiocGenerics_0.18.0        BiocParallel_1.6.2         edgeR_3.14.0              
[16] limma_3.28.14             

loaded via a namespace (and not attached):
 [1] genefilter_1.54.2    statmod_1.4.24       locfit_1.5-9.1       splines_3.3.1       
 [5] lattice_0.20-33      colorspace_1.2-6     htmltools_0.3.5      yaml_2.1.13         
 [9] chron_2.3-47         survival_2.39-5      XML_3.98-1.4         foreign_0.8-66      
[13] DBI_0.4-1            lambda.r_1.1.7       plyr_1.8.4           stringr_1.0.0       
[17] zlibbioc_1.18.0      Biostrings_2.40.2    munsell_0.4.3        gtable_0.2.0        
[21] hwriter_1.3.2        evaluate_0.9         latticeExtra_0.6-28  knitr_1.13          
[25] geneplotter_1.50.0   biomaRt_2.28.0       Rcpp_0.12.5          acepack_1.3-3.3     
[29] xtable_1.8-2         scales_0.4.0         formatR_1.4          Hmisc_3.17-4        
[33] annotate_1.50.0      XVector_0.12.0       Rsamtools_1.24.0     gridExtra_2.2.1     
[37] ggplot2_2.1.0        digest_0.6.9         stringi_1.1.1        bitops_1.0-6        
[41] tools_3.3.1          magrittr_1.5         RCurl_1.95-4.8       RSQLite_1.0.0       
[45] futile.options_1.0.0 Formula_1.2-1        cluster_2.0.4        Matrix_1.2-6        
[49] data.table_1.9.6     rmarkdown_1.0        rpart_4.1-10         nnet_7.3-12