Preparing the data for analysis

Setting up

> library(limma)
> library(edgeR)
> library(DEXSeq)
> library(RColorBrewer)
> library(VennDiagram)
> library(BiocParallel)
> dir.create("Output_using_degraded_samples", showWarnings=FALSE)

Load data

> load("RSCE.mRNA.unstranded.exon.rda") 
> keep.exprs <- rowSums(cpm(RSCE.mRNA.unstranded)>0.125)>=10
> table(keep.exprs)

keep.exprs
 FALSE   TRUE 
 61169 138042

> RSCE.mRNA.unstranded <- RSCE.mRNA.unstranded[keep.exprs,]
> RSCE.mRNA.unstranded <- calcNormFactors(RSCE.mRNA.unstranded)

Set comparisons to be carried out downstream

> comparisons <- c("100vs000.D", "075vs025.D", "050vs025.D", "075vs050.D")
> sample <- strsplit2(RSCE.mRNA.unstranded$samples$Sample, split="\\.")
> colnames(sample) <- c("Batch", "Mixture")
> sample <- as.data.frame(sample)

Differential splicing analysis

Voom-diffSplice using method of F-tests (voom-ds)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   v <- voom(data, design)
+   fit <- lmFit(v, design)
+   ds <- diffSplice(fit, geneid="GeneID")
+   tt <- topSplice(ds, test="F", n=Inf)
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output_using_degraded_samples/diffsplice.f.RData")
+ }

== Running comparison 100vs000.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 050vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs050.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111

EdgeR-diffSplice using method of F-tests (edgeR-ds)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   y <- estimateDisp(data, design=design, robust=TRUE)
+   fit <- glmQLFit(y, design=design, robust=TRUE)
+   ds <- diffSpliceDGE(fit, geneid="GeneID")
+   tt <- topSpliceDGE(ds, test="gene", n=Inf)
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output_using_degraded_samples/edger.RData")
+ }

== Running comparison 100vs000.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 050vs025.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs050.D == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111

DEXSeq

> ncores <- MulticoreParam(workers=4)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   dex.samples <- data.frame(cbind(condition=group))
+   dex.samples$condition <- as.factor(dex.samples$condition)
+   dexds <- DEXSeqDataSet(countData=data$counts, sampleData=dex.samples, 
+     design=~sample+exon+condition:exon, featureID=as.character(1:nrow(data)), 
+     groupID=as.character(data$genes$GeneID))
+   dexds <- estimateSizeFactors(dexds) 
+   dexds <- estimateDispersions(dexds, BPPARAM=ncores) 
+   dexds <- testForDEU(dexds, BPPARAM=ncores) 
+   dexds <- estimateExonFoldChanges(dexds, BPPARAM=ncores) 
+   dex.exon <- DEXSeqResults(dexds)
+   dex.gene <- perGeneQValue(dex.exon, p="pvalue") 
+   dex.gene <- as.data.frame(dex.gene) 
+   dex.gene <- data.frame(rownames(dex.gene), dex.gene) 
+   colnames(dex.gene) <- c("GeneID","FDR") 
+   o <- order(dex.gene$FDR)
+   tt <- dex.gene[o,]
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output_using_degraded_samples/dexseq.RData")
+ }

== Running comparison 100vs000.D == 
== Running comparison 075vs025.D == 
== Running comparison 050vs025.D == 
== Running comparison 075vs050.D ==

Recovery and inconsistency rates

Load saved DE results without degraded samples

> files <- list.files(path="./Output", full.name=TRUE)
> methods <- gsub(".RData", "", strsplit2(files, split="/")[,3])
> methods.label <- c("DEXSeq", "voom-ds", "edgeR-ds")
> names(methods.label) <- methods
> methods.label

      dexseq diffsplice.f        edger 
    "DEXSeq"    "voom-ds"   "edgeR-ds"

> for (i in 1:length(methods)) {
+   load(files[i])
+   assign(methods[i], table)
+ }

Create sets of reference genes

> cutoff <- 0.05
> reference <- as.list(rep(NA, length(methods)))
> names(reference) <- methods
> for (m in methods) {
+   this.reference <- get(m)[["100vs000"]]
+   this.reference <- this.reference$GeneID[this.reference$FDR<cutoff]
+   reference[[m]] <- as.character(this.reference)
+ }

Load saved DE results using degraded samples

> files <- list.files(path="./Output_using_degraded_samples", full.name=TRUE)
> methods <- gsub(".RData", "", strsplit2(files, split="/")[,3])
> name.comparisons <- gsub(".D", "d", comparisons)
> for (i in 1:length(methods)) {
+   load(files[i])
+   assign(methods[i], table)
+ }

Calculate recovery and inconsistency rates

> recovery <- matrix(NA, nrow=length(comparisons), ncol=length(methods))
> rownames(recovery) <- comparisons
> colnames(recovery) <- methods
> inconsistency <- truepositives <- falsediscoveries <- recovery
> for (m in methods) {
+   this.reference <- reference[[m]]
+   for (c in comparisons) {
+       results <- get(m)[[c]]
+       results.detected <- results$GeneID[results$FDR<cutoff]
+       tp <- sum(results.detected %in% this.reference)
+       recovery[c,m] <- tp/length(this.reference)
+       nd <- length(results.detected)
+       fd <- sum(!results.detected %in% this.reference)
+       inconsistency[c,m] <- fd/nd
+       truepositives[c,m] <- tp
+       falsediscoveries[c,m] <- fd
+   }
+ } 
> recovery

           dexseq diffsplice.f edger
100vs000.D 0.3669       0.3453  0.33
075vs025.D 0.0469       0.0067  0.01
050vs025.D 0.0063       0.0000  0.00
075vs050.D 0.0063       0.0000  0.00

> inconsistency

           dexseq diffsplice.f edger
100vs000.D  0.066        0.048 0.044
075vs025.D  0.000        0.000 0.000
050vs025.D  0.000          NaN   NaN
075vs050.D  0.000          NaN   NaN

> falsediscoveries

           dexseq diffsplice.f edger
100vs000.D     37           13    12
075vs025.D      0            0     0
050vs025.D      0            0     0
075vs050.D      0            0     0

> truepositives

           dexseq diffsplice.f edger
100vs000.D    524          259   260
075vs025.D     67            5     8
050vs025.D      9            0     0
075vs050.D      9            0     0

> totaldetected <- falsediscoveries + truepositives
> totaldetected

           dexseq diffsplice.f edger
100vs000.D    561          272   272
075vs025.D     67            5     8
050vs025.D      9            0     0
075vs050.D      9            0     0

Plots of recovery and inconsistency rates

> col <- brewer.pal(9, "Paired")[c(9,5,4)]
> names(col) <- methods
> lty <- c(1,1,3)
> par(mfrow=c(1,2))
> par(mar=c(6,4.5,2,0.5))
> matplot(recovery, type="l", lty=lty, lwd=2, col=col, ylim=c(0,1), ylab="", xaxt="n", yaxt="n")
> title(ylab="Recovery rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.25,0.5,0.75,1), paste0(100*c(0,0.25,0.5,0.75,1), "%"), las=1)
> legend("topright", text.col=col, lty=lty, col=col, legend=methods.label, cex=0.7, bty="n")
> par(mar=c(6,4.9,2,0.1))
> matplot(inconsistency, type="l", lty=lty, lwd=2, col=col, ylim=c(0,0.4), ylab="", xaxt="n", yaxt="n")
> title(ylab="Inconsistency rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.1,0.2,0.3,0.4), paste0(100*c(0,0.1,0.2,0.3,0.4), "%"), las=1)

Session time and information

Total run time for this report: 39 minutes

> sessionInfo()

R version 3.3.1 (2016-06-21)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.3 (El Capitan)

locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8

attached base packages:
 [1] grid      stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] VennDiagram_1.6.17         futile.logger_1.4.1        DEXSeq_1.18.4             
 [4] RColorBrewer_1.1-2         AnnotationDbi_1.34.3       DESeq2_1.12.3             
 [7] SummarizedExperiment_1.2.3 GenomicRanges_1.24.2       GenomeInfoDb_1.8.1        
[10] IRanges_2.6.1              S4Vectors_0.10.1           Biobase_2.32.0            
[13] BiocGenerics_0.18.0        BiocParallel_1.6.2         edgeR_3.14.0              
[16] limma_3.28.14             

loaded via a namespace (and not attached):
 [1] genefilter_1.54.2    statmod_1.4.24       locfit_1.5-9.1       splines_3.3.1       
 [5] lattice_0.20-33      colorspace_1.2-6     htmltools_0.3.5      yaml_2.1.13         
 [9] chron_2.3-47         survival_2.39-5      XML_3.98-1.4         foreign_0.8-66      
[13] DBI_0.4-1            lambda.r_1.1.7       plyr_1.8.4           stringr_1.0.0       
[17] zlibbioc_1.18.0      Biostrings_2.40.2    munsell_0.4.3        gtable_0.2.0        
[21] hwriter_1.3.2        evaluate_0.9         latticeExtra_0.6-28  knitr_1.13          
[25] geneplotter_1.50.0   biomaRt_2.28.0       Rcpp_0.12.5          acepack_1.3-3.3     
[29] xtable_1.8-2         scales_0.4.0         formatR_1.4          Hmisc_3.17-4        
[33] annotate_1.50.0      XVector_0.12.0       Rsamtools_1.24.0     gridExtra_2.2.1     
[37] ggplot2_2.1.0        digest_0.6.9         stringi_1.1.1        bitops_1.0-6        
[41] tools_3.3.1          magrittr_1.5         RCurl_1.95-4.8       RSQLite_1.0.0       
[45] futile.options_1.0.0 Formula_1.2-1        cluster_2.0.4        Matrix_1.2-6        
[49] data.table_1.9.6     rmarkdown_1.0        rpart_4.1-10         nnet_7.3-12

Differential splicing analyses on mixology data (using degraded samples)

Charity Law

30 August 2016

Preparing the data for analysis

Setting up

Load data

Set comparisons to be carried out downstream

Differential splicing analysis

Voom-diffSplice using method of F-tests (voom-ds)

EdgeR-diffSplice using method of F-tests (edgeR-ds)

DEXSeq

Recovery and inconsistency rates

Load saved DE results without degraded samples

Create sets of reference genes

Load saved DE results using degraded samples

Calculate recovery and inconsistency rates

Plots of recovery and inconsistency rates

Session time and information