Preparing the data for analysis

Setting up

> library(limma)
> library(edgeR)
> library(DEXSeq)
> library(RColorBrewer)
> library(VennDiagram)
> library(BiocParallel)
> dir.create("Output", showWarnings=FALSE)

Load data

> load("RSCE.mRNA.unstranded.exon.rda") 
> RSCE.mRNA.unstranded$samples[,1:6]

                                  group lib.size norm.factors  Sample Replicate Mixture
R1.000_C5N4YANXX_GATCAGAT_R1.bam      0 25596249            1  R1.000        R1       0
R1.025_C5N4YANXX_ACTTGAAT_R1.bam     25 60211551            1  R1.025        R1      25
R1.050_C5N4YANXX_CAGATCAT_R1.bam     50 54543829            1  R1.050        R1      50
R1.075_C5N4YANXX_GCCAATAT_R1.bam     75 33087367            1  R1.075        R1      75
R1.100_C5N4YANXX_ACAGTGAT_R1.bam    100 27687770            1  R1.100        R1     100
R2.000_C5N4YANXX_AGTTCCGT_R1.bam      0 27868893            1  R2.000        R2       0
R2.025_C5N4YANXX_AGTCAACA_R1.bam     25 36130329            1  R2.025        R2      25
R2.050_C5N4YANXX_CTTGTAAT_R1.bam     50 40838511            1  R2.050        R2      50
R2.075_C5N4YANXX_GGCTACAT_R1.bam     75 54481749            1  R2.075        R2      75
R2.100_C5N4YANXX_TAGCTTAT_R1.bam    100 25490533            1  R2.100        R2     100
R2D.000_C5N4YANXX_ATTCCTTT_R1.bam     0 26144678            1 R2D.000       R2D       0
R2D.025_C5N4YANXX_ACTGATAT_R1.bam    25 25890044            1 R2D.025       R2D      25
R2D.050_C5N4YANXX_GAGTGGAT_R1.bam    50 28409164            1 R2D.050       R2D      50
R2D.075_C5N4YANXX_CGTACGTA_R1.bam    75 26375545            1 R2D.075       R2D      75
R2D.100_C5N4YANXX_GTTTCGGA_R1.bam   100 21198960            1 R2D.100       R2D     100
R3.000_C5N4YANXX_GTGGCCTT_R1.bam      0 26699053            1  R3.000        R3       0
R3.025_C5N4YANXX_GTGAAACG_R1.bam     25 43044202            1  R3.025        R3      25
R3.050_C5N4YANXX_GTCCGCAC_R1.bam     50 37555387            1  R3.050        R3      50
R3.075_C5N4YANXX_CCGTCCCG_R1.bam     75 34291008            1  R3.075        R3      75
R3.100_C5N4YANXX_ATGTCAGA_R1.bam    100 28492313            1  R3.100        R3     100

Multi-dimensional scaling plot

> lcpm <- cpm(RSCE.mRNA.unstranded, log=TRUE)
> pchby <- as.numeric(grepl("D", RSCE.mRNA.unstranded$samples$Sample))+1
> colby <- as.factor(RSCE.mRNA.unstranded$samples$group)
> labelby.group <- paste0(as.numeric(levels(colby)), ":", 100-as.numeric(levels(colby)))
> palette(rainbow(5))
> plotMDS(lcpm, pch=as.numeric(pchby)+15, col=as.numeric(colby), main="Poly-A")
> legend("topleft", text.col=unique(as.numeric(colby)), legend=labelby.group, bty="n")

Filter out lowly expressed genes

> keep.exprs <- rowSums(cpm(RSCE.mRNA.unstranded)>0.125)>=10
> table(keep.exprs)

keep.exprs
 FALSE   TRUE 
 61169 138042

> RSCE.mRNA.unstranded <- RSCE.mRNA.unstranded[keep.exprs,]

Normalisation

> RSCE.mRNA.unstranded <- calcNormFactors(RSCE.mRNA.unstranded)

Set comparisons to be carried out downstream

> comparisons <- c("100vs000", "075vs025", "050vs025", "075vs050")
> sample <- strsplit2(RSCE.mRNA.unstranded$samples$Sample, split="\\.")
> colnames(sample) <- c("Batch", "Mixture")
> sample <- as.data.frame(sample)
> sample

   Batch Mixture
1     R1     000
2     R1     025
3     R1     050
4     R1     075
5     R1     100
6     R2     000
7     R2     025
8     R2     050
9     R2     075
10    R2     100
11   R2D     000
12   R2D     025
13   R2D     050
14   R2D     075
15   R2D     100
16    R3     000
17    R3     025
18    R3     050
19    R3     075
20    R3     100

Differential splicing analysis

Voom-diffSplice using method of F-tests (voom-ds)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   v <- voom(data, design)
+   fit <- lmFit(v, design)
+   ds <- diffSplice(fit, geneid="GeneID")
+   tt <- topSplice(ds, test="F", n=Inf)
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output/diffsplice.f.RData")
+ }

== Running comparison 100vs000 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs025 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 050vs025 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs050 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111

EdgeR-diffSplice using method of F-tests (edgeR-ds)

> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   design <- model.matrix(~group)
+   y <- estimateDisp(data, design=design, robust=TRUE)
+   fit <- glmQLFit(y, design=design, robust=TRUE)
+   ds <- diffSpliceDGE(fit, geneid="GeneID")
+   tt <- topSpliceDGE(ds, test="gene", n=Inf)
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output/edger.RData")
+ }

== Running comparison 100vs000 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs025 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 050vs025 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111 
== Running comparison 075vs050 == 
Total number of exons:  138042 
Total number of genes:  16500 
Number of genes with 1 exon:  3113 
Mean number of exons in a gene:  8 
Max number of exons in a gene:  111

DEXSeq

> ncores <- MulticoreParam(workers=4)
> table <- as.list(rep(NA, length(comparisons)))
> names(table) <- comparisons
> for (c in comparisons){
+   cat("== Running comparison", c, "== \n")
+   batch <- paste0("R2", substr(c, start=10, stop=10))
+   batch <- c(batch, "R1", "R3")
+   name <- gsub(".D", "", c)
+   select <- c(strsplit2(name, split="vs"))
+   sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
+   data <- RSCE.mRNA.unstranded[,sample.select]
+   group <- data$samples$group
+   dex.samples <- data.frame(cbind(condition=group))
+   dex.samples$condition <- as.factor(dex.samples$condition)
+   dexds <- DEXSeqDataSet(countData=data$counts, sampleData=dex.samples, 
+     design=~sample+exon+condition:exon, featureID=as.character(1:nrow(data)), 
+     groupID=as.character(data$genes$GeneID))
+   dexds <- estimateSizeFactors(dexds) 
+   dexds <- estimateDispersions(dexds, BPPARAM=ncores) 
+   dexds <- testForDEU(dexds, BPPARAM=ncores) 
+   dexds <- estimateExonFoldChanges(dexds, BPPARAM=ncores) 
+   dex.exon <- DEXSeqResults(dexds)
+   dex.gene <- perGeneQValue(dex.exon, p="pvalue") 
+   dex.gene <- as.data.frame(dex.gene) 
+   dex.gene <- data.frame(rownames(dex.gene), dex.gene) 
+   colnames(dex.gene) <- c("GeneID","FDR") 
+   o <- order(dex.gene$FDR)
+   tt <- dex.gene[o,]
+   table[[c]] <- tt
+   save(comparisons, table, file="./Output/dexseq.RData")
+ }

== Running comparison 100vs000 == 
== Running comparison 075vs025 == 
== Running comparison 050vs025 == 
== Running comparison 075vs050 ==

Recovery and inconsistency rates

Load saved DE results

> files <- list.files(path="./Output", full.name=TRUE)
> methods <- gsub(".RData", "", strsplit2(files, split="/")[,3])
> methods.label <- c("DEXSeq", "voom-ds", "edgeR-ds")
> names(methods.label) <- methods
> methods.label

      dexseq diffsplice.f        edger 
    "DEXSeq"    "voom-ds"   "edgeR-ds"

> for (i in 1:length(methods)) {
+   load(files[i])
+   assign(methods[i], table)
+ }

Create sets of reference genes

> cutoff <- 0.05
> reference <- as.list(rep(NA, length(methods)))
> names(reference) <- methods
> for (m in methods) {
+   this.reference <- get(m)[["100vs000"]]
+   this.reference <- this.reference$GeneID[this.reference$FDR<cutoff]
+   reference[[m]] <- as.character(this.reference)
+ }

Calculate recovery and inconsistency rates

> recovery <- matrix(NA, nrow=length(comparisons), ncol=length(methods))
> rownames(recovery) <- comparisons
> colnames(recovery) <- methods
> inconsistency <- truepositives <- falsediscoveries <- recovery
> for (m in methods) {
+   this.reference <- reference[[m]]
+   for (c in comparisons) {
+       results <- get(m)[[c]]
+       results.detected <- results$GeneID[results$FDR<cutoff]
+       tp <- sum(results.detected %in% this.reference)
+       recovery[c,m] <- tp/length(this.reference)
+       nd <- length(results.detected)
+       fd <- sum(!results.detected %in% this.reference)
+       inconsistency[c,m] <- fd/nd
+       truepositives[c,m] <- tp
+       falsediscoveries[c,m] <- fd
+   }
+ } 
> recovery

         dexseq diffsplice.f edger
100vs000  1.000        1.000 1.000
075vs025  0.353        0.221 0.286
050vs025  0.057        0.016 0.023
075vs050  0.053        0.017 0.022

> inconsistency

         dexseq diffsplice.f edger
100vs000   0.00        0.000 0.000
075vs025   0.37        0.078 0.089
050vs025   0.39        0.000 0.053
075vs050   0.27        0.000 0.000

> falsediscoveries

         dexseq diffsplice.f edger
100vs000      0            0     0
075vs025    291           14    22
050vs025     52            0     1
075vs050     28            0     0

> truepositives

         dexseq diffsplice.f edger
100vs000   1428          750   788
075vs025    504          166   225
050vs025     81           12    18
075vs050     75           13    17

> totaldetected <- falsediscoveries + truepositives
> totaldetected

         dexseq diffsplice.f edger
100vs000   1428          750   788
075vs025    795          180   247
050vs025    133           12    19
075vs050    103           13    17

Plots of recovery and inconsistency rates

> col <- brewer.pal(9, "Paired")[c(9,5,4)]
> names(col) <- methods
> lty <- c(1,1,3)
> par(mfrow=c(1,2))
> par(mar=c(6,4.5,2,0.5))
> matplot(recovery, type="l", lty=lty, lwd=2, col=col, ylim=c(0,1), ylab="", xaxt="n", yaxt="n")
> title(ylab="Recovery rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.25,0.5,0.75,1), paste0(100*c(0,0.25,0.5,0.75,1), "%"), las=1)
> legend("topright", text.col=col, lty=lty, col=col, legend=methods.label, cex=0.7, bty="n")
> par(mar=c(6,4.9,2,0.1))
> matplot(inconsistency, type="l", lty=lty, lwd=2, col=col, ylim=c(0,0.4), ylab="", xaxt="n", yaxt="n")
> title(ylab="Inconsistency rate", line=3.5)
> axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
> axis(2, at=c(0,0.1,0.2,0.3,0.4), paste0(100*c(0,0.1,0.2,0.3,0.4), "%"), las=1)

Overlap in reference genes

> reference.all <- unique(unlist(reference))
> names <- methods
> venncounts <- matrix(0, nrow=length(reference.all), ncol=length(names))
> colnames(venncounts) <- names
> for (i in names) venncounts[,i] <- as.numeric(reference.all %in% reference[[i]])
> venn.plot <- draw.triple.venn(
+   area1 = sum(venncounts[,1]), area2 = sum(venncounts[,2]), area3 = sum(venncounts[,3]),
+   n12 = sum(venncounts[,1]==1 & venncounts[,2]==1),
+   n13 = sum(venncounts[,1]==1 & venncounts[,3]==1),
+   n23 = sum(venncounts[,2]==1 & venncounts[,3]==1),
+   n123 = sum(venncounts[,1]==1 & venncounts[,2]==1 & venncounts[,3]==1),
+   category = methods.label[names], fill = col[names],
+   lty = 1, cex = 2, cat.cex = 2, cat.col = "black"
+ )

Session time and information

Total run time for this report: 56 minutes

> sessionInfo()

R version 3.3.1 (2016-06-21)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.3 (El Capitan)

locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8

attached base packages:
 [1] grid      stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] VennDiagram_1.6.17         futile.logger_1.4.1        DEXSeq_1.18.4             
 [4] RColorBrewer_1.1-2         AnnotationDbi_1.34.3       DESeq2_1.12.3             
 [7] SummarizedExperiment_1.2.3 GenomicRanges_1.24.2       GenomeInfoDb_1.8.1        
[10] IRanges_2.6.1              S4Vectors_0.10.1           Biobase_2.32.0            
[13] BiocGenerics_0.18.0        BiocParallel_1.6.2         edgeR_3.14.0              
[16] limma_3.28.14             

loaded via a namespace (and not attached):
 [1] genefilter_1.54.2    statmod_1.4.24       locfit_1.5-9.1       splines_3.3.1       
 [5] lattice_0.20-33      colorspace_1.2-6     htmltools_0.3.5      yaml_2.1.13         
 [9] chron_2.3-47         survival_2.39-5      XML_3.98-1.4         foreign_0.8-66      
[13] DBI_0.4-1            lambda.r_1.1.7       plyr_1.8.4           stringr_1.0.0       
[17] zlibbioc_1.18.0      Biostrings_2.40.2    munsell_0.4.3        gtable_0.2.0        
[21] hwriter_1.3.2        evaluate_0.9         latticeExtra_0.6-28  knitr_1.13          
[25] geneplotter_1.50.0   biomaRt_2.28.0       Rcpp_0.12.5          acepack_1.3-3.3     
[29] xtable_1.8-2         scales_0.4.0         formatR_1.4          Hmisc_3.17-4        
[33] annotate_1.50.0      XVector_0.12.0       Rsamtools_1.24.0     gridExtra_2.2.1     
[37] ggplot2_2.1.0        digest_0.6.9         stringi_1.1.1        bitops_1.0-6        
[41] tools_3.3.1          magrittr_1.5         RCurl_1.95-4.8       RSQLite_1.0.0       
[45] futile.options_1.0.0 Formula_1.2-1        cluster_2.0.4        Matrix_1.2-6        
[49] data.table_1.9.6     rmarkdown_1.0        rpart_4.1-10         nnet_7.3-12

Differential splicing analyses on mixology data

Charity Law

30 August 2016

Preparing the data for analysis

Setting up

Load data

Multi-dimensional scaling plot

Filter out lowly expressed genes

Normalisation

Set comparisons to be carried out downstream

Differential splicing analysis

Voom-diffSplice using method of F-tests (voom-ds)

EdgeR-diffSplice using method of F-tests (edgeR-ds)

DEXSeq

Recovery and inconsistency rates

Load saved DE results

Create sets of reference genes

Calculate recovery and inconsistency rates

Plots of recovery and inconsistency rates

Overlap in reference genes

Session time and information