---
title: "Differential splicing analyses on mixology data"
author: "Charity Law"
date: "30 August 2016"
output: html_document
---

```{r global_options, include=FALSE}
knitr::opts_chunk$set(prompt=TRUE, comment="", message=FALSE, warning=FALSE, fig.width=5, fig.align="center")
options(digits=2)
options(width=105)
starttime <- proc.time()
```

	
	
	
# Preparing the data for analysis

### Setting up
```{r setup}
library(limma)
library(edgeR)
library(DEXSeq)
library(RColorBrewer)
library(VennDiagram)
library(BiocParallel)
dir.create("Output", showWarnings=FALSE)
```

### Load data
```{r changedisplay, include=FALSE}
options(digits=10)
```

```{r data}
load("RSCE.mRNA.unstranded.exon.rda") 
RSCE.mRNA.unstranded$samples[,1:6]
```

```{r changedisplayback, include=FALSE}
options(digits=2)
```

### Multi-dimensional scaling plot
```{r mdsplot}
lcpm <- cpm(RSCE.mRNA.unstranded, log=TRUE)
pchby <- as.numeric(grepl("D", RSCE.mRNA.unstranded$samples$Sample))+1
colby <- as.factor(RSCE.mRNA.unstranded$samples$group)
labelby.group <- paste0(as.numeric(levels(colby)), ":", 100-as.numeric(levels(colby)))
palette(rainbow(5))
plotMDS(lcpm, pch=as.numeric(pchby)+15, col=as.numeric(colby), main="Poly-A")
legend("topleft", text.col=unique(as.numeric(colby)), legend=labelby.group, bty="n")  
```

### Filter out lowly expressed genes
```{r filtering}
keep.exprs <- rowSums(cpm(RSCE.mRNA.unstranded)>0.125)>=10
table(keep.exprs)
RSCE.mRNA.unstranded <- RSCE.mRNA.unstranded[keep.exprs,]
```    
  
### Normalisation
```{r normalisation}  
RSCE.mRNA.unstranded <- calcNormFactors(RSCE.mRNA.unstranded)
``` 

### Set comparisons to be carried out downstream
```{r comparisons}  
comparisons <- c("100vs000", "075vs025", "050vs025", "075vs050")
sample <- strsplit2(RSCE.mRNA.unstranded$samples$Sample, split="\\.")
colnames(sample) <- c("Batch", "Mixture")
sample <- as.data.frame(sample)
sample
```
  
  
  

# Differential splicing analysis

### Voom-diffSplice using method of F-tests (voom-ds)
```{r voom}
table <- as.list(rep(NA, length(comparisons)))
names(table) <- comparisons
for (c in comparisons){
  cat("== Running comparison", c, "== \n")
  batch <- paste0("R2", substr(c, start=10, stop=10))
  batch <- c(batch, "R1", "R3")
  name <- gsub(".D", "", c)
  select <- c(strsplit2(name, split="vs"))
  sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
  data <- RSCE.mRNA.unstranded[,sample.select]
  group <- data$samples$group
  design <- model.matrix(~group)
  v <- voom(data, design)
  fit <- lmFit(v, design)
  ds <- diffSplice(fit, geneid="GeneID")
  tt <- topSplice(ds, test="F", n=Inf)
  table[[c]] <- tt
  save(comparisons, table, file="./Output/diffsplice.f.RData")
}
```

### EdgeR-diffSplice using method of F-tests (edgeR-ds)
```{r edger}
table <- as.list(rep(NA, length(comparisons)))
names(table) <- comparisons
for (c in comparisons){
  cat("== Running comparison", c, "== \n")
  batch <- paste0("R2", substr(c, start=10, stop=10))
  batch <- c(batch, "R1", "R3")
  name <- gsub(".D", "", c)
  select <- c(strsplit2(name, split="vs"))
  sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
  data <- RSCE.mRNA.unstranded[,sample.select]
  group <- data$samples$group
  design <- model.matrix(~group)
  y <- estimateDisp(data, design=design, robust=TRUE)
  fit <- glmQLFit(y, design=design, robust=TRUE)
  ds <- diffSpliceDGE(fit, geneid="GeneID")
  tt <- topSpliceDGE(ds, test="gene", n=Inf)
  table[[c]] <- tt
  save(comparisons, table, file="./Output/edger.RData")
}
```

### DEXSeq
```{r dexseq}
ncores <- MulticoreParam(workers=4)
table <- as.list(rep(NA, length(comparisons)))
names(table) <- comparisons
for (c in comparisons){
  cat("== Running comparison", c, "== \n")
  batch <- paste0("R2", substr(c, start=10, stop=10))
  batch <- c(batch, "R1", "R3")
  name <- gsub(".D", "", c)
  select <- c(strsplit2(name, split="vs"))
  sample.select <- which(sample$Mixture %in% select & sample$Batch %in% batch)
  data <- RSCE.mRNA.unstranded[,sample.select]
  group <- data$samples$group
  dex.samples <- data.frame(cbind(condition=group))
  dex.samples$condition <- as.factor(dex.samples$condition)
  dexds <- DEXSeqDataSet(countData=data$counts, sampleData=dex.samples, 
    design=~sample+exon+condition:exon, featureID=as.character(1:nrow(data)), 
    groupID=as.character(data$genes$GeneID))
  dexds <- estimateSizeFactors(dexds) 
  dexds <- estimateDispersions(dexds, BPPARAM=ncores) 
  dexds <- testForDEU(dexds, BPPARAM=ncores) 
  dexds <- estimateExonFoldChanges(dexds, BPPARAM=ncores) 
  dex.exon <- DEXSeqResults(dexds)
  dex.gene <- perGeneQValue(dex.exon, p="pvalue") 
  dex.gene <- as.data.frame(dex.gene) 
  dex.gene <- data.frame(rownames(dex.gene), dex.gene) 
  colnames(dex.gene) <- c("GeneID","FDR") 
  o <- order(dex.gene$FDR)
  tt <- dex.gene[o,]
  table[[c]] <- tt
  save(comparisons, table, file="./Output/dexseq.RData")
}
```




# Recovery and inconsistency rates

### Load saved DE results
```{r loadresults}
files <- list.files(path="./Output", full.name=TRUE)
methods <- gsub(".RData", "", strsplit2(files, split="/")[,3])
methods.label <- c("DEXSeq", "voom-ds", "edgeR-ds")
names(methods.label) <- methods
methods.label
for (i in 1:length(methods)) {
	load(files[i])
	assign(methods[i], table)
}
```

### Create sets of reference genes
```{r refgenes}
cutoff <- 0.05
reference <- as.list(rep(NA, length(methods)))
names(reference) <- methods
for (m in methods) {
	this.reference <- get(m)[["100vs000"]]
	this.reference <- this.reference$GeneID[this.reference$FDR<cutoff]
	reference[[m]] <- as.character(this.reference)
}
```

### Calculate recovery and inconsistency rates
```{r recoveryrate}
recovery <- matrix(NA, nrow=length(comparisons), ncol=length(methods))
rownames(recovery) <- comparisons
colnames(recovery) <- methods
inconsistency <- truepositives <- falsediscoveries <- recovery
for (m in methods) {
	this.reference <- reference[[m]]
	for (c in comparisons) {
		results <- get(m)[[c]]
		results.detected <- results$GeneID[results$FDR<cutoff]
		tp <- sum(results.detected %in% this.reference)
		recovery[c,m] <- tp/length(this.reference)
		nd <- length(results.detected)
		fd <- sum(!results.detected %in% this.reference)
		inconsistency[c,m] <- fd/nd
		truepositives[c,m] <- tp
		falsediscoveries[c,m] <- fd
	}
}	
recovery
inconsistency
falsediscoveries
truepositives
totaldetected <- falsediscoveries + truepositives
totaldetected
```	

### Plots of recovery and inconsistency rates
```{r, fig.width=10}
col <- brewer.pal(9, "Paired")[c(9,5,4)]
names(col) <- methods
lty <- c(1,1,3)
par(mfrow=c(1,2))
par(mar=c(6,4.5,2,0.5))
matplot(recovery, type="l", lty=lty, lwd=2, col=col, ylim=c(0,1), ylab="", xaxt="n", yaxt="n")
title(ylab="Recovery rate", line=3.5)
axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
axis(2, at=c(0,0.25,0.5,0.75,1), paste0(100*c(0,0.25,0.5,0.75,1), "%"), las=1)
legend("topright", text.col=col, lty=lty, col=col, legend=methods.label, cex=0.7, bty="n")
par(mar=c(6,4.9,2,0.1))
matplot(inconsistency, type="l", lty=lty, lwd=2, col=col, ylim=c(0,0.4), ylab="", xaxt="n", yaxt="n")
title(ylab="Inconsistency rate", line=3.5)
axis(1, at=1:4, comparisons, tick=FALSE, las=2, font=1)
axis(2, at=c(0,0.1,0.2,0.3,0.4), paste0(100*c(0,0.1,0.2,0.3,0.4), "%"), las=1)
```




# Overlap in reference genes
```{r overlap, fig.width=7, fig.height=7}
reference.all <- unique(unlist(reference))
names <- methods
venncounts <- matrix(0, nrow=length(reference.all), ncol=length(names))
colnames(venncounts) <- names
for (i in names) venncounts[,i] <- as.numeric(reference.all %in% reference[[i]])
venn.plot <- draw.triple.venn(
  area1 = sum(venncounts[,1]), area2 = sum(venncounts[,2]), area3 = sum(venncounts[,3]),
  n12 = sum(venncounts[,1]==1 & venncounts[,2]==1),
  n13 = sum(venncounts[,1]==1 & venncounts[,3]==1),
  n23 = sum(venncounts[,2]==1 & venncounts[,3]==1),
  n123 = sum(venncounts[,1]==1 & venncounts[,2]==1 & venncounts[,3]==1),
  category = methods.label[names], fill = col[names],
  lty = 1, cex = 2, cat.cex = 2, cat.col = "black"
)
```




# Session time and information  
```{r runtime, echo=FALSE}
totaltime <- proc.time()-starttime
cat("Total run time for this report:", round(totaltime[3]/60, 2), "minutes \n")
```

```{r sessioninfo}
sessionInfo()
```