--- title: "RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR" author: - name: Charity Law affiliation: The Walter and Eliza Hall Institute of Medical Research, 1G Royal Parade, Parkville, VIC 3052, Melbourne, Australia; Department of Medical Biology, The University of Melbourne, Parkville, VIC 3010, Melbourne, Australia - name: Monther Alhamdoosh affiliation: CSL Limited, Bio21 Institute, 30 Flemington Road, Parkville, Victoria 3010, Australia - name: Shian Su affiliation: The Walter and Eliza Hall Institute of Medical Research, 1G Royal Parade, Parkville, VIC 3052, Melbourne, Australia - name: Gordon K. Smyth affiliation: The Walter and Eliza Hall Institute of Medical Research, 1G Royal Parade, Parkville, VIC 3052, Melbourne, Australia; School of Mathematics and Statistics, The University of Melbourne, Parkville, VIC 3010, Melbourne, Australia - name: Matthew E. Ritchie affiliation: The Walter and Eliza Hall Institute of Medical Research, 1G Royal Parade, Parkville, VIC 3052, Melbourne, Australia; Department of Medical Biology, The University of Melbourne, Parkville, VIC 3010, Melbourne, Australia; School of Mathematics and Statistics, The University of Melbourne, Parkville, VIC 3010, Melbourne, Australia\ date: 31 May 2016 output: BiocStyle::html_document --- # Set-up ```{r setup, message=FALSE} library(BiocStyle) library(knitr) options(digits=3) options(width=90) url <- "http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE63310&format=file" utils::download.file(url, destfile="GSE63310_RAW.tar", mode="wb") utils::untar("GSE63310_RAW.tar", exdir = ".") files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt", "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt", "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt") for(i in paste(files, ".gz", sep="")) R.utils::gunzip(i, overwrite=TRUE) ``` Note that this workflow uses the developmental version of the Glimma package, so be ensure to install as follows before running. ```{r Glimmainstall} source("http://www.bioconductor.org/biocLite.R") biocLite("Glimma", suppressUpdates=TRUE) ``` # Data packaging ```{r import1} files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt", "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt", "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt") read.delim(files[1], nrow=5) ``` ```{r import2} library(limma) library(edgeR) x <- readDGE(files, columns=c(1,3)) class(x) dim(x) ``` ```{r annotatesamples} samplenames <- substring(colnames(x), 12, nchar(colnames(x))) samplenames colnames(x) <- samplenames group <- as.factor(c("LP", "ML", "Basal", "Basal", "ML", "LP", "Basal", "ML", "LP")) x$samples$group <- group lane <- as.factor(rep(c("L004","L006","L008"), c(3,4,2))) x$samples$lane <- lane x$samples ``` ```{r annotategenes, message=FALSE} library(Mus.musculus) geneid <- rownames(x) genes <- select(Mus.musculus, keys=geneid, columns=c("SYMBOL", "TXCHROM"), keytype="ENTREZID") head(genes) ``` ```{r removedups} dup <- genes$ENTREZID[duplicated(genes$ENTREZID)] genes[genes$ENTREZID %in% dup,][1:10,] mat <- match(geneid, genes$ENTREZID) genes <- genes[mat,] genes[genes$ENTREZID %in% dup,][1:5,] ``` ```{r assigngeneanno} x$genes <- genes ``` ```{r displaycountobject} x ``` # Data pre-processing ```{r cpm} cpm <- cpm(x) lcpm <- cpm(x, log=TRUE) ``` ```{r zeroes} table(rowSums(x$counts==0)==9) ``` ```{r filter} keep.exprs <- rowSums(cpm>1)>=3 x <- x[keep.exprs,, keep.lib.sizes=FALSE] dim(x) ``` ```{r filterplot1, fig.height=4, fig.width=8} library(RColorBrewer) nsamples <- ncol(x) col <- brewer.pal(nsamples, "Paired") par(mfrow=c(1,2)) plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, main="", xlab="") title(main="A. Raw data", xlab="Log-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, col=col[i], lwd=2) } legend("topright", samplenames, text.col=col, bty="n") lcpm <- cpm(x, log=TRUE) plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, main="", xlab="") title(main="B. Filtered data", xlab="Log-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, col=col[i], lwd=2) } legend("topright", samplenames, text.col=col, bty="n") ``` ```{r normalize} x <- calcNormFactors(x, method = "TMM") x$samples$norm.factors ``` ```{r normalizemodifieddata} x2 <- x x2$samples$norm.factors <- 1 x2$counts[,1] <- ceiling(x2$counts[,1]*0.05) x2$counts[,2] <- x2$counts[,2]*5 ``` ```{r plotmodifieddata, fig.height=4, fig.width=8} par(mfrow=c(1,2)) lcpm <- cpm(x2, log=TRUE) boxplot(lcpm, las=2, col=col, main="") title(main="A. Example: Unnormalised data",ylab="Log-cpm") x2 <- calcNormFactors(x2) x2$samples$norm.factors lcpm <- cpm(x2, log=TRUE) boxplot(lcpm, las=2, col=col, main="") title(main="B. Example: Normalised data",ylab="Log-cpm") ``` ```{r MDS1, fig.height=4, fig.width=8} lcpm <- cpm(x, log=TRUE) par(mfrow=c(1,2)) col.group <- group levels(col.group) <- brewer.pal(nlevels(col.group), "Set1") col.group <- as.character(col.group) col.lane <- lane levels(col.lane) <- brewer.pal(nlevels(col.lane), "Set2") col.lane <- as.character(col.lane) plotMDS(lcpm, labels=group, col=col.group) title(main="A. Sample groups") plotMDS(lcpm, labels=lane, col=col.lane, dim=c(3,4)) title(main="B. Sequencing lanes") ``` ```{r GlimmaMDSplot} library(Glimma) glMDSPlot(lcpm, labels=paste(group, lane, sep="_"), groups=x$samples[,c(2,5)], launch=FALSE) ``` [Link to Interactive MDS plot](glimma-plots/MDS-Plot.html) # Differential expression analysis ```{r design} design <- model.matrix(~0+group+lane) colnames(design) <- gsub("group", "", colnames(design)) design ``` ```{r contrasts} contr.matrix <- makeContrasts( BasalvsLP = Basal-LP, BasalvsML = Basal - ML, LPvsML = LP - ML, levels = colnames(design)) contr.matrix ``` ```{r voom, fig.height=4, fig.width=8} par(mfrow=c(1,2)) v <- voom(x, design, plot=TRUE) v vfit <- lmFit(v, design) vfit <- contrasts.fit(vfit, contrasts=contr.matrix) efit <- eBayes(vfit) plotSA(efit) ``` ```{r decidetests} summary(decideTests(efit)) ``` ```{r treat} tfit <- treat(vfit, lfc=1) dt <- decideTests(tfit) summary(dt) ``` ```{r venn, fig.height=6, fig.width=6} de.common <- which(dt[,1]!=0 & dt[,2]!=0) length(de.common) head(tfit$genes$SYMBOL[de.common], n=20) vennDiagram(dt[,1:2], circle.col=c("turquoise", "salmon")) write.fit(tfit, dt, file="results.txt") ``` ```{r toptables} basal.vs.lp <- topTreat(tfit, coef=1, n=Inf) basal.vs.ml <- topTreat(tfit, coef=2, n=Inf) head(basal.vs.lp) head(basal.vs.ml) ``` ```{r MDplot, fig.keep='none'} plotMD(tfit, column=1, status=dt[,1], main=colnames(tfit)[1], xlim=c(-8,13)) ``` ```{r GlimmaMDplot} glMDPlot(tfit, coef=1, status=dt[,1], main=colnames(tfit)[1], counts=x$counts, samples=colnames(x), anno=x$genes, groups=group, id.column="ENTREZID", display.columns=c("SYMBOL", "ENTREZID"), search.by="SYMBOL", launch=FALSE) ``` [Link to Interactive MD plot](glimma-plots/MD-Plot.html) ```{r heatmap, fig.height=10, fig.width=7} library(gplots) basal.vs.lp.topgenes <- basal.vs.lp$ENTREZID[1:100] i <- which(v$genes$ENTREZID %in% basal.vs.lp.topgenes) mycol <- colorpanel(1000,"blue","white","red") heatmap.2(v$E[i,], scale="row", labRow=v$genes$SYMBOL[i], labCol=group, col=mycol, trace="none", density.info="none", margin=c(8,6), lhei=c(2,10), dendrogram="column") ``` # Gene set testing with camera ```{r camera} load(url("http://bioinf.wehi.edu.au/software/MSigDB/mouse_c2_v5p1.rdata")) idx <- ids2indices(Mm.c2,id=rownames(v)) cam.BasalvsLP <- camera(v,idx,design,contrast=contr.matrix[,1], inter.gene.cor=0.01) head(cam.BasalvsLP,5) cam.BasalvsML <- camera(v,idx,design,contrast=contr.matrix[,2], inter.gene.cor=0.01) head(cam.BasalvsML,5) cam.LPvsML <- camera(v,idx,design,contrast=contr.matrix[,3], inter.gene.cor=0.01) head(cam.LPvsML,5) ``` ```{r barcodeplot, fig.height=6, fig.width=6} barcodeplot(efit$t[,3], index=idx$LIM_MAMMARY_LUMINAL_MATURE_UP, index2=idx$LIM_MAMMARY_LUMINAL_MATURE_DN, main="LPvsML") ``` # Software and code used ```{r softwareinfo} sessionInfo() ```