# Simulation study of 3 replicate arrays, with know variances: # 1: 1 1 2 # 2: 1 1 10 # 3: 1 5 10 # Note: These simulations take a long time to complete! library(limma) # Load data load("qc.Rdata") MA <- normalizeWithinArrays(RG) isGene <- MA$genes$Status=="Gene" out.no.wts <- lmFit(MA[isGene,]) # remove low intensity spots indexA <- out.no.wts$Amean > quantile(out.no.wts$Amean,0.25) # store estimated means and sd for selected spots mu.g <- out.no.wts$coef[indexA] sd.g <- out.no.wts$sigma[indexA] # generate standardised residuals std.res <- (MA$M[isGene,][indexA,]-mu.g)/sd.g # Remove NAs std.res <- std.res[!is.na(std.res)] narrays <- 3 ngenes <- 10000 # 8 methods to compare # 1. Ordinary t, remove worst array # 2. Ordinary t, full weights # 3. Ordinary t, empirical aray weights, full REML # 4. Ordinary t, empirical aray weights, gene-by-gene REML # 5. Moderated t, remove worst array # 6. Moderated t, full weights # 7. Moderated t, empirical aray weights, full REML # 8. Moderated t, empirical aray weights, gene-by-gene REML nmethods <- 8 # Number of DE genes ndeg <- 0.05*ngenes # Function to tally false discoveries # Assumes the first ndeg genes are DE (0 false) and the remainder are not (1 true) getfdr <- function(tstats, ndeg, indextruepos=rep(c(0,1), times=c(ndeg, length(tstats)-ndeg))) { ranking <- order(abs(tstats), decreasing=TRUE) return(cumsum(indextruepos[ranking])) } # Simulations with normal data nsim <- 1000 set.seed(20051213); u <- runif(1000) arrayvars <- c(2,10,10) wn <- list(NULL) wn[[1]] <- matrix(0,nsim,narrays) wn[[2]] <- matrix(0,nsim,narrays) wn[[3]] <- matrix(0,nsim,narrays) wngbg <- list(NULL) wngbg[[1]] <- matrix(0,nsim,narrays) wngbg[[2]] <- matrix(0,nsim,narrays) wngbg[[3]] <- matrix(0,nsim,narrays) fdr <- list(NULL) for(j in 1:length(arrayvars)) { fdr.tally <- matrix(0, ngenes, nmethods) for (isim in 1:nsim) { stdev <- sample(sd.g, size=10000, replace=TRUE) M <- matrix(rnorm(ngenes*narrays),ngenes,narrays) M[,narrays] <- sqrt(arrayvars[j])*M[,narrays] if(j==3) { M[,(narrays-1)] <- sqrt(5)*M[,(narrays-1)] } M <- stdev*M # Simulate DE expressed genes, half with a fold change of 3 M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2) # and half with a fold change of 2 M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2) wn[[j]][isim,] <- arrayWeightsSimple(M) wngbg[[j]][isim,] <- arrayWeights(M) fitremove <- lmFit(M[,-3]) fitremove <- eBayes(fitremove) fitwts <- lmFit(M, weights=wn[[j]][isim,]) fitwts <- eBayes(fitwts) fitwtsgbg <- lmFit(M, weights=wngbg[[j]][isim,]) fitwtsgbg <- eBayes(fitwtsgbg) fitnowts <- lmFit(M, weights=NULL) fitnowts <- eBayes(fitnowts) ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled), fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled), fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled), fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled)) mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t) fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg)) fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg)) fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat) cat(isim, " ") } fdr[[j]] <- fdr.tally } save(fdr, wn, wngbg, file="sim1000-3arrays.normal.rda") # Simulations with empirical data set.seed(20051213); u <- runif(1000) we <- list(NULL) we[[1]] <- matrix(0,nsim,narrays) we[[2]] <- matrix(0,nsim,narrays) we[[3]] <- matrix(0,nsim,narrays) wegbg <- list(NULL) wegbg[[1]] <- matrix(0,nsim,narrays) wegbg[[2]] <- matrix(0,nsim,narrays) wegbg[[3]] <- matrix(0,nsim,narrays) arrayvars = c(2,10,10) fdr.emp <- list(NULL) for(j in 1:length(arrayvars)) { fdr.tally <- matrix(0, ngenes, nmethods) for (isim in 1:nsim) { stdev <- sample(sd.g, size=10000, replace=TRUE) M <- sample(std.res,ngenes*narrays,replace=TRUE) dim(M) <- c(ngenes,narrays) M[,narrays] <- sqrt(arrayvars[j])*M[,narrays] if(j==3) { M[,(narrays-1)] <- sqrt(5)*M[,(narrays-1)] } M <- stdev*M # Simulate DE expressed genes, half with a fold change of 3 M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2) # and half with a fold change of 2 M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2) we[[j]][isim,] <- arrayWeightsSimple(M) wegbg[[j]][isim,] <- arrayWeights(M) fitremove <- lmFit(M[,-3]) fitremove <- eBayes(fitremove) fitwts <- lmFit(M, weights=we[[j]][isim,]) fitwts <- eBayes(fitwts) fitwtsgbg <- lmFit(M, weights=wegbg[[j]][isim,]) fitwtsgbg <- eBayes(fitwtsgbg) fitnowts <- lmFit(M, weights=NULL) fitnowts <- eBayes(fitnowts) ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled), fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled), fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled), fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled)) mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t) fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg)) fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg)) fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat) cat(isim, " ") } fdr.emp[[j]] <- fdr.tally } save(fdr.emp, we, wegbg, file="sim1000-3arrays.nonnormal.rda") # Results # 3 arrays normal data load("sim1000-3arrays.normal.rda") fdr3norm <- fdr round(apply(wn[[1]], 2, FUN="mean"),2) round(apply(wn[[1]], 2, FUN="sd"),2) round(apply(wn[[2]], 2, FUN="mean"),2) round(apply(wn[[2]], 2, FUN="sd"),2) round(apply(wn[[3]], 2, FUN="mean"),2) round(apply(wn[[3]], 2, FUN="sd"),2) round(apply(wn[[1]], 2, FUN="mean")/max(apply(wn[[1]], 2, FUN="mean")),2) round(apply(wn[[2]], 2, FUN="mean")/max(apply(wn[[2]], 2, FUN="mean")),2) round(apply(wn[[3]], 2, FUN="mean")/max(apply(wn[[3]], 2, FUN="mean")),2) round(apply(wngbg[[1]], 2, FUN="mean"),2) round(apply(wngbg[[1]], 2, FUN="sd"),2) round(apply(wngbg[[2]], 2, FUN="mean"),2) round(apply(wngbg[[2]], 2, FUN="sd"),2) round(apply(wngbg[[3]], 2, FUN="mean"),2) round(apply(wngbg[[3]], 2, FUN="sd"),2) round(apply(wngbg[[1]], 2, FUN="mean")/max(apply(wngbg[[1]], 2, FUN="mean")),2) round(apply(wngbg[[2]], 2, FUN="mean")/max(apply(wngbg[[2]], 2, FUN="mean")),2) round(apply(wngbg[[3]], 2, FUN="mean")/max(apply(wngbg[[3]], 2, FUN="mean")),2) # 3 arrays non-normal data load("sim1000-3arrays.nonnormal.rda") fdr3nonnorm <- fdr.emp round(apply(we[[1]], 2, FUN="mean"),2) round(apply(we[[1]], 2, FUN="sd"),2) round(apply(we[[2]], 2, FUN="mean"),2) round(apply(we[[2]], 2, FUN="sd"),2) round(apply(we[[3]], 2, FUN="mean"),2) round(apply(we[[3]], 2, FUN="sd"),2) round(apply(we[[1]], 2, FUN="mean")/max(apply(we[[1]], 2, FUN="mean")),2) round(apply(we[[2]], 2, FUN="mean")/max(apply(we[[2]], 2, FUN="mean")),2) round(apply(we[[3]], 2, FUN="mean")/max(apply(we[[3]], 2, FUN="mean")),2) round(apply(wegbg[[1]], 2, FUN="mean"),2) round(apply(wegbg[[1]], 2, FUN="sd"),2) round(apply(wegbg[[2]], 2, FUN="mean"),2) round(apply(wegbg[[2]], 2, FUN="sd"),2) round(apply(wegbg[[3]], 2, FUN="mean"), 2) round(apply(wegbg[[3]], 2, FUN="sd"),2) round(apply(wegbg[[1]], 2, FUN="mean")/max(apply(wegbg[[1]], 2, FUN="mean")),2) round(apply(wegbg[[2]], 2, FUN="mean")/max(apply(wegbg[[2]], 2, FUN="mean")),2) round(apply(wegbg[[3]], 2, FUN="mean")/max(apply(wegbg[[3]], 2, FUN="mean")),2) # Code to calculate averages and sd's apply(wn[[1]], 2, FUN="mean") apply(wn[[1]], 2, FUN="sd") apply(wn[[2]], 2, FUN="mean") apply(wn[[2]], 2, FUN="sd") apply(wn[[3]], 2, FUN="mean") apply(wn[[3]], 2, FUN="sd") apply(wngbg[[1]], 2, FUN="mean") apply(wngbg[[1]], 2, FUN="sd") apply(wngbg[[2]], 2, FUN="mean") apply(wngbg[[2]], 2, FUN="sd") apply(wngbg[[3]], 2, FUN="mean") apply(wngbg[[3]], 2, FUN="sd") apply(we[[1]], 2, FUN="mean") apply(we[[1]], 2, FUN="sd") apply(we[[2]], 2, FUN="mean") apply(we[[2]], 2, FUN="sd") apply(we[[3]], 2, FUN="mean") apply(we[[3]], 2, FUN="sd") apply(wegbg[[1]], 2, FUN="mean") apply(wegbg[[1]], 2, FUN="sd") apply(wegbg[[2]], 2, FUN="mean") apply(wegbg[[2]], 2, FUN="sd") apply(wegbg[[3]], 2, FUN="mean") apply(wegbg[[3]], 2, FUN="sd") # fdr plots ind <- seq(1:10000) lwd = 2.5 lty = 1 xlim=c(0,400) ylim=c(0,200) plotnames <- NULL plotnames[[1]] <- c("(a)", "(b)") plotnames[[2]] <- c("(c)", "(d)") plotnames[[3]] <- c("(e)", "(f)") nsim = 1000 colgray <- gray(c(0, 0.8, 0.5, 0, 0.8, 0.5)) keep <- c(2,3,5,6,7) par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0)) par(mfrow=c(3,2)) for(j in 1:3) { lty=1 plot(ind, fdr3norm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim, asp=1) if(j==1) legend(-15,225, legend=c("ord t remove array", "ord t array wts", "ord t equal wts", "mod t remove array", "mod t array wts", "mod t equal wts"), col=colgray[c(1,3,2,1,3,2)], lty= rep(c(1,2), each=3), lwd=1.5, cex=0.7, bty="n") for(i in seq(along=keep)) { if(i<=2) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr3norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1]) } lty=1 plot(ind, fdr3nonnorm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim, asp=1) for(i in seq(along=keep)) { if(i<=2) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr3nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1]) } } mtext("Number of genes selected", side=1, outer=TRUE) mtext("Number of false positives", side=2, outer=TRUE) mtext(" Normal Non-normal", side=3, outer=TRUE) # Plot for Supplementary materials ind <- seq(1:10000) lwd = 2.5 lty = 1 xlim=c(0,400) ylim=c(0,200) plotnames <- NULL plotnames[[1]] <- c("(a)", "(b)") plotnames[[2]] <- c("(c)", "(d)") plotnames[[3]] <- c("(e)", "(f)") nsim = 1000 cols <- c(2,3,2,3) keep <- c(4,7,8) # png("sim3compare.png") par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0)) par(mfrow=c(3,2)) for(j in 1:3) { lty=1 plot(ind, fdr3norm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim) if(j==1) legend(-5,200, legend=c("ord t array wts, full", "ord t array wts, gene-by-gene", "mod t array wts, full", "mod t array wts, gene-by-gene"), col=cols, lty= rep(c(1,2), each=2), cex=0.7, bty="n") for(i in seq(along=keep)) { if(i<=1) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr3norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1]) } lty=1 plot(ind, fdr3nonnorm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim) for(i in seq(along=keep)) { if(i<=1) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr3nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1]) } } mtext("Number of genes selected", side=1, outer=TRUE) mtext("Number of false positives", side=2, outer=TRUE) mtext(" Normal Non-normal", side=3, outer=TRUE) # dev.off()