# Simulation study of 5 replicate arrays, with know variances: # 4: 1 1 1 2 4 # 5: 1 1 1 5 10 # 6: 1 2 4 6 10 # Note: These simulations take a long time to complete! library(limma) # Load data load("qc.Rdata") MA <- normalizeWithinArrays(RG) isGene <- MA$genes$Status=="Gene" out.no.wts <- lmFit(MA[isGene,]) # remove low intensity spots indexA <- out.no.wts$Amean > quantile(out.no.wts$Amean,0.25) # store estimated means and sd for selected spots mu.g <- out.no.wts$coef[indexA] sd.g <- out.no.wts$sigma[indexA] # generate standardised residuals std.res <- (MA$M[isGene,][indexA,]-mu.g)/sd.g # Remove NAs std.res <- std.res[!is.na(std.res)] narrays <- 5 ngenes <- 10000 # 8 methods to compare # 1. Ordinary t, remove two worst arrays # 2. Ordinary t, full weights # 3. Ordinary t, empirical aray weights, full REML # 4. Ordinary t, empirical aray weights, gene-by-gene REML # 5. Moderated t, remove two worst arrays # 6. Moderated t, full weights # 7. Moderated t, empirical aray weights, full REML # 8. Moderated t, empirical aray weights, gene-by-gene REML nmethods <- 8 # Number of DE genes ndeg <- 0.05*ngenes # Function to tally false discoveries # Assumes the first ndeg genes are DE (0 false) and the remainder are not (1 true) getfdr <- function(tstats, ndeg, indextruepos=rep(c(0,1), times=c(ndeg, length(tstats)-ndeg))) { ranking <- order(abs(tstats), decreasing=TRUE) return(cumsum(indextruepos[ranking])) } # Simulations with Normal data nsim <- 1000 set.seed(20051213); u <- runif(1000) wn <- list(NULL) wn[[1]] <- matrix(0,nsim,narrays) wn[[2]] <- matrix(0,nsim,narrays) wn[[3]] <- matrix(0,nsim,narrays) wngbg <- list(NULL) wngbg[[1]] <- matrix(0,nsim,narrays) wngbg[[2]] <- matrix(0,nsim,narrays) wngbg[[3]] <- matrix(0,nsim,narrays) matrix(0,nsim,narrays) arrayvars = c(2, 5, 10) fdr <- list(NULL) for(j in 1:length(arrayvars)) { fdr.tally <- matrix(0, ngenes, nmethods) for (isim in 1:nsim) { stdev <- sample(sd.g, size=10000, replace=TRUE) M <- matrix(rnorm(ngenes*narrays),ngenes,narrays) if(j!=3) { M[,narrays-1] <- sqrt(arrayvars[j])*M[,narrays-1] M[,narrays] <- sqrt(2*arrayvars[j])*M[,narrays] } else { M[,narrays-3] <- sqrt(2)*M[,narrays-3] M[,narrays-2] <- sqrt(4)*M[,narrays-2] M[,narrays-1] <- sqrt(6)*M[,narrays-1] M[,narrays] <- sqrt(arrayvars[j])*M[,narrays] } M <- stdev*M # Simulate DE expressed genes, half with a fold change of 3 M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2) # and half with a fold change of 2 M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2) wn[[j]][isim,] <- arrayWeightsSimple(M) wngbg[[j]][isim,] <- arrayWeights(M) fitremove <- lmFit(M[,c(-4,-5)]) fitremove <- eBayes(fitremove) fitwts <- lmFit(M, weights=wn[[j]][isim,]) fitwts <- eBayes(fitwts) fitnowts <- lmFit(M, weights=NULL) fitnowts <- eBayes(fitnowts) fitwtsgbg <- lmFit(M, weights=wngbg[[j]][isim,]) fitwtsgbg <- eBayes(fitwtsgbg) ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled), fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled), fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled), fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled)) mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t) fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg)) fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg)) fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat) cat(isim, " ") } fdr[[j]] <- fdr.tally } save(fdr, wn, wngbg, file="sim1000-5arrays.normal.rda") # Simulations with empirical data set.seed(20051213); u <- runif(1000) we <- list(NULL) we[[1]] <- matrix(0,nsim,narrays) we[[2]] <- matrix(0,nsim,narrays) we[[3]] <- matrix(0,nsim,narrays) wegbg <- list(NULL) wegbg[[1]] <- matrix(0,nsim,narrays) wegbg[[2]] <- matrix(0,nsim,narrays) wegbg[[3]] <- matrix(0,nsim,narrays) arrayvars = c(2,5,10) fdr.emp <- list(NULL) for(j in 1:length(arrayvars)) { fdr.tally <- matrix(0, ngenes, nmethods) for (isim in 1:nsim) { stdev <- sample(sd.g, size=10000, replace=TRUE) M <- sample(std.res,ngenes*narrays,replace=TRUE) dim(M) <- c(ngenes,narrays) if(j!=3) { M[,narrays-1] <- sqrt(arrayvars[j])*M[,narrays-1] M[,narrays] <- sqrt(2*arrayvars[j])*M[,narrays] } else { M[,narrays-3] <- sqrt(2)*M[,narrays-3] M[,narrays-2] <- sqrt(4)*M[,narrays-2] M[,narrays-1] <- sqrt(6)*M[,narrays-1] M[,narrays] <- sqrt(arrayvars[j])*M[,narrays] } M <- stdev*M # Simulate DE expressed genes, half with a fold change of 3 M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2) # and half with a fold change of 2 M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2) we[[j]][isim,] <- arrayWeightsSimple(M) wegbg[[j]][isim,] <- arrayWeights(M) fitremove <- lmFit(M[,c(-4,-5)]) fitremove <- eBayes(fitremove) fitwts <- lmFit(M, weights=we[[j]][isim,]) fitwts <- eBayes(fitwts) fitnowts <- lmFit(M, weights=NULL) fitnowts <- eBayes(fitnowts) fitwtsgbg <- lmFit(M, weights=wegbg[[j]][isim,]) fitwtsgbg <- eBayes(fitwtsgbg) ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled), fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled), fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled), fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled)) mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t) fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg)) fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg)) fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat) cat(isim, " ") } fdr.emp[[j]] <- fdr.tally } save(fdr.emp, we, wegbg, file="sim1000-5arrays.nonnormal.rda") # Results # 5 arrays normal data load("sim1000-5arrays.normal.rda") fdr5norm <- fdr round(apply(wn[[1]], 2, FUN="mean"), 2) round(apply(wn[[1]], 2, FUN="sd"), 2) round(apply(wn[[2]], 2, FUN="mean"), 2) round(apply(wn[[2]], 2, FUN="sd"), 2) round(apply(wn[[3]], 2, FUN="mean"), 2) round(apply(wn[[3]], 2, FUN="sd"), 2) round(apply(wn[[1]], 2, FUN="mean")/max(apply(wn[[1]], 2, FUN="mean")),2) round(apply(wn[[2]], 2, FUN="mean")/max(apply(wn[[2]], 2, FUN="mean")),2) round(apply(wn[[3]], 2, FUN="mean")/max(apply(wn[[3]], 2, FUN="mean")),2) round(apply(wngbg[[1]], 2, FUN="mean"),2) round(apply(wngbg[[1]], 2, FUN="sd"), 2) round(apply(wngbg[[2]], 2, FUN="mean"), 2) round(apply(wngbg[[2]], 2, FUN="sd"), 2) round(apply(wngbg[[3]], 2, FUN="mean"), 2) round(apply(wngbg[[3]], 2, FUN="sd"), 2) round(apply(wngbg[[1]], 2, FUN="mean")/max(apply(wngbg[[1]], 2, FUN="mean")),2) round(apply(wngbg[[2]], 2, FUN="mean")/max(apply(wngbg[[2]], 2, FUN="mean")),2) round(apply(wngbg[[3]], 2, FUN="mean")/max(apply(wngbg[[3]], 2, FUN="mean")),2) # 5 arrays non-normal data load("sim1000-5arrays.nonnormal.rda") fdr5nonnorm <- fdr.emp round(apply(we[[1]], 2, FUN="mean"), 2) round(apply(we[[1]], 2, FUN="sd"), 2) round(apply(we[[2]], 2, FUN="mean"), 2) round(apply(we[[2]], 2, FUN="sd"), 2) round(apply(we[[3]], 2, FUN="mean"), 2) round(apply(we[[3]], 2, FUN="sd"),2) round(apply(we[[1]], 2, FUN="mean")/max(apply(we[[1]], 2, FUN="mean")),2) round(apply(we[[2]], 2, FUN="mean")/max(apply(we[[2]], 2, FUN="mean")),2) round(apply(we[[3]], 2, FUN="mean")/max(apply(we[[3]], 2, FUN="mean")),2) round(apply(wegbg[[1]], 2, FUN="mean"), 2) round(apply(wegbg[[1]], 2, FUN="sd"),2) round(apply(wegbg[[2]], 2, FUN="mean"),2) round(apply(wegbg[[2]], 2, FUN="sd"),2) round(apply(wegbg[[3]], 2, FUN="mean"),2) round(apply(wegbg[[3]], 2, FUN="sd"),2) round(apply(wegbg[[1]], 2, FUN="mean")/max(apply(wegbg[[1]], 2, FUN="mean")),2) round(apply(wegbg[[2]], 2, FUN="mean")/max(apply(wegbg[[2]], 2, FUN="mean")),2) round(apply(wegbg[[3]], 2, FUN="mean")/max(apply(wegbg[[3]], 2, FUN="mean")),2) # fdr plots ind <- seq(1:10000) xlim=c(0,500) ylim=c(0,100) colgray <- gray(c(0, 0.8, 0.5, 0, 0.8, 0.5)) keep <- c(2,3,5,6,7) par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0)) par(mfrow=c(3,2)) for(j in 1:3) { lty=1 plot(ind, fdr5norm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim) if(j==1) legend(-15,105, legend=c("ord t remove arrays", "ord t array wts", "ord t equal wts", "mod t remove arrays", "mod t array wts", "mod t equal wts"), lwd=1.5, col=colgray[c(1,3,2,1,3,2)], lty= rep(c(1,2), each=3), cex=0.7, bty="n") for(i in seq(along=keep)) { if(i<=2) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr5norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1]) } lty=1 plot(ind, fdr5nonnorm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim) for(i in seq(along=keep)) { if(i<=2) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr5nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1]) } } mtext("Number of genes selected", side=1, outer=TRUE) mtext("Number of false positives", side=2, outer=TRUE) mtext(" Normal Non-normal", side=3, outer=TRUE) # Plot for Supplementary materials ind <- seq(1:10000) lwd = 2.5 lty = 1 xlim=c(0,500) ylim=c(0,100) plotnames <- NULL plotnames[[1]] <- c("(a)", "(b)") plotnames[[2]] <- c("(c)", "(d)") plotnames[[3]] <- c("(e)", "(f)") nsim = 1000 cols <- c(2,3,2,3) keep <- c(4,7,8) # png("sim5compare.png") par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0)) par(mfrow=c(3,2)) for(j in 1:3) { lty=1 plot(ind, fdr5norm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim) if(j==1) legend(-10,100, legend=c("ord t array wts, full", "ord t array wts, gene-by-gene", "mod t array wts, full", "mod t array wts, gene-by-gene"), col=cols, lty= rep(c(1,2), each=2), cex=0.7, bty="n") for(i in seq(along=keep)) { if(i<=1) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr5norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1]) } lty=1 plot(ind, fdr5nonnorm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim) for(i in seq(along=keep)) { if(i<=1) lty=1 else lty=2 ind2 <- keep[i] points(ind, fdr5nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1]) } } mtext("Number of genes selected", side=1, outer=TRUE) mtext("Number of false positives", side=2, outer=TRUE) mtext(" Normal Non-normal", side=3, outer=TRUE) # dev.off()