# Simulation study of 5 replicate arrays, with know variances:
# 4: 1 1 1 2 4
# 5: 1 1 1 5 10
# 6: 1 2 4 6 10
# Note:  These simulations take a long time to complete!

library(limma)

# Load data
load("qc.Rdata")

MA <- normalizeWithinArrays(RG)
isGene <- MA$genes$Status=="Gene"
out.no.wts <- lmFit(MA[isGene,])
# remove low intensity spots
indexA <- out.no.wts$Amean > quantile(out.no.wts$Amean,0.25)
# store estimated means and sd for selected spots
mu.g <- out.no.wts$coef[indexA]
sd.g <- out.no.wts$sigma[indexA]
# generate standardised residuals
std.res <- (MA$M[isGene,][indexA,]-mu.g)/sd.g
# Remove NAs
std.res <- std.res[!is.na(std.res)]

narrays <- 5
ngenes <- 10000

# 8 methods to compare
# 1. Ordinary t, remove two worst arrays
# 2. Ordinary t, full weights
# 3. Ordinary t, empirical aray weights, full REML
# 4. Ordinary t, empirical aray weights, gene-by-gene REML
# 5. Moderated t, remove two worst arrays
# 6. Moderated t, full weights
# 7. Moderated t, empirical aray weights, full REML
# 8. Moderated t, empirical aray weights, gene-by-gene REML
nmethods <- 8

# Number of DE genes
ndeg <- 0.05*ngenes

# Function to tally false discoveries
# Assumes the first ndeg genes are DE (0 false) and the remainder are not (1 true)
getfdr <- function(tstats, ndeg, indextruepos=rep(c(0,1), times=c(ndeg, length(tstats)-ndeg))) {   
  ranking <- order(abs(tstats), decreasing=TRUE)
  return(cumsum(indextruepos[ranking]))
  }

# Simulations with Normal data

nsim <- 1000
set.seed(20051213); u <- runif(1000)
wn <- list(NULL)
wn[[1]] <- matrix(0,nsim,narrays)
wn[[2]] <- matrix(0,nsim,narrays)
wn[[3]] <- matrix(0,nsim,narrays)

wngbg <- list(NULL)
wngbg[[1]] <- matrix(0,nsim,narrays)
wngbg[[2]] <- matrix(0,nsim,narrays)
wngbg[[3]] <- matrix(0,nsim,narrays)

matrix(0,nsim,narrays)
arrayvars = c(2, 5, 10)

fdr <- list(NULL) 

for(j in 1:length(arrayvars)) {
 fdr.tally <- matrix(0, ngenes, nmethods)
 for (isim in 1:nsim) {
   stdev <- sample(sd.g, size=10000, replace=TRUE) 
   M <- matrix(rnorm(ngenes*narrays),ngenes,narrays)
   if(j!=3) {
     M[,narrays-1] <- sqrt(arrayvars[j])*M[,narrays-1]
     M[,narrays] <- sqrt(2*arrayvars[j])*M[,narrays]
     }
   else {
     M[,narrays-3] <- sqrt(2)*M[,narrays-3]
     M[,narrays-2] <- sqrt(4)*M[,narrays-2]
     M[,narrays-1] <- sqrt(6)*M[,narrays-1]
     M[,narrays] <- sqrt(arrayvars[j])*M[,narrays]
     }
   M <- stdev*M
   # Simulate DE expressed genes, half with a fold change of 3
   M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2)
   # and half with a fold change of 2
   M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2)
   wn[[j]][isim,] <- arrayWeightsSimple(M)
   wngbg[[j]][isim,] <- arrayWeights(M)
   
   fitremove <- lmFit(M[,c(-4,-5)])
   fitremove <- eBayes(fitremove)
   fitwts <- lmFit(M, weights=wn[[j]][isim,])
   fitwts <- eBayes(fitwts)
   fitnowts <- lmFit(M, weights=NULL)
   fitnowts <- eBayes(fitnowts)
   fitwtsgbg <- lmFit(M, weights=wngbg[[j]][isim,])
   fitwtsgbg <- eBayes(fitwtsgbg)

   ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled),   
                      fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled),  
                      fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled),
                      fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled))
   mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t)
   fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg))
   fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg))
   fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat)
   cat(isim, " ")
   }
   fdr[[j]] <- fdr.tally
}

save(fdr, wn, wngbg, file="sim1000-5arrays.normal.rda")

# Simulations with empirical data

set.seed(20051213); u <- runif(1000)
we <- list(NULL)
we[[1]] <- matrix(0,nsim,narrays)
we[[2]] <- matrix(0,nsim,narrays)
we[[3]] <- matrix(0,nsim,narrays)

wegbg <- list(NULL)
wegbg[[1]] <- matrix(0,nsim,narrays)
wegbg[[2]] <- matrix(0,nsim,narrays)
wegbg[[3]] <- matrix(0,nsim,narrays)

arrayvars = c(2,5,10)
fdr.emp <- list(NULL)

for(j in 1:length(arrayvars)) {
 fdr.tally <- matrix(0, ngenes, nmethods)
 for (isim in 1:nsim) {
   stdev <- sample(sd.g, size=10000, replace=TRUE) 
   M <- sample(std.res,ngenes*narrays,replace=TRUE)
   dim(M) <- c(ngenes,narrays)   
   if(j!=3) {
      M[,narrays-1] <- sqrt(arrayvars[j])*M[,narrays-1]
      M[,narrays] <- sqrt(2*arrayvars[j])*M[,narrays]
      }
    else {
      M[,narrays-3] <- sqrt(2)*M[,narrays-3]
      M[,narrays-2] <- sqrt(4)*M[,narrays-2]
      M[,narrays-1] <- sqrt(6)*M[,narrays-1]
      M[,narrays] <- sqrt(arrayvars[j])*M[,narrays]
      }
   M <- stdev*M   
   # Simulate DE expressed genes, half with a fold change of 3
   M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2)
   # and half with a fold change of 2
   M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2)
 
   we[[j]][isim,] <- arrayWeightsSimple(M)
   wegbg[[j]][isim,] <- arrayWeights(M)

   fitremove <- lmFit(M[,c(-4,-5)])
   fitremove <- eBayes(fitremove)
   fitwts <- lmFit(M, weights=we[[j]][isim,])
   fitwts <- eBayes(fitwts)
   fitnowts <- lmFit(M, weights=NULL)
   fitnowts <- eBayes(fitnowts)
   fitwtsgbg <- lmFit(M, weights=wegbg[[j]][isim,])
   fitwtsgbg <- eBayes(fitwtsgbg)

   ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled),   
                      fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled),  
                      fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled),
                      fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled))
   mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t)
   fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg))
   fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg))
   fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat)
   cat(isim, " ")
   }
   fdr.emp[[j]] <- fdr.tally
}

save(fdr.emp, we, wegbg, file="sim1000-5arrays.nonnormal.rda")

# Results
# 5 arrays normal data
load("sim1000-5arrays.normal.rda")
fdr5norm <- fdr

round(apply(wn[[1]], 2, FUN="mean"), 2)
round(apply(wn[[1]], 2, FUN="sd"), 2)
round(apply(wn[[2]], 2, FUN="mean"), 2)
round(apply(wn[[2]], 2, FUN="sd"), 2)
round(apply(wn[[3]], 2, FUN="mean"), 2)
round(apply(wn[[3]], 2, FUN="sd"), 2)

round(apply(wn[[1]], 2, FUN="mean")/max(apply(wn[[1]], 2, FUN="mean")),2)
round(apply(wn[[2]], 2, FUN="mean")/max(apply(wn[[2]], 2, FUN="mean")),2)
round(apply(wn[[3]], 2, FUN="mean")/max(apply(wn[[3]], 2, FUN="mean")),2)

round(apply(wngbg[[1]], 2, FUN="mean"),2)
round(apply(wngbg[[1]], 2, FUN="sd"), 2)
round(apply(wngbg[[2]], 2, FUN="mean"), 2)
round(apply(wngbg[[2]], 2, FUN="sd"), 2)
round(apply(wngbg[[3]], 2, FUN="mean"), 2)
round(apply(wngbg[[3]], 2, FUN="sd"), 2)

round(apply(wngbg[[1]], 2, FUN="mean")/max(apply(wngbg[[1]], 2, FUN="mean")),2)
round(apply(wngbg[[2]], 2, FUN="mean")/max(apply(wngbg[[2]], 2, FUN="mean")),2)
round(apply(wngbg[[3]], 2, FUN="mean")/max(apply(wngbg[[3]], 2, FUN="mean")),2)


# 5 arrays non-normal data
load("sim1000-5arrays.nonnormal.rda")
fdr5nonnorm <- fdr.emp

round(apply(we[[1]], 2, FUN="mean"), 2)
round(apply(we[[1]], 2, FUN="sd"), 2)
round(apply(we[[2]], 2, FUN="mean"), 2)
round(apply(we[[2]], 2, FUN="sd"), 2)
round(apply(we[[3]], 2, FUN="mean"), 2)
round(apply(we[[3]], 2, FUN="sd"),2)

round(apply(we[[1]], 2, FUN="mean")/max(apply(we[[1]], 2, FUN="mean")),2)
round(apply(we[[2]], 2, FUN="mean")/max(apply(we[[2]], 2, FUN="mean")),2)
round(apply(we[[3]], 2, FUN="mean")/max(apply(we[[3]], 2, FUN="mean")),2)

round(apply(wegbg[[1]], 2, FUN="mean"), 2)
round(apply(wegbg[[1]], 2, FUN="sd"),2)
round(apply(wegbg[[2]], 2, FUN="mean"),2)
round(apply(wegbg[[2]], 2, FUN="sd"),2)
round(apply(wegbg[[3]], 2, FUN="mean"),2)
round(apply(wegbg[[3]], 2, FUN="sd"),2)

round(apply(wegbg[[1]], 2, FUN="mean")/max(apply(wegbg[[1]], 2, FUN="mean")),2)
round(apply(wegbg[[2]], 2, FUN="mean")/max(apply(wegbg[[2]], 2, FUN="mean")),2)
round(apply(wegbg[[3]], 2, FUN="mean")/max(apply(wegbg[[3]], 2, FUN="mean")),2)

# fdr plots
ind <- seq(1:10000)
xlim=c(0,500)
ylim=c(0,100)
colgray <- gray(c(0, 0.8, 0.5, 0, 0.8, 0.5)) 
keep <- c(2,3,5,6,7)  
par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0))
par(mfrow=c(3,2))
for(j in 1:3) {
  lty=1
  plot(ind, fdr5norm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", 
       col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim)
  if(j==1)
  legend(-15,105, legend=c("ord t remove arrays", "ord t array wts", "ord t equal wts",
                           "mod t remove arrays", "mod t array wts", "mod t equal wts"),
                           lwd=1.5, col=colgray[c(1,3,2,1,3,2)], lty= rep(c(1,2), each=3),   
                           cex=0.7, bty="n")
  for(i in seq(along=keep)) {
     if(i<=2)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr5norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1])
     }
  lty=1
  plot(ind, fdr5nonnorm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", 
       col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim)
  for(i in seq(along=keep)) {
     if(i<=2)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr5nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1])
     }
  }
mtext("Number of genes selected", side=1, outer=TRUE)
mtext("Number of false positives", side=2, outer=TRUE)
mtext("        Normal                                              Non-normal", side=3, outer=TRUE)

# Plot for Supplementary materials
ind <- seq(1:10000)
lwd = 2.5
lty = 1
xlim=c(0,500)
ylim=c(0,100)
plotnames <- NULL
plotnames[[1]] <- c("(a)", "(b)")
plotnames[[2]] <- c("(c)", "(d)")
plotnames[[3]] <- c("(e)", "(f)")
nsim = 1000
cols <- c(2,3,2,3)
keep <- c(4,7,8)

# png("sim5compare.png")
par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0))
par(mfrow=c(3,2))
for(j in 1:3) {
  lty=1
  plot(ind, fdr5norm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", 
       col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim)
  if(j==1)
  legend(-10,100, legend=c("ord t array wts, full", "ord t array wts, gene-by-gene",
                           "mod t array wts, full", 
                           "mod t array wts, gene-by-gene"),
                           col=cols, lty= rep(c(1,2), each=2), cex=0.7, bty="n")
  for(i in seq(along=keep)) {
     if(i<=1)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr5norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1])
     }
  lty=1
  plot(ind, fdr5nonnorm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", 
       col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim)
  for(i in seq(along=keep)) {
     if(i<=1)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr5nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1])
     }
  }
mtext("Number of genes selected", side=1, outer=TRUE)
mtext("Number of false positives", side=2, outer=TRUE)
mtext("        Normal                                              Non-normal", side=3, outer=TRUE)
# dev.off()