# Simulation study of 3 replicate arrays, with know variances:
# 1: 1 1 2
# 2: 1 1 10
# 3: 1 5 10
# Note:  These simulations take a long time to complete!

library(limma)

# Load data
load("qc.Rdata")

MA <- normalizeWithinArrays(RG)
isGene <- MA$genes$Status=="Gene"
out.no.wts <- lmFit(MA[isGene,])
# remove low intensity spots
indexA <- out.no.wts$Amean > quantile(out.no.wts$Amean,0.25)
# store estimated means and sd for selected spots
mu.g <- out.no.wts$coef[indexA]
sd.g <- out.no.wts$sigma[indexA]
# generate standardised residuals
std.res <- (MA$M[isGene,][indexA,]-mu.g)/sd.g
# Remove NAs
std.res <- std.res[!is.na(std.res)]

narrays <- 3
ngenes <- 10000

# 8 methods to compare
# 1. Ordinary t, remove worst array
# 2. Ordinary t, full weights
# 3. Ordinary t, empirical aray weights, full REML
# 4. Ordinary t, empirical aray weights, gene-by-gene REML
# 5. Moderated t, remove worst array
# 6. Moderated t, full weights
# 7. Moderated t, empirical aray weights, full REML
# 8. Moderated t, empirical aray weights, gene-by-gene REML
nmethods <- 8

# Number of DE genes
ndeg <- 0.05*ngenes

# Function to tally false discoveries
# Assumes the first ndeg genes are DE (0 false) and the remainder are not (1 true)
getfdr <- function(tstats, ndeg, indextruepos=rep(c(0,1), times=c(ndeg, length(tstats)-ndeg))) {   
  ranking <- order(abs(tstats), decreasing=TRUE)
  return(cumsum(indextruepos[ranking]))
  }

# Simulations with normal data

nsim <- 1000
set.seed(20051213); u <- runif(1000)

arrayvars <- c(2,10,10)
wn <- list(NULL)
wn[[1]] <- matrix(0,nsim,narrays)
wn[[2]] <- matrix(0,nsim,narrays)
wn[[3]] <- matrix(0,nsim,narrays)

wngbg <- list(NULL)
wngbg[[1]] <- matrix(0,nsim,narrays)
wngbg[[2]] <- matrix(0,nsim,narrays)
wngbg[[3]] <- matrix(0,nsim,narrays)

fdr <- list(NULL)

for(j in 1:length(arrayvars)) {
 fdr.tally <- matrix(0, ngenes, nmethods)
 for (isim in 1:nsim) {
   stdev <- sample(sd.g, size=10000, replace=TRUE) 
   M <- matrix(rnorm(ngenes*narrays),ngenes,narrays)
   M[,narrays] <- sqrt(arrayvars[j])*M[,narrays]
   if(j==3) {
      M[,(narrays-1)] <- sqrt(5)*M[,(narrays-1)]
      }
   M <- stdev*M

   # Simulate DE expressed genes, half with a fold change of 3
   M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2)
   # and half with a fold change of 2
   M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2)
   wn[[j]][isim,] <- arrayWeightsSimple(M)
   wngbg[[j]][isim,] <- arrayWeights(M)
   
   fitremove <- lmFit(M[,-3])
   fitremove <- eBayes(fitremove)
   fitwts <- lmFit(M, weights=wn[[j]][isim,])
   fitwts <- eBayes(fitwts)
   fitwtsgbg <- lmFit(M, weights=wngbg[[j]][isim,])
   fitwtsgbg <- eBayes(fitwtsgbg)
   fitnowts <- lmFit(M, weights=NULL)
   fitnowts <- eBayes(fitnowts)
  
   ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled),   
                      fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled),  
                      fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled),
                      fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled))
   mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t)
   fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg))
   fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg))
   fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat)
   cat(isim, " ")
  }
   fdr[[j]] <- fdr.tally
}

save(fdr, wn, wngbg, file="sim1000-3arrays.normal.rda")

# Simulations with empirical data
set.seed(20051213); u <- runif(1000)
we <- list(NULL)
we[[1]] <- matrix(0,nsim,narrays)
we[[2]] <- matrix(0,nsim,narrays)
we[[3]] <- matrix(0,nsim,narrays)

wegbg <- list(NULL)
wegbg[[1]] <- matrix(0,nsim,narrays)
wegbg[[2]] <- matrix(0,nsim,narrays)
wegbg[[3]] <- matrix(0,nsim,narrays)

arrayvars = c(2,10,10)
fdr.emp <- list(NULL)

for(j in 1:length(arrayvars)) {
 fdr.tally <- matrix(0, ngenes, nmethods)
 for (isim in 1:nsim) {
   stdev <- sample(sd.g, size=10000, replace=TRUE) 
   M <- sample(std.res,ngenes*narrays,replace=TRUE)
   dim(M) <- c(ngenes,narrays)   
   M[,narrays] <- sqrt(arrayvars[j])*M[,narrays]   
   if(j==3) {
       M[,(narrays-1)] <- sqrt(5)*M[,(narrays-1)]
       }
   M <- stdev*M   
   # Simulate DE expressed genes, half with a fold change of 3
   M[1:(ndeg/2),] = M[1:(ndeg/2),]+log(3,2)
   # and half with a fold change of 2
   M[seq(ndeg/2+1, ndeg, by=1),] = M[seq(ndeg/2+1, ndeg, by=1),]+log(2,2)
 
   we[[j]][isim,] <- arrayWeightsSimple(M)
   wegbg[[j]][isim,] <- arrayWeights(M)
 
   fitremove <- lmFit(M[,-3])
   fitremove <- eBayes(fitremove)
   fitwts <- lmFit(M, weights=we[[j]][isim,])
   fitwts <- eBayes(fitwts)
   fitwtsgbg <- lmFit(M, weights=wegbg[[j]][isim,])
   fitwtsgbg <- eBayes(fitwtsgbg)
   fitnowts <- lmFit(M, weights=NULL)
   fitnowts <- eBayes(fitnowts)

   ord.tstat <- cbind(fitremove$coef/(fitremove$sigma*fitremove$stdev.unscaled),   
                      fitnowts$coef/(fitnowts$sigma*fitnowts$stdev.unscaled),  
                      fitwts$coef/(fitwts$sigma*fitwts$stdev.unscaled),
                      fitwtsgbg$coef/(fitwtsgbg$sigma*fitwtsgbg$stdev.unscaled))
   mod.tstat <- cbind(fitremove$t, fitnowts$t, fitwts$t, fitwtsgbg$t)
   fdr.ord.tstat <- data.frame(remove=getfdr(ord.tstat[,1], ndeg), nowts=getfdr(ord.tstat[,2], ndeg), wts=getfdr(ord.tstat[,3], ndeg), wtsgbg=getfdr(ord.tstat[,4], ndeg))
   fdr.mod.tstat <- data.frame(remove=getfdr(mod.tstat[,1], ndeg), nowts=getfdr(mod.tstat[,2], ndeg), wts=getfdr(mod.tstat[,3], ndeg), wtsgbg=getfdr(mod.tstat[,4], ndeg))
   fdr.tally <- fdr.tally+cbind(fdr.ord.tstat, fdr.mod.tstat)
   cat(isim, " ")
   }
   fdr.emp[[j]] <- fdr.tally
}

save(fdr.emp, we, wegbg, file="sim1000-3arrays.nonnormal.rda")

# Results
# 3 arrays normal data
load("sim1000-3arrays.normal.rda")
fdr3norm <- fdr

round(apply(wn[[1]], 2, FUN="mean"),2)
round(apply(wn[[1]], 2, FUN="sd"),2)
round(apply(wn[[2]], 2, FUN="mean"),2)
round(apply(wn[[2]], 2, FUN="sd"),2)
round(apply(wn[[3]], 2, FUN="mean"),2)
round(apply(wn[[3]], 2, FUN="sd"),2)

round(apply(wn[[1]], 2, FUN="mean")/max(apply(wn[[1]], 2, FUN="mean")),2)
round(apply(wn[[2]], 2, FUN="mean")/max(apply(wn[[2]], 2, FUN="mean")),2)
round(apply(wn[[3]], 2, FUN="mean")/max(apply(wn[[3]], 2, FUN="mean")),2)

round(apply(wngbg[[1]], 2, FUN="mean"),2)
round(apply(wngbg[[1]], 2, FUN="sd"),2)
round(apply(wngbg[[2]], 2, FUN="mean"),2)
round(apply(wngbg[[2]], 2, FUN="sd"),2)
round(apply(wngbg[[3]], 2, FUN="mean"),2)
round(apply(wngbg[[3]], 2, FUN="sd"),2)

round(apply(wngbg[[1]], 2, FUN="mean")/max(apply(wngbg[[1]], 2, FUN="mean")),2)
round(apply(wngbg[[2]], 2, FUN="mean")/max(apply(wngbg[[2]], 2, FUN="mean")),2)
round(apply(wngbg[[3]], 2, FUN="mean")/max(apply(wngbg[[3]], 2, FUN="mean")),2)


# 3 arrays non-normal data
load("sim1000-3arrays.nonnormal.rda")
fdr3nonnorm <- fdr.emp
round(apply(we[[1]], 2, FUN="mean"),2)
round(apply(we[[1]], 2, FUN="sd"),2)
round(apply(we[[2]], 2, FUN="mean"),2)
round(apply(we[[2]], 2, FUN="sd"),2)
round(apply(we[[3]], 2, FUN="mean"),2)
round(apply(we[[3]], 2, FUN="sd"),2)

round(apply(we[[1]], 2, FUN="mean")/max(apply(we[[1]], 2, FUN="mean")),2)
round(apply(we[[2]], 2, FUN="mean")/max(apply(we[[2]], 2, FUN="mean")),2)
round(apply(we[[3]], 2, FUN="mean")/max(apply(we[[3]], 2, FUN="mean")),2)

round(apply(wegbg[[1]], 2, FUN="mean"),2)
round(apply(wegbg[[1]], 2, FUN="sd"),2)
round(apply(wegbg[[2]], 2, FUN="mean"),2)
round(apply(wegbg[[2]], 2, FUN="sd"),2)
round(apply(wegbg[[3]], 2, FUN="mean"), 2)
round(apply(wegbg[[3]], 2, FUN="sd"),2)

round(apply(wegbg[[1]], 2, FUN="mean")/max(apply(wegbg[[1]], 2, FUN="mean")),2)
round(apply(wegbg[[2]], 2, FUN="mean")/max(apply(wegbg[[2]], 2, FUN="mean")),2)
round(apply(wegbg[[3]], 2, FUN="mean")/max(apply(wegbg[[3]], 2, FUN="mean")),2)


# Code to calculate averages and sd's
apply(wn[[1]], 2, FUN="mean")
apply(wn[[1]], 2, FUN="sd")
apply(wn[[2]], 2, FUN="mean")
apply(wn[[2]], 2, FUN="sd")
apply(wn[[3]], 2, FUN="mean")
apply(wn[[3]], 2, FUN="sd")

apply(wngbg[[1]], 2, FUN="mean")
apply(wngbg[[1]], 2, FUN="sd")
apply(wngbg[[2]], 2, FUN="mean")
apply(wngbg[[2]], 2, FUN="sd")
apply(wngbg[[3]], 2, FUN="mean")
apply(wngbg[[3]], 2, FUN="sd")

apply(we[[1]], 2, FUN="mean")
apply(we[[1]], 2, FUN="sd")
apply(we[[2]], 2, FUN="mean")
apply(we[[2]], 2, FUN="sd")
apply(we[[3]], 2, FUN="mean")
apply(we[[3]], 2, FUN="sd")

apply(wegbg[[1]], 2, FUN="mean")
apply(wegbg[[1]], 2, FUN="sd")
apply(wegbg[[2]], 2, FUN="mean")
apply(wegbg[[2]], 2, FUN="sd")
apply(wegbg[[3]], 2, FUN="mean")
apply(wegbg[[3]], 2, FUN="sd")

# fdr plots
ind <- seq(1:10000)
lwd = 2.5
lty = 1
xlim=c(0,400)
ylim=c(0,200)
plotnames <- NULL
plotnames[[1]] <- c("(a)", "(b)")
plotnames[[2]] <- c("(c)", "(d)")
plotnames[[3]] <- c("(e)", "(f)")
nsim = 1000

colgray <- gray(c(0, 0.8, 0.5, 0, 0.8, 0.5)) 
keep <- c(2,3,5,6,7)  
par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0))
par(mfrow=c(3,2))
for(j in 1:3) {
  lty=1
  plot(ind, fdr3norm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", 
       col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim, asp=1)
  if(j==1)
  legend(-15,225, legend=c("ord t remove array", "ord t array wts", "ord t equal wts",
                           "mod t remove array", "mod t array wts", "mod t equal wts"),
                           col=colgray[c(1,3,2,1,3,2)], lty= rep(c(1,2), each=3),
                           lwd=1.5, cex=0.7, bty="n")
  for(i in seq(along=keep)) {
     if(i<=2)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr3norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1])
     }
  lty=1
  plot(ind, fdr3nonnorm[[j]][,1]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", 
       col=colgray[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim, asp=1)
  for(i in seq(along=keep)) {
     if(i<=2)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr3nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=colgray[i+1])
     }
  }
mtext("Number of genes selected", side=1, outer=TRUE)
mtext("Number of false positives", side=2, outer=TRUE)
mtext("        Normal                                              Non-normal", side=3, outer=TRUE)

# Plot for Supplementary materials
ind <- seq(1:10000)
lwd = 2.5
lty = 1
xlim=c(0,400)
ylim=c(0,200)
plotnames <- NULL
plotnames[[1]] <- c("(a)", "(b)")
plotnames[[2]] <- c("(c)", "(d)")
plotnames[[3]] <- c("(e)", "(f)")
nsim = 1000
cols <- c(2,3,2,3)
keep <- c(4,7,8)

# png("sim3compare.png")
par(mai=c(0.3, 0.3, 0.2, 0.1), oma=c(1.2,1.2,1.2,0))
par(mfrow=c(3,2))
for(j in 1:3) {
  lty=1
  plot(ind, fdr3norm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][1], type="l", 
       col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim)
  if(j==1)
  legend(-5,200, legend=c("ord t array wts, full", "ord t array wts, gene-by-gene",
                           "mod t array wts, full", 
                           "mod t array wts, gene-by-gene"),
                           col=cols, lty= rep(c(1,2), each=2), cex=0.7, bty="n")
  for(i in seq(along=keep)) {
     if(i<=1)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr3norm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1])
     }
  lty=1
  plot(ind, fdr3nonnorm[[j]][,3]/nsim, xlab="", ylab="", main=plotnames[[j]][2], type="l", 
       col=cols[1], lwd=lwd, lty=lty, xlim=xlim, ylim=ylim)
  for(i in seq(along=keep)) {
     if(i<=1)
        lty=1
     else
        lty=2
     ind2 <- keep[i]
     points(ind, fdr3nonnorm[[j]][,ind2]/nsim, type="l", lwd=lwd, lty=lty, col=cols[i+1])
     }
  }
mtext("Number of genes selected", side=1, outer=TRUE)
mtext("Number of false positives", side=2, outer=TRUE)
mtext("        Normal                                              Non-normal", side=3, outer=TRUE)
# dev.off()