# This simulation generates a mixture of complex peak structures. # This is done by treating each peak as a mixture of three subpeaks. # For DB peaks, one or more of those subpeaks are randomly chosen and eliminated. require(xscss) require(csaw) require(edgeR) fraglen <- 100 iters <- 10 npeaks <- 20000 nde <- 500 prior.df <- 20 dispersion <- 1/prior.df # For the sake of possible flexibility later; right now, it just cancels out. grouping <- c("A","A","B","B") true.width <- 500 base.mu <- 30 if (is.tf) { all.fix <- "tf" radius <- fraglen if (is.homo) { prior.df <- 1e8 all.fix <- "tfx" base.mu <- 10 up.mu <- 20 down.mu <- 0 } else { up.mu <- 45 down.mu <- 15 } } else { is.homo <- FALSE all.fix <- "hist" radius <- true.width/2L } #################################################################################################### design<-model.matrix(~factor(grouping)) ofile <- paste0(all.fix, "_out.tsv") fdr.thresholds<-c(0.01, 0.05, 0.1, 0.15, 0.2) result.file <- paste0(all.fix, "_result.tsv") unlink(result.file) dump2file <- function(id, cutoff, result) { write.table(file=result.file, data.frame(id, cutoff, 1-result$overlap/result$found, result$recall), sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE, append=TRUE) } set.seed(21347234) #################################################################################################### for (it in 1:iters) { ### Generating simulated data for histone mark data. up.pk <- 1:nde down.pk <- 1:nde + nde disp <- prior.df*dispersion/rchisq(npeaks, df=prior.df) if (!is.tf) { type.A.1 <- type.A.2 <- type.A.3 <- !logical(npeaks) chosen.drop.A <- sample(1:6, nde, replace=TRUE) type.A.1[up.pk] <- bitwAnd(chosen.drop.A, 0x1) > 0L type.A.2[up.pk] <- bitwAnd(chosen.drop.A, 0x2) > 0L type.A.3[up.pk] <- bitwAnd(chosen.drop.A, 0x4) > 0L type.B.1 <- type.B.2 <- type.B.3 <- !logical(npeaks) chosen.drop.B <- sample(1:6, nde, replace=TRUE) type.B.1[down.pk] <- bitwAnd(chosen.drop.B, 0x1) > 0L type.B.2[down.pk] <- bitwAnd(chosen.drop.B, 0x2) > 0L type.B.3[down.pk] <- bitwAnd(chosen.drop.B, 0x4) > 0L } # Single chromosome, for convenience. distances<-round(runif(npeaks, 10000, 20000)) pos.1 <- cumsum(distances) if (!is.tf) { pos.2 <- pos.1 + true.width/2 pos.3 <- pos.1 + true.width sizes <- c(chrA=max(pos.3) + 10000) } else { sizes <- c(chrA=max(pos.1) + 10000) } chrs <- rep("chrA", npeaks) fnames<-list() for (lib in 1:length(grouping)) { fname <- paste0(all.fix, "_out_", lib, ".sam") if (!is.tf) { # Simulating complex histone marks. if (grouping[lib]=="A") { drop.1 <- type.A.1 drop.2 <- type.A.2 drop.3 <- type.A.3 } else { drop.1 <- type.B.1 drop.2 <- type.B.2 drop.3 <- type.B.3 } peakFile(fname, chrs=chrs[drop.1], pos=pos.1[drop.1], mu=base.mu, disp=disp[drop.1], sizes=sizes, fraglen=fraglen, width=true.width, tf=FALSE) peakFile(fname, chrs=chrs[drop.2], pos=pos.2[drop.2], mu=base.mu, disp=disp[drop.2], sizes=sizes, fraglen=fraglen, width=true.width, tf=FALSE, append=TRUE) peakFile(fname, chrs=chrs[drop.3], pos=pos.3[drop.3], mu=base.mu, disp=disp[drop.3], sizes=sizes, fraglen=fraglen, width=true.width, tf=FALSE, append=TRUE) } else { # Simulating simple TF changes. cur.mu <- rep(base.mu, npeaks) if (grouping[lib]=="A") { cur.mu[down.pk] <- down.mu cur.mu[up.pk] <- up.mu } else { cur.mu[up.pk] <- down.mu cur.mu[down.pk] <- up.mu } peakFile(fname, chrs=chrs, pos=pos.1, mu=cur.mu, disp=disp, sizes=sizes, fraglen=fraglen, width=true.width, tf=TRUE) } fnames[[lib]]<-fname } fnames<-unlist(fnames) addBackground(fnames, sizes=sizes, width=2000, rlen=10, dispersion=dispersion, prior.df=prior.df, append=TRUE) bam.files<-crunch2BAM(fnames) unlink(fnames) lfile <- paste0(all.fix, "_log.txt") if (is.tf) { write.table(file=lfile, data.frame(chr=chrs[up.pk], start=pos.1[up.pk]-radius, end=pos.1[up.pk]+radius, logFC=1), row.names=FALSE, sep="\t", quote=FALSE) write.table(file=lfile, data.frame(chrs[down.pk], pos.1[down.pk]-radius, pos.1[down.pk]+radius, logFC=-1), row.names=FALSE, sep="\t", quote=FALSE, append=TRUE, col.names=FALSE) } else { write.table(file=lfile, data.frame(chr=chrs[!type.A.1], start=pos.1[!type.A.1]-radius, end=pos.1[!type.A.1]+radius, logFC=-1, name=which(!type.A.1)), row.names=FALSE, sep="\t", quote=FALSE) write.table(file=lfile, data.frame(chr=chrs[!type.A.2], start=pos.2[!type.A.2]-radius, end=pos.2[!type.A.2]+radius, logFC=-1, name=which(!type.A.2)), row.names=FALSE, sep="\t", quote=FALSE, append=TRUE, col.names=FALSE) write.table(file=lfile, data.frame(chr=chrs[!type.A.3], start=pos.3[!type.A.3]-radius, end=pos.3[!type.A.3]+radius, logFC=-1, name=which(!type.A.3)), row.names=FALSE, sep="\t", quote=FALSE, append=TRUE, col.names=FALSE) write.table(file=lfile, data.frame(chr=chrs[!type.B.1], start=pos.1[!type.B.1]-radius, end=pos.1[!type.B.1]+radius, logFC=1, name=which(!type.B.1)), row.names=FALSE, sep="\t", quote=FALSE, append=TRUE, col.names=FALSE) write.table(file=lfile, data.frame(chr=chrs[!type.B.2], start=pos.2[!type.B.2]-radius, end=pos.2[!type.B.2]+radius, logFC=1, name=which(!type.B.2)), row.names=FALSE, sep="\t", quote=FALSE, append=TRUE, col.names=FALSE) write.table(file=lfile, data.frame(chr=chrs[!type.B.3], start=pos.3[!type.B.3]-radius, end=pos.3[!type.B.3]+radius, logFC=1, name=which(!type.B.3)), row.names=FALSE, sep="\t", quote=FALSE, append=TRUE, col.names=FALSE) } ############################################################# ### Running HOMER or MACS with DiffBind. ############################################################# peakdir <- paste0(all.fix, "_peaks") dir.create(peakdir, showWarnings=FALSE) prefix <- sub("\\.bam$", "", basename(bam.files)) gsize <- sum(as.numeric(sizes)) for (peakcaller in c("HOMER", "MACS", "SICER")) { all.peakfiles <- list() if (peakcaller=="HOMER") { pktype <- "bed" hpdir <- file.path(peakdir, "homerpool") dir.create(hpdir) for (x in 1:length(bam.files)) { pooldir <- file.path(hpdir, prefix[x]) dir.create(pooldir) system(paste("makeTagDirectory", pooldir, bam.files[x], "-format sam -keepAll")) outers <- file.path(peakdir, paste0("homer_", prefix[x], ".txt")) system(paste("findPeaks", pooldir, "-style", ifelse(is.tf, "factor", "histone"), "-o", outers, "-gsize", gsize, "-fragLength", fraglen, "-tbp", 0)) # Converting to proper BED format. blah <- read.table(outers) write.table(file=outers, blah[,c(2,3,4,1,8)], row.names=FALSE, quote=FALSE, sep="\t", col.names=FALSE) all.peakfiles[[x]] <- outers } unlink(hpdir, recursive=TRUE) } else if (peakcaller=="MACS") { pktype <- "macs" for (x in 1:length(bam.files)) { oprefix <- file.path(peakdir, paste0("macs_", prefix[x])) runMACS2(bam.files[x], oprefix, fraglen=fraglen, gsize=gsize, format="BAM") all.peakfiles[[x]] <- paste0(oprefix, "_peaks.xls") } macs.peakfiles <- all.peakfiles # For use with DBChIP. } else if (peakcaller=="SICER") { if (is.tf || it > 1L) { next } # Only doing it for histone mark data, and just once, for demonstration. pktype <- "bed" scdir <- file.path(peakdir, "sicerbed") dir.create(scdir) if (!file.exists("~/tmp")) { dir.create("~/tmp") } win <- 200 gap <- win*2L eval <- 1 all.peakfiles <- list() for (x in 1:length(bam.files)) { cur.bed <- "out.bed" convertBamToBed(bam.files[x], file.path(scdir, cur.bed)) system(paste("methods/SICER_V1.1/SICER/SICER-rb.sh", scdir, cur.bed, scdir, "Simulated", 2147483647, win, fraglen, 1, gap, eval)) newname <- file.path(peakdir, paste0("sicer_", x, ".bed")) blah <- read.table(file.path(scdir, sprintf("out-W%i-G%i-E%i.scoreisland", win, gap, eval))) write.table(data.frame(blah[,1:3], paste0("peak-", 1:nrow(blah)), blah[,4]), file=newname, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) all.peakfiles[[x]] <- newname } unlink("chr.list") unlink(scdir, recursive=TRUE) } # Running DiffBind. require(DiffBind) current <- dba(sampleSheet=data.frame(SampleID=prefix, Condition=grouping, bamReads=bam.files, Peaks=unlist(all.peakfiles), PeakCaller=pktype), minOverlap=2) current <- dba.count(current, fragmentSize=fraglen, bRemoveDuplicates=FALSE) current <- dba.contrast(current, group1=current$masks$A, group2=current$masks$B, name1="A", name2="B", minMembers=2) current <- dba.analyze(current, method=DBA_EDGER, bTagwise=!is.homo) # Avoiding tagwise.dispersion when data is homoskedastic. out <- dba.report(current, th=1) curtab <- as.data.frame(elementMetadata(out)) names(curtab)[names(curtab)=="Fold"] <- "logFC" for (cutoff in fdr.thresholds) { resultDump(out, curtab, cutoff, out=ofile) result<-assessChIP(ofile, lfile, tol=NA, checkfc=FALSE) # Don't bother checking fold change, as it isn't well defined for complex events. dump2file(paste("DiffBind +", peakcaller), cutoff, result) } } ############################################################# # Running MACS with DBChIP. ############################################################# if (is.tf) { all.bed <- list() for (x in 1:length(bam.files)) { curbed <- file.path(peakdir, sub("\\.bam$", ".bed", basename(bam.files[x]))) convertBamToBed(bam.files[x], curbed) all.bed[[x]] <- curbed } all.bed <- unlist(all.bed) conds <- factor(as.character(grouping)) # # Re-running MACS on pooled reads for each condition. # pool.1 <- file.path(peakdir, "pooledA.bed") # system(paste(c("cat", all.bed[grouping=="A"], ">", pool.1), collapse=" ")) # opref.1 <- file.path(peakdir, "macs_A") # runMACS2(pool.1, opref.1, fraglen=fraglen, gsize=gsize) # # pool.2 <- file.path(peakdir, "pooledB.bed") # system(paste(c("cat", all.bed[grouping=="B"], ">", pool.2), collapse=" ")) # opref.2 <- file.path(peakdir, "macs_B") # runMACS2(pool.2, opref.2, fraglen=fraglen, gsize=gsize) # # binding.site.list <- list() # binding.site.list[["A"]] <- read.table(paste0(opref.1, "_peaks.xls"), header=TRUE)[,c("chr", "abs_summit", "X.log10.pvalue.")] # binding.site.list[["B"]] <- read.table(paste0(opref.2, "_peaks.xls"), header=TRUE)[,c("chr", "abs_summit", "X.log10.pvalue.")] # colnames(binding.site.list$A) <- colnames(binding.site.list$B) <- c("chr", "pos", "weight") # Loading the peaks, using the first peak set for each condition. binding.site.list <- list() for (x in unique(grouping)) { binding.site.list[[x]] <- read.table(macs.peakfiles[[which(grouping==x)[1]]], header=TRUE)[,c("chr", "abs_summit", "X.log10.pvalue.")] colnames(binding.site.list[[x]]) <- c("chr", "pos", "weight") } # Running DBCHIP. require(DBChIP) bs.list <- read.binding.site.list(binding.site.list) consensus.site <- site.merge(bs.list) chip.data.list <- as.list(all.bed) names(chip.data.list) <- names(conds) <- paste0("x", 1:length(conds)) # Just to avoid confusion. dat <- load.data(chip.data.list=chip.data.list, conds=conds, consensus.site=consensus.site, data.type="BED", chr.vec=names(sizes), chr.len.vec=sizes, frag.len=fraglen) dat <- get.site.count(dat) dat <- test.diff.binding(dat) # Default uses common dispersion, exactTest; no need for special behaviour when is.homo=TRUE. rept <- report.peak(dat, n=Inf) out <- GRanges(rept$chr, IRanges(rept$pos, rept$pos)) curtab <- data.frame(logFC=log2(rept$FC.B), PValue=rept$pval, FDR=rept$FDR) for (cutoff in fdr.thresholds) { resultDump(out, curtab, cutoff, out=ofile) result<-assessChIP(ofile, lfile, tol=NA, checkfc=FALSE) dump2file("DBChIP + MACS", cutoff, result) } unlink(all.bed) } ############################################################# # Running PePr. ############################################################# outname <- file.path(peakdir, "pepr") first.set <- paste(bam.files[grouping=="A"], collapse=",") second.set <- paste(bam.files[grouping=="B"], collapse=",") system(sprintf("python methods/PePr-master/PePr/PePr.py -c %s --chip2 %s -n %s -f BAM --peaktype=%s --diff -s %i", first.set, second.set, outname, ifelse(is.tf, "sharp", "broad"), fraglen/2L)) collected.up <- read.table(paste0(outname, "__PePr_up_peaks.bed")) collected.down <- read.table(paste0(outname, "__PePr_down_peaks.bed")) all.collected <- c(GRanges(collected.up[,1], IRanges(collected.up[,2], collected.up[,3])), GRanges(collected.down[,1], IRanges(collected.down[,2], collected.down[,3]))) curtab <- rbind(collected.up[,5:6], collected.down[,5:6]) curtab <- cbind(c(rep(1, nrow(collected.up)), rep(-1, nrow(collected.down))), curtab) colnames(curtab) <- c("logFC", "PValue", "FDR") for (cutoff in fdr.thresholds) { # We can't turn up the default window threshold (otherwise everything gets aggregated into a peak). # We'll have to work with the FDRs that are reported, most of which are below 0.05. As such, # results will be conservative for most of the cut-offs. resultDump(all.collected, curtab, cutoff, out=ofile) result<-assessChIP(ofile, lfile, tol=NA, checkfc=FALSE) dump2file("PePr", cutoff, result) } unlink(list.files(pattern="debug\\.log$")) ############################################################# # Running multiGPS, but only once for each data set. ############################################################# if (is.tf && it==1L) { gpsdir <- file.path(peakdir, "multigps") dir.create(gpsdir) file.spec <- paste0('--expt', grouping, "-", as.integer(duplicated(grouping))+1, " ", bam.files) gdata <- file.path(gpsdir, "gdata.data") write.table(data.frame("chrA", gsize), file=gdata, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) system(sprintf("java -Xmx4G -jar methods/multigps_v0.5.jar --geninfo %s %s --out %s", gdata, paste(file.spec, collapse=" "), gpsdir)) file.copy(file.path(gpsdir, "multigps_A.events"), file.path(peakdir, "multigps_A.events")) file.copy(file.path(gpsdir, "multigps_B.events"), file.path(peakdir, "multigps_B.events")) null <- data.frame(character(0), matrix(0, ncol=8, nrow=0)) collected.up <- tryCatch({ read.table(file.path(peakdir, "multigps_A.events"), stringsAsFactors=FALSE) }, error=function(w) { null }) collected.down <- tryCatch({ read.table(file.path(peakdir, "multigps_B.events"), stringsAsFactors=FALSE) }, error=function(w) { null }) cur.str <- c(collected.up[,1], collected.down[,1]) cur.pos <- as.integer(sub(".*:", "", cur.str)) cur.chr <- sub(":.*", "", cur.str) granges <- GRanges(cur.chr, IRanges(cur.pos, cur.pos)) curtab <- data.frame(logCPM=c(collected.up[,6], collected.down[,6]), logFC=c(collected.up[,7], -collected.down[,7]), FDR=2^c(collected.up[,8], collected.down[,8])) # Assuming already BH-adjusted (see setting of DEPVal in EdgeRDifferentialEnrichment.java, # ultimately used in setInterCondP to set interCondP for final printing to *.events via # writeBindingEventFiles in DifferentialTester.java. for (cutoff in fdr.thresholds) { resultDump(granges, curtab, cutoff, out=ofile) result<-assessChIP(ofile, lfile, tol=NA, checkfc=FALSE) dump2file("multiGPS", cutoff, result) } unlink(gdata) unlink(gpsdir, recursive=TRUE) } # regs <- read.table(outers, sep="\t") # regions <- GRanges(regs[,2], IRanges(regs[,3], regs[,4])) # data<-regionCounts(bam.files, regions, ext=1L, param=readParam(dedup=FALSE)) # keep<-rowSums(assay(data))>=filter # counts<-assay(data)[keep,,drop=FALSE] # tabhom<-analyzeQLCounts(counts, design, totals=data$totals) # unlink(pooldir, recursive=TRUE) ############################################################# ### csaw, with its combined window methodology. ############################################################ xparam <- readParam(dedup=FALSE) if (is.tf) { test.widths <- 10 names <- c("csaw") } else { test.widths <- c(50, 150, 250) names <- c("csaw.short", "csaw", "csaw.long") } for (w in seq_along(test.widths)) { data <- windowCounts(bam.files, width=test.widths[w], ext=fraglen, param=xparam, filter=20) binned <- windowCounts(bam.files, width=2000, bin=TRUE, param=xparam) bin.ab <- scaledAverage(asDGEList(binned), scale=median(getWidths(binned))/median(getWidths(data))) threshold <- median(bin.ab) + log2(2) keep <- aveLogCPM(asDGEList(data)) > threshold data <- data[keep,] tabres <- analyzeQLCounts(assay(data), design, totals=data$totals) merged <- mergeWindows(rowRanges(data), tol=100, max.width=5000) tabneg <- combineTests(merged$id, tabres) for (cutoff in fdr.thresholds) { resultDump(merged$region, tabneg, cutoff, out=ofile) result<-assessChIP(ofile, lfile, tol=NA, checkfc=FALSE) dump2file(names[w], cutoff, result) } } } #################################################################################################### # Cleaning up. unlink(bam.files) unlink(paste0(bam.files, ".bai")) unlink(lfile) unlink(ofile) ####################################################################################################