htswanalysis/bin/macs

   1 #!/usr/bin/env python
   2 # Time-stamp: <2008-05-21 18:19:59 Tao Liu>
   3
   4 """Module Description
   5
   6 Copyright (c) 2008 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
   7
   8 This code is free software; you can redistribute it and/or modify it
   9 under the terms of the Artistic License (see the file COPYING included
  10 with the distribution).
  11
  12 @status:  beta
  13 @version: $Revision$
  14 @author:  Yong Zhang, Tao Liu
  15 @contact: taoliu@jimmy.harvard.edu
  16 """
  17
  18 # ------------------------------------
  19 # python modules
  20 # ------------------------------------
  21
  22 import os
  23 import sys
  24 import logging
  25 import math
  26 from optparse import OptionParser
  27 # ------------------------------------
  28 # own python modules
  29 # ------------------------------------
  30 from MACS.gsl.cdf import binom_cdf_Q
  31 from MACS.TabIO import BEDParser, ELANDParser
  32 from MACS.PeakModel import PeakModel
  33 from MACS.PeakDetect import PeakDetect
  34
  35 # ------------------------------------
  36 # Main function
  37 # ------------------------------------
  38 def main():
  39     usage = "usage: %prog <-t tfile> [options]"
  40     description = "MACS -- Model-based Analysis for ChIP-Sequencing"
  41
  42     # Parse options...
  43     optparser = OptionParser(version="%prog 1.0 beta",description=description,usage=usage,add_help_option=False)
  44     optparser.add_option("-h","--help",action="help",help="show this help message and exit.")
  45     optparser.add_option("-t","--treatment",dest="tfile",type="string",
  46                          help="ChIP-seq treatment files. REQUIRED")
  47     optparser.add_option("-c","--control",dest="cfile",type="string",
  48                          help="control files.")
  49     optparser.add_option("--name",dest="name",type="string",
  50                          help="experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
  51                          default="NA")
  52     optparser.add_option("--format",dest="format",type="string",
  53                          help="Format of tag file, \"ELAND\" or \"BED\". DEFAULT: \"BED\"",
  54                          default="bed")
  55     optparser.add_option("--gsize",dest="gsize",type="int",default=2700000000,
  56                          help="genome size, default:2700000000")
  57     optparser.add_option("--tsize",dest="tsize",type="int",default=25,
  58                          help="tag size. DEFAULT: 25")
  59     optparser.add_option("--bw",dest="bw",type="int",default=300,
  60                          help="band width. DEFAULT: 300")
  61     optparser.add_option("--pvalue",dest="pvalue",type="float",default=1e-5,
  62                          help="pvalue cutoff for peak detection. DEFAULT: 1e-5")
  63     optparser.add_option("--mfold",dest="mfold",type="int",default=32,
  64                          help="select the regions with MFOLD high-confidence enrichment ratio against background to build model. DEFAULT:32")
  65 #     optparser.add_option("--wig",dest="wig",action="store_true",
  66 #                          help="Whether or not to save the peak shape for every peak region in wiggle file",
  67 #                          default=False)
  68     optparser.add_option("--verbose",dest="verbose",type="int",default=2,
  69                          help="set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
  70     (options,args) = optparser.parse_args()
  71
  72     if not options.tfile:
  73         optparser.print_help()
  74         sys.exit(1)
  75
  76     # configure the logging instance
  77     logging.basicConfig(level=(4-options.verbose)*10,
  78                         format='%(levelname)-5s @ %(asctime)s: %(message)s ',
  79                         datefmt='%a, %d %b %Y %H:%M:%S',
  80                         stream=sys.stderr,
  81                         filemode="w"
  82                         )
  83
  84     if not os.path.isfile (options.tfile):
  85         logging.error("No such file: %s!" % options.tfile)
  86         sys.exit(1)
  87     if options.cfile and not os.path.isfile (options.cfile):
  88         logging.error("No such file: %s!" % options.cfile)
  89         sys.exit(1)
  90
  91     #0 output arguments
  92     logging.info("#0 arguments list:")
  93     logging.info("#0 name = %s" % (options.name))
  94     logging.info("#0 ChIP-seq file = %s" % (options.tfile))
  95     logging.info("#0 control file = %s" % (options.cfile))
  96     logging.info("#0 effective genome size = %.2e" % (options.gsize))
  97     logging.info("#0 tag size = %d" % (options.tsize))
  98     logging.info("#0 band width = %d" % (options.bw))
  99     logging.info("#0 model fold = %s" % (options.mfold))
 100     #logging.info("#0 model peaks = %s" % (options.mpeaks))
 101     logging.info("#0 pvalue cutoff = %.2e" % (options.pvalue))
 102
 103     log_pvalue = math.log(options.pvalue,10)*-10
 104
 105     #1 Read tag files
 106     logging.info("#1 read tag files...")
 107     format = options.format.upper()
 108     (treat, control) = load_tag_files (options.tfile, options.cfile,format)
 109     logging.debug("#1.3 calculate max duplicate tags in single position based on binomal distribution...")
 110     max_dup_tags = cal_max_dup_tags(options.gsize,treat.total_w_duplicates)
 111     logging.debug("#1.3  max_dup_tags = %d" % (max_dup_tags))
 112     logging.debug("#1.3  unique tags in treatment: %d" % (treat.total))
 113     logging.debug("#1.3  total tags in treatment: %d" % (treat.total_w_duplicates))
 114     if control:
 115         logging.debug("#1.3  unique tags in control: %d" % (control.total))
 116         logging.debug("#1.3  total tags in control: %d" % (control.total_w_duplicates))
 117     bg_redundant = bg_r (treat,max_dup_tags)
 118     logging.info("#1.3  Background Redundant rate: %.2f" % (bg_redundant))
 119     logging.info("#1.3 finished!")
 120     #2 Build Model
 121     logging.info("#2 Build Peak Model...")
 122     peakmodel = PeakModel(treatment = treat,
 123                           gz = options.gsize, # mappable genome size
 124                           bw = options.bw,
 125                           fold = options.mfold,
 126                           max_pairnum = 1000,
 127                           ts = options.tsize,
 128                           bg = bg_redundant,
 129                           max_dup_tags = max_dup_tags
 130                           )
 131     logging.info("#2 finished!")
 132     logging.debug("#2  Summary Model:")
 133     logging.debug("#2   min_tags: %d" % (peakmodel.min_tags))
 134     logging.debug("#2   frag_length: %d" % (peakmodel.frag_length))
 135     logging.debug("#2   scan_window: %d" % (peakmodel.scan_window))
 136     logging.info("#2.2 Generate R script for model")
 137     model2r_script(peakmodel,options.name)
 138     #3 Call Peaks
 139     logging.info("#3 Call peaks...")
 140     peakdetect = PeakDetect(treat = treat,
 141                             control = control,
 142                             frag_length = peakmodel.frag_length,
 143                             scan_window = peakmodel.scan_window,
 144                             pvalue = log_pvalue,
 145                             gsize = options.gsize
 146                             )
 147     peakdetect.call_peaks()
 148     #4 output
 149     #4.1 peaks in XLS
 150     ofile_xls = options.name+"_peaks.xls"
 151     logging.info("#4.1 Write output xls file... %s" % (ofile_xls))
 152     ofhd_xls = open(ofile_xls,"w")
 153     ofhd_xls.write("# This file is generated by MACS\n")
 154     ofhd_xls.write("# name = %s\n" % (options.name))
 155     ofhd_xls.write("# ChIP-seq file = %s\n" % (options.tfile))
 156     ofhd_xls.write("# control file = %s\n" % (options.cfile))
 157     ofhd_xls.write("# effective genome size = %.2e\n" % (options.gsize))
 158     ofhd_xls.write("# tag size = %d\n" % (options.tsize))
 159     ofhd_xls.write("# band width = %d\n" % (options.bw))
 160     ofhd_xls.write("# model fold = %s\n" % (options.mfold))
 161     ofhd_xls.write("# pvalue cutoff = %.2e\n" % (options.pvalue))
 162     ofhd_xls.write("# frag_len = %d\n" % (peakdetect.frag_length))
 163     ofhd_xls.write(peakdetect.toxls())
 164     ofhd_xls.close()
 165     #4.2 peaks in BED
 166     ofile_bed = options.name+"_peaks.bed"
 167     logging.info("#4.2 Write output bed file... %s" % (ofile_bed))
 168     ofhd_bed = open(ofile_bed,"w")
 169     ofhd_bed.write(peakdetect.tobed())
 170     ofhd_bed.close()
 171
 172     #4.3 negative peaks in XLS
 173     if options.cfile:
 174         ofile_xls = options.name+"_negative_peaks.xls"
 175         logging.info("#4.3 Write output xls file for negative peaks... %s" % (ofile_xls))
 176         ofhd_xls = open(ofile_xls,"w")
 177         ofhd_xls.write(peakdetect.neg_toxls())
 178         ofhd_xls.close()
 179
 180     logging.info("#5 Done! Check the output files!\n")
 181
 182 def bg_r (trackI, max_dup_tags):
 183     total_tags  = trackI.total_w_duplicates
 184     total_duplicates = 0
 185     chrs = trackI.get_chr_names()
 186     chrs.sort()
 187     for chrom in chrs:
 188         countlist = trackI.get_comments_by_chr (chrom)
 189         for c in countlist[0]:
 190             total_duplicates += max(c-max_dup_tags,0)
 191         for c in countlist[1]:
 192             total_duplicates += max(c-max_dup_tags,0)
 193     return float(total_duplicates)/(total_tags)
 194
 195
 196 def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
 197     for i in range (1,1000):
 198         if binom_cdf_Q(1.0/genome_size,tags_number,i) < p:
 199             return i
 200     raise Exception("LAMBDA > 1000!")
 201
 202 def load_tag_files ( tfile, cfile, format ):
 203     """From the options, load treatment tags and control tags (if available).
 204
 205     """
 206     logging.info("#1.1 read treatment tags...")
 207     treat = _merge_then_load(tfile,format)
 208     treat.merge_overlap()
 209     if cfile:
 210         logging.info("#1.2 read input tags...")
 211         control = _merge_then_load(cfile,format)
 212         control.merge_overlap()
 213     else:
 214         control = None
 215     return (treat, control)
 216
 217 def _merge_then_load ( p,format ):
 218     """Merge several tag files and load it to a single FWTrackI object.
 219
 220     """
 221     #s = os.system("cat %s > macs_tmpfile.t" % p)
 222     #if s == 0:
 223     t = _load_tag_file(p,format)
 224     #os.remove("macs_tmpfile.t")
 225     #else:
 226     #    raise Exception("unable to cat files: %s!" % p)
 227     return t
 228
 229 def _load_tag_file ( f, format ):
 230     """Load tag file into single FWTrackI object.
 231
 232     """
 233     fhd = file(f,mode="r")
 234     if format == "ELAND":
 235         p = ELANDParser()
 236     elif format == "BED":
 237         p = BEDParser()
 238     else:
 239         raise Exception("Format \"%s\" cannot be recognized!" % (format))
 240     trackI = p.build_fwtrack(fhd)
 241     return trackI
 242
 243 def model2r_script(model,name):
 244     rfhd = open(name+"_model.r","w")
 245     p = model.plus_line
 246     m = model.minus_line
 247     s = model.shifted_line
 248     w = len(p)
 249     norm_p = [0]*w
 250     norm_m = [0]*w
 251     norm_s = [0]*w
 252     sum_p = sum(p)
 253     sum_m = sum(m)
 254     sum_s = sum(s)
 255     for i in range(w):
 256         norm_p[i] = float(p[i])*100/sum_p
 257         norm_m[i] = float(m[i])*100/sum_m
 258         norm_s[i] = float(s[i])*100/sum_s
 259     rfhd.write("# R script for Peak Model\n")
 260     rfhd.write("#  -- generated by MACS\n")
 261
 262     rfhd.write("""p <- c(%s)
 263 m <- c(%s)
 264 s <- c(%s)
 265 x <- seq.int((length(p)-1)/2*-1,(length(p)-1)/2)
 266 pdf("%s_model.pdf",height=4,width=4)
 267 plot(x,p,type="l",col=c("red"),main="Peak Model",xlab="Distance to the middle",ylab="Percentage")
 268 lines(x,m,col=c("blue"))
 269 lines(x,s,col=c("black"))
 270 abline(v=median(x[p==max(p)]),lty=2,col=c("red"))
 271 abline(v=median(x[m==max(m)]),lty=2,col=c("blue"))
 272 abline(v=median(x[s==max(s)]),lty=2,col=c("black"))
 273 dev.off()
 274 """ % (','.join(map(str,norm_p)),','.join(map(str,norm_m)),','.join(map(str,norm_s)),name))
 275     rfhd.close()
 276
 277 if __name__ == '__main__':
 278     try:
 279         main()
 280     except KeyboardInterrupt:
 281         sys.stderr.write("User interrupt me! ;-) See you!\n")
 282         sys.exit(0)