2 # Time-stamp: <2008-05-21 18:19:59 Tao Liu>
6 Copyright (c) 2008 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
8 This code is free software; you can redistribute it and/or modify it
9 under the terms of the Artistic License (see the file COPYING included
10 with the distribution).
14 @author: Yong Zhang, Tao Liu
15 @contact: taoliu@jimmy.harvard.edu
18 # ------------------------------------
20 # ------------------------------------
26 from optparse import OptionParser
27 # ------------------------------------
29 # ------------------------------------
30 from MACS.gsl.cdf import binom_cdf_Q
31 from MACS.TabIO import BEDParser, ELANDParser
32 from MACS.PeakModel import PeakModel
33 from MACS.PeakDetect import PeakDetect
35 # ------------------------------------
37 # ------------------------------------
39 usage = "usage: %prog <-t tfile> [options]"
40 description = "MACS -- Model-based Analysis for ChIP-Sequencing"
43 optparser = OptionParser(version="%prog 1.0 beta",description=description,usage=usage,add_help_option=False)
44 optparser.add_option("-h","--help",action="help",help="show this help message and exit.")
45 optparser.add_option("-t","--treatment",dest="tfile",type="string",
46 help="ChIP-seq treatment files. REQUIRED")
47 optparser.add_option("-c","--control",dest="cfile",type="string",
48 help="control files.")
49 optparser.add_option("--name",dest="name",type="string",
50 help="experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
52 optparser.add_option("--format",dest="format",type="string",
53 help="Format of tag file, \"ELAND\" or \"BED\". DEFAULT: \"BED\"",
55 optparser.add_option("--gsize",dest="gsize",type="int",default=2700000000,
56 help="genome size, default:2700000000")
57 optparser.add_option("--tsize",dest="tsize",type="int",default=25,
58 help="tag size. DEFAULT: 25")
59 optparser.add_option("--bw",dest="bw",type="int",default=300,
60 help="band width. DEFAULT: 300")
61 optparser.add_option("--pvalue",dest="pvalue",type="float",default=1e-5,
62 help="pvalue cutoff for peak detection. DEFAULT: 1e-5")
63 optparser.add_option("--mfold",dest="mfold",type="int",default=32,
64 help="select the regions with MFOLD high-confidence enrichment ratio against background to build model. DEFAULT:32")
65 # optparser.add_option("--wig",dest="wig",action="store_true",
66 # help="Whether or not to save the peak shape for every peak region in wiggle file",
68 optparser.add_option("--verbose",dest="verbose",type="int",default=2,
69 help="set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
70 (options,args) = optparser.parse_args()
73 optparser.print_help()
76 # configure the logging instance
77 logging.basicConfig(level=(4-options.verbose)*10,
78 format='%(levelname)-5s @ %(asctime)s: %(message)s ',
79 datefmt='%a, %d %b %Y %H:%M:%S',
84 if not os.path.isfile (options.tfile):
85 logging.error("No such file: %s!" % options.tfile)
87 if options.cfile and not os.path.isfile (options.cfile):
88 logging.error("No such file: %s!" % options.cfile)
92 logging.info("#0 arguments list:")
93 logging.info("#0 name = %s" % (options.name))
94 logging.info("#0 ChIP-seq file = %s" % (options.tfile))
95 logging.info("#0 control file = %s" % (options.cfile))
96 logging.info("#0 effective genome size = %.2e" % (options.gsize))
97 logging.info("#0 tag size = %d" % (options.tsize))
98 logging.info("#0 band width = %d" % (options.bw))
99 logging.info("#0 model fold = %s" % (options.mfold))
100 #logging.info("#0 model peaks = %s" % (options.mpeaks))
101 logging.info("#0 pvalue cutoff = %.2e" % (options.pvalue))
103 log_pvalue = math.log(options.pvalue,10)*-10
106 logging.info("#1 read tag files...")
107 format = options.format.upper()
108 (treat, control) = load_tag_files (options.tfile, options.cfile,format)
109 logging.debug("#1.3 calculate max duplicate tags in single position based on binomal distribution...")
110 max_dup_tags = cal_max_dup_tags(options.gsize,treat.total_w_duplicates)
111 logging.debug("#1.3 max_dup_tags = %d" % (max_dup_tags))
112 logging.debug("#1.3 unique tags in treatment: %d" % (treat.total))
113 logging.debug("#1.3 total tags in treatment: %d" % (treat.total_w_duplicates))
115 logging.debug("#1.3 unique tags in control: %d" % (control.total))
116 logging.debug("#1.3 total tags in control: %d" % (control.total_w_duplicates))
117 bg_redundant = bg_r (treat,max_dup_tags)
118 logging.info("#1.3 Background Redundant rate: %.2f" % (bg_redundant))
119 logging.info("#1.3 finished!")
121 logging.info("#2 Build Peak Model...")
122 peakmodel = PeakModel(treatment = treat,
123 gz = options.gsize, # mappable genome size
125 fold = options.mfold,
129 max_dup_tags = max_dup_tags
131 logging.info("#2 finished!")
132 logging.debug("#2 Summary Model:")
133 logging.debug("#2 min_tags: %d" % (peakmodel.min_tags))
134 logging.debug("#2 frag_length: %d" % (peakmodel.frag_length))
135 logging.debug("#2 scan_window: %d" % (peakmodel.scan_window))
136 logging.info("#2.2 Generate R script for model")
137 model2r_script(peakmodel,options.name)
139 logging.info("#3 Call peaks...")
140 peakdetect = PeakDetect(treat = treat,
142 frag_length = peakmodel.frag_length,
143 scan_window = peakmodel.scan_window,
145 gsize = options.gsize
147 peakdetect.call_peaks()
150 ofile_xls = options.name+"_peaks.xls"
151 logging.info("#4.1 Write output xls file... %s" % (ofile_xls))
152 ofhd_xls = open(ofile_xls,"w")
153 ofhd_xls.write("# This file is generated by MACS\n")
154 ofhd_xls.write("# name = %s\n" % (options.name))
155 ofhd_xls.write("# ChIP-seq file = %s\n" % (options.tfile))
156 ofhd_xls.write("# control file = %s\n" % (options.cfile))
157 ofhd_xls.write("# effective genome size = %.2e\n" % (options.gsize))
158 ofhd_xls.write("# tag size = %d\n" % (options.tsize))
159 ofhd_xls.write("# band width = %d\n" % (options.bw))
160 ofhd_xls.write("# model fold = %s\n" % (options.mfold))
161 ofhd_xls.write("# pvalue cutoff = %.2e\n" % (options.pvalue))
162 ofhd_xls.write("# frag_len = %d\n" % (peakdetect.frag_length))
163 ofhd_xls.write(peakdetect.toxls())
166 ofile_bed = options.name+"_peaks.bed"
167 logging.info("#4.2 Write output bed file... %s" % (ofile_bed))
168 ofhd_bed = open(ofile_bed,"w")
169 ofhd_bed.write(peakdetect.tobed())
172 #4.3 negative peaks in XLS
174 ofile_xls = options.name+"_negative_peaks.xls"
175 logging.info("#4.3 Write output xls file for negative peaks... %s" % (ofile_xls))
176 ofhd_xls = open(ofile_xls,"w")
177 ofhd_xls.write(peakdetect.neg_toxls())
180 logging.info("#5 Done! Check the output files!\n")
182 def bg_r (trackI, max_dup_tags):
183 total_tags = trackI.total_w_duplicates
185 chrs = trackI.get_chr_names()
188 countlist = trackI.get_comments_by_chr (chrom)
189 for c in countlist[0]:
190 total_duplicates += max(c-max_dup_tags,0)
191 for c in countlist[1]:
192 total_duplicates += max(c-max_dup_tags,0)
193 return float(total_duplicates)/(total_tags)
196 def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
197 for i in range (1,1000):
198 if binom_cdf_Q(1.0/genome_size,tags_number,i) < p:
200 raise Exception("LAMBDA > 1000!")
202 def load_tag_files ( tfile, cfile, format ):
203 """From the options, load treatment tags and control tags (if available).
206 logging.info("#1.1 read treatment tags...")
207 treat = _merge_then_load(tfile,format)
208 treat.merge_overlap()
210 logging.info("#1.2 read input tags...")
211 control = _merge_then_load(cfile,format)
212 control.merge_overlap()
215 return (treat, control)
217 def _merge_then_load ( p,format ):
218 """Merge several tag files and load it to a single FWTrackI object.
221 #s = os.system("cat %s > macs_tmpfile.t" % p)
223 t = _load_tag_file(p,format)
224 #os.remove("macs_tmpfile.t")
226 # raise Exception("unable to cat files: %s!" % p)
229 def _load_tag_file ( f, format ):
230 """Load tag file into single FWTrackI object.
233 fhd = file(f,mode="r")
234 if format == "ELAND":
236 elif format == "BED":
239 raise Exception("Format \"%s\" cannot be recognized!" % (format))
240 trackI = p.build_fwtrack(fhd)
243 def model2r_script(model,name):
244 rfhd = open(name+"_model.r","w")
247 s = model.shifted_line
256 norm_p[i] = float(p[i])*100/sum_p
257 norm_m[i] = float(m[i])*100/sum_m
258 norm_s[i] = float(s[i])*100/sum_s
259 rfhd.write("# R script for Peak Model\n")
260 rfhd.write("# -- generated by MACS\n")
262 rfhd.write("""p <- c(%s)
265 x <- seq.int((length(p)-1)/2*-1,(length(p)-1)/2)
266 pdf("%s_model.pdf",height=4,width=4)
267 plot(x,p,type="l",col=c("red"),main="Peak Model",xlab="Distance to the middle",ylab="Percentage")
268 lines(x,m,col=c("blue"))
269 lines(x,s,col=c("black"))
270 abline(v=median(x[p==max(p)]),lty=2,col=c("red"))
271 abline(v=median(x[m==max(m)]),lty=2,col=c("blue"))
272 abline(v=median(x[s==max(s)]),lty=2,col=c("black"))
274 """ % (','.join(map(str,norm_p)),','.join(map(str,norm_m)),','.join(map(str,norm_s)),name))
277 if __name__ == '__main__':
280 except KeyboardInterrupt:
281 sys.stderr.write("User interrupt me! ;-) See you!\n")