1 # Time-stamp: <2008-05-28 13:38:06 Tao Liu>
5 Copyright (c) 2008 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the Artistic License (see the file COPYING included
9 with the distribution).
13 @author: Yong Zhang, Tao Liu
14 @contact: taoliu@jimmy.harvard.edu
19 from MACS.gsl.cdf import poisson_cdf_Q
24 """Class to do the peak calling.
27 >>> from MACS.PeakDetect import PeakDetect
28 >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, frag_length=100, scan_window=200, gsize=3000000000)
32 def __init__ (self,treat=None, control=None, pvalue=None, frag_length=None, scan_window=None, gsize = None):
33 """Initialize the PeakDetect object.
36 1. treat : treatment data which is a FWTrackI instance
37 2. control : control data which is a FWTrackI instance
38 3. pvalue : pvalue cutoff, the pvalue is actually -10*log(10) real_pvalue
39 4. frag_length : fragment length predicted by PeakModel process
40 5. scan_window : scan window width predicted by PeakModel process
41 6. gsize : effective genome size
44 self.control = control
45 self.ratio_treat2control = None
47 self.frag_length = frag_length
48 self.shift_size = frag_length/2
49 self.scan_window = scan_window
51 self.gap = self.shift_size
52 self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize
53 logging.debug("#3 background lambda: %.2f " % (self.lambda_bg))
54 self.min_tags = self.inverse_poisson_cdf(self.lambda_bg,self.pvalue)+1
55 logging.debug("#3 min tags: %d" % (self.min_tags))
57 self.final_peaks = None
58 self.final_negative_peaks = None
60 def inverse_poisson_cdf (self, lam, q ):
61 """Very Naive inverse CDF (PMF) for poisson distribution.
63 The maximum return value is 1000, if it is over 1000, return
67 for i in range(1,1000):
68 if poisson_cdf_Q(lam,i) < p:
72 def call_peaks (self):
73 """Call peaks function.
75 Scan the whole genome for peaks. RESULTS WILL BE SAVED IN
76 self.final_peaks and self.final_negative_peaks.
79 self.peaks = self.__call_peaks_w_control ()
81 self.peaks = self.__call_peaks_wo_control ()
87 if self.control and self.peaks:
88 text += "\t".join(("chr","start", "end", "length", "summit", "tags", "-10*log10(pvalue)", "fold_change", "FDR(%)"))+"\n"
90 text += "\t".join(("chr","start", "end", "length", "summit", "tags", "-10*log10(pvalue)", "fold_change"))+"\n"
94 chrs = self.peaks.keys()
97 for peak in self.peaks[chrom]:
98 text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2])
99 peak_summit_relative_pos = peak[3]-peak[0]+1
100 text += "\t%d" % (peak_summit_relative_pos)
101 text += "\t%d\t%.2f" % (peak[5],peak[6])
102 text += "\t%.2f" % (peak[7])
104 text += "\t%.2f" % (peak[8])
108 def neg_toxls (self):
110 text += "\t".join(("chr","start", "end", "length", "summit", "tags", "-10*log10(pvalue)","fold_change"))+"\n"
111 chrs = self.final_negative_peaks.keys()
114 for peak in self.final_negative_peaks[chrom]:
115 text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2])
116 peak_summit_relative_pos = peak[3]-peak[0]+1
117 text += "\t%d" % (peak_summit_relative_pos)
118 text += "\t%d\t%.2f" % (peak[5],peak[6])
119 text += "\t%.2f" % (peak[7])
125 chrs = self.peaks.keys()
128 for peak in self.peaks[chrom]:
129 text+= "%s\t%d\t%d\n" % (chrom,peak[0],peak[1])
132 def __add_fdr (self, final, negative):
134 A peak info type is a: dictionary
136 key value: chromosome
138 items: (peak start,peak end, peak length, peak summit, peak
139 height, number of tags in peak region, peak pvalue, peak
140 foldchange, fdr) <-- tuple type
144 pvalues_negative = []
147 for i in final[chrom]:
148 pvalues_final.append(i[6]) # i[6] is pvalue in peak info
149 pvalue2fdr[i[6]]=None
150 chrs = negative.keys()
152 for i in negative[chrom]:
153 pvalues_negative.append(i[6])
154 pvalues_final.sort(reverse=True)
155 pvalues_negative.sort(reverse=True)
156 for p in pvalue2fdr.keys():
157 n_final = pvalues_final.index(p) + pvalues_final.count(p)
159 for i in range(len(pvalues_negative)):
160 if pvalues_negative[i] >= p:
164 pvalue2fdr[p] = 100.0 * n_negative / n_final
170 for i in final[chrom]:
172 tmp.append(pvalue2fdr[i[6]])
173 new_info[chrom].append(tuple(tmp)) # i[6] is pvalue in peak info
176 def __call_peaks_w_control (self):
177 """To call peaks with control data.
179 A peak info type is a: dictionary
181 key value: chromosome
183 items: (peak start,peak end, peak length, peak summit, peak
184 height, number of tags in peak region, peak pvalue, peak
185 foldchange) <-- tuple type
187 self.ratio_treat2control = float(self.treat.total)/self.control.total
188 logging.info("#3.1 shift treatment data")
189 self.__shift_trackI(self.treat)
190 logging.info("#3.1 merge +/- strand of treatment data")
191 self.treat.merge_plus_minus_ranges_w_duplicates ()
192 logging.debug("#3.1 after shift and merging, tags: %d" % (self.treat.total))
193 logging.info("#3.1 call peak candidates")
194 peak_candidates = self.__call_peaks_from_trackI (self.treat)
196 logging.info("#3.2 shift control data")
197 logging.info("#3.2 merge +/- strand of control data")
198 self.__shift_trackI(self.control)
199 self.control.merge_plus_minus_ranges_w_duplicates ()
200 logging.debug("#3.2 after shift and merging, tags: %d" % (self.control.total))
201 logging.info("#3.2 call negative peak candidates")
202 negative_peak_candidates = self.__call_peaks_from_trackI (self.control)
205 logging.info("#3.3 use control data to filter peak candidates...")
206 self.final_peaks = self.__filter_w_control(peak_candidates,self.control, self.ratio_treat2control)
207 logging.info("#3.3 find negative peaks by reversing treat and control")
208 self.final_negative_peaks = self.__filter_w_control(negative_peak_candidates,self.treat, 1.0/self.ratio_treat2control)
209 return self.__add_fdr (self.final_peaks, self.final_negative_peaks)
211 def __cal_wig_for_peaks (self):
212 loose_peak_regions = self.__call_peaks_from_trackI (self.treat,pvalue=1e-3)
216 def __call_peaks_wo_control (self):
217 """To call peaks w/o control data.
220 logging.info("#3 shift treatment data")
221 self.__shift_trackI(self.treat)
222 logging.info("#3 merge +/- strand of treatment data")
223 self.treat.merge_plus_minus_ranges_w_duplicates ()
224 logging.debug("#3 after shift and merging, tags: %d" % (self.treat.total))
225 logging.info("#3 call peak candidates")
226 peak_candidates = self.__call_peaks_from_trackI (self.treat)
227 logging.info("#3 use self to calculate local lambda and filter peak candidates...")
228 self.final_peaks = self.__filter_w_control(peak_candidates,self.treat,1,pass_1k=True)
229 return self.final_peaks
231 def __print_peak_info (self, peak_info):
232 """Print out peak information.
234 A peak info type is a: dictionary
236 key value: chromosome
238 items: (peak start,peak end, peak length, peak summit, peak
239 height, number of tags in peak region, peak pvalue, peak
240 foldchange) <-- tuple type
242 chrs = peak_info.keys()
245 peak_list = peak_info[chrom]
246 for peak in peak_list:
247 print chrom+"\t"+"\t".join(map(str,peak))
249 def __filter_w_control (self, peak_info, control, treat2control_ratio, pass_1k=False, write2wig= False ):
250 """Use control data to calculate several lambda values around
251 1k, 5k and 10k region around peak summit. Choose the highest
252 one as local lambda, then calculate p-value in poisson
255 Return value type in this format:
257 key value : chromosome
258 items : array of (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_foldchange)
261 chrs = peak_info.keys()
265 logging.debug("#3 Chromosome %s" % (chrom))
267 final_peak_info[chrom] = []
268 peak_list = peak_info[chrom]
269 (ctags,tmp) = control.get_ranges_by_chr(chrom)
271 flag_find_ctag_in_peak = False
273 len_ctags =len(ctags)
274 for i in range(len(peak_list)):
275 (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags) = peak_list[i]
276 left_10k = peak_summit-5000
277 left_5k = peak_summit-2500
278 left_1k = peak_summit-500
279 right_10k = peak_summit+5000
280 right_5k = peak_summit+2500
281 right_1k = peak_summit+500
285 while index_ctag < len_ctags:
286 if ctags[index_ctag] < left_10k:
287 # go to next control tag
289 elif right_10k < ctags[index_ctag]:
290 # finalize and go to next peak region
291 flag_find_ctag_in_peak = False
292 index_ctag = prev_index_ctag
295 # in -10k ~ 10k region
296 if not flag_find_ctag_in_peak:
297 flag_find_ctag_in_peak = True
298 prev_index_ctag = index_ctag
300 if left_1k <ctags[index_ctag]< right_1k:
303 elif left_5k <ctags[index_ctag]< right_5k:
306 window_size_4_lambda = max(peak_length,self.scan_window)
307 lambda_10k = float(num_10k)/10000*treat2control_ratio*window_size_4_lambda
308 lambda_5k = float(num_5k)/5000*treat2control_ratio*window_size_4_lambda
309 lambda_1k = float(num_1k)/1000*treat2control_ratio*window_size_4_lambda
310 lambda_bg = self.lambda_bg/self.scan_window*window_size_4_lambda
312 local_lambda = max(lambda_bg,lambda_10k,lambda_5k)
314 local_lambda = max(lambda_bg,lambda_10k,lambda_5k,lambda_1k)
315 # logging.debug("lambda report:")
316 # if local_lambda == lambda_1k:
317 # logging.debug("lambda_1k %f is used" % lambda_1k)
318 # if local_lambda == lambda_5k:
319 # logging.debug("lambda_5k %f is used" % lambda_5k)
320 # if local_lambda == lambda_10k:
321 # logging.debug("lambda_10k %f is used" % lambda_10k)
322 # if local_lambda == lambda_bg:
323 # logging.debug("lambda_bg %f is used" % lambda_bg)
325 # logging.debug("\t".join(map(str,(lambda_bg,lambda_10k,lambda_5k,lambda_1k))))
327 p_tmp = poisson_cdf_Q(local_lambda,peak_num_tags)
331 peak_pvalue = math.log(p_tmp,10) * -10
332 if peak_pvalue > self.pvalue:
335 peak_foldchange = float(peak_num_tags)/local_lambda
336 final_peak_info[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_foldchange))
338 # logging.debug("lambda rules")
340 logging.debug("#3 peaks whose pvalue < cutoff: %d" % (n_chrom))
341 logging.info("#3 Finally, %d peaks are called!" % (total))
342 return final_peak_info
344 def __call_peaks_from_trackI (self, trackI):
345 """Call peak candidates from trackI data. Using every tag as
346 step and scan the self.scan_window region around the tag. If
347 tag number is greater than self.min_tags, then the position is
350 Return: data in this format. (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags)
353 logging.debug("#3 search peak condidates...")
354 chrs = trackI.get_chr_names()
358 logging.debug("#3 Chromosome %s" % (chrom))
360 peak_candidates[chrom] = []
361 (tags,tmp) = trackI.get_ranges_by_chr(chrom)
363 cpr_tags = [] # Candidate Peak Region tags
364 cpr_tags.extend(tags[:self.min_tags-1])
365 number_cpr_tags = self.min_tags-1
366 p = self.min_tags-1 # Next Tag Index
368 if number_cpr_tags >= self.min_tags:
369 if tags[p] - cpr_tags[-1*self.min_tags+1] <= self.scan_window:
370 # add next tag, if the new tag is less than self.scan_window away from previous no. self.min_tags tag
371 cpr_tags.append(tags[p])
375 # candidate peak region is ready, call peak...
376 (peak_start,peak_end,peak_length,peak_summit,peak_height) = self.__tags_call_peak (cpr_tags)
377 peak_candidates[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags))
378 cpr_tags = [tags[p]] # reset
384 # add next tag, but if the first one in cpr_tags
385 # is more than self.scan_window away from this new
386 # tag, remove the first one from the list
387 if tags[p] - cpr_tags[0] >= self.scan_window:
390 cpr_tags.append(tags[p])
393 logging.debug("#3 peak candidates: %d" % (n_chrom))
394 logging.debug("#3 Total number of candidates: %d" % (total))
395 return peak_candidates
397 def __tags_call_peak (self, tags ):
398 """Project tags to a line. Then find the highest point.
401 start = tags[0]-self.frag_length/2
402 end = tags[-1]+self.frag_length/2+1
403 region_length = end - start
404 line= [0]*region_length
406 tag_projected_start = tag-start-self.frag_length/2
407 tag_projected_end = tag-start+self.frag_length/2
408 for i in range(tag_projected_start,tag_projected_end):
412 for i in range(len(line)):
413 if line[i] > top_height:
416 elif line[i] == top_height:
418 peak_summit = tops[len(tops)/2]+start
419 return (start,end,region_length,peak_summit,top_height)
422 def __shift_trackI (self, trackI):
423 """Shift trackI data to right (for plus strand) or left (for
427 chrs = trackI.get_chr_names()
430 tags = trackI.get_ranges_by_chr(chrom)
432 for i in range(len(tags[0])):
433 tags[0][i]=tags[0][i]+self.shift_size
435 for i in range(len(tags[1])):
436 tags[1][i]=tags[1][i]-self.shift_size