Added script front-end for primer-design code
[htsworkflow.git] / htswanalysis / MACS / lib / PeakDetect.py
1 # Time-stamp: <2008-05-28 13:38:06 Tao Liu>
2
3 """Module Description
4
5 Copyright (c) 2008 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
6
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the Artistic License (see the file COPYING included
9 with the distribution).
10
11 @status:  experimental
12 @version: $Revision$
13 @author:  Yong Zhang, Tao Liu
14 @contact: taoliu@jimmy.harvard.edu
15 """
16
17 import logging
18 import math
19 from MACS.gsl.cdf import poisson_cdf_Q
20
21 SAVE_WIGGLE = False
22
23 class PeakDetect:
24     """Class to do the peak calling.
25
26     e.g:
27     >>> from MACS.PeakDetect import PeakDetect
28     >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, frag_length=100, scan_window=200, gsize=3000000000)
29     >>> pd.call_peaks()
30     >>> print pd.toxls()
31     """
32     def __init__ (self,treat=None, control=None, pvalue=None, frag_length=None, scan_window=None, gsize = None):
33         """Initialize the PeakDetect object.
34
35         Parameters:
36         1. treat   : treatment data which is a FWTrackI instance
37         2. control : control data which is a FWTrackI instance
38         3. pvalue  : pvalue cutoff, the pvalue is actually -10*log(10) real_pvalue
39         4. frag_length : fragment length predicted by PeakModel process
40         5. scan_window : scan window width predicted by PeakModel process
41         6. gsize       : effective genome size
42         """
43         self.treat = treat
44         self.control = control
45         self.ratio_treat2control = None
46         self.pvalue = pvalue
47         self.frag_length = frag_length
48         self.shift_size = frag_length/2
49         self.scan_window = scan_window
50         self.gsize = gsize
51         self.gap = self.shift_size
52         self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize
53         logging.debug("#3 background lambda: %.2f " % (self.lambda_bg))
54         self.min_tags = self.inverse_poisson_cdf(self.lambda_bg,self.pvalue)+1
55         logging.debug("#3 min tags: %d" % (self.min_tags))
56         self.peaks = None
57         self.final_peaks = None
58         self.final_negative_peaks = None
59
60     def inverse_poisson_cdf (self, lam, q ):
61         """Very Naive inverse CDF (PMF) for poisson distribution.
62
63         The maximum return value is 1000, if it is over 1000, return
64         None.
65         """
66         p = 10**(q/10*-1)
67         for i in range(1,1000):
68             if poisson_cdf_Q(lam,i) < p:
69                 return i
70         return None
71
72     def call_peaks (self):
73         """Call peaks function.
74
75         Scan the whole genome for peaks. RESULTS WILL BE SAVED IN
76         self.final_peaks and self.final_negative_peaks.
77         """
78         if self.control:
79             self.peaks = self.__call_peaks_w_control ()
80         else:
81             self.peaks = self.__call_peaks_wo_control ()
82
83     def toxls (self):
84         """
85         """
86         text = ""
87         if self.control and self.peaks:
88             text += "\t".join(("chr","start", "end",  "length",  "summit", "tags", "-10*log10(pvalue)", "fold_change", "FDR(%)"))+"\n"
89         elif self.peaks:
90             text += "\t".join(("chr","start", "end",  "length",  "summit", "tags", "-10*log10(pvalue)", "fold_change"))+"\n"
91         else:
92             return None
93         
94         chrs = self.peaks.keys()
95         chrs.sort()
96         for chrom in chrs:
97             for peak in self.peaks[chrom]:
98                 text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2])
99                 peak_summit_relative_pos = peak[3]-peak[0]+1
100                 text += "\t%d" % (peak_summit_relative_pos)
101                 text += "\t%d\t%.2f" % (peak[5],peak[6])
102                 text += "\t%.2f" % (peak[7])
103                 if self.control:
104                     text += "\t%.2f" % (peak[8])
105                 text+= "\n"
106         return text
107
108     def neg_toxls (self):
109         text = ""
110         text += "\t".join(("chr","start", "end",  "length",  "summit", "tags", "-10*log10(pvalue)","fold_change"))+"\n"
111         chrs = self.final_negative_peaks.keys()
112         chrs.sort()
113         for chrom in chrs:
114             for peak in self.final_negative_peaks[chrom]:
115                 text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2])
116                 peak_summit_relative_pos = peak[3]-peak[0]+1
117                 text += "\t%d" % (peak_summit_relative_pos)
118                 text += "\t%d\t%.2f" % (peak[5],peak[6])
119                 text += "\t%.2f" % (peak[7])
120                 text+= "\n"
121         return text
122
123     def tobed (self):
124         text = ""
125         chrs = self.peaks.keys()
126         chrs.sort()
127         for chrom in chrs:
128             for peak in self.peaks[chrom]:
129                 text+= "%s\t%d\t%d\n" % (chrom,peak[0],peak[1])
130         return text
131
132     def __add_fdr (self, final, negative): 
133         """
134         A peak info type is a: dictionary
135
136         key value: chromosome
137
138         items: (peak start,peak end, peak length, peak summit, peak
139         height, number of tags in peak region, peak pvalue, peak
140         foldchange, fdr) <-- tuple type
141         """
142         pvalue2fdr = {}
143         pvalues_final = []
144         pvalues_negative = []
145         chrs = final.keys()
146         for chrom in chrs:
147             for i in final[chrom]:
148                 pvalues_final.append(i[6]) # i[6] is pvalue in peak info
149                 pvalue2fdr[i[6]]=None
150         chrs = negative.keys()
151         for chrom in chrs:
152             for i in negative[chrom]:
153                 pvalues_negative.append(i[6])
154         pvalues_final.sort(reverse=True)
155         pvalues_negative.sort(reverse=True)
156         for p in pvalue2fdr.keys():
157             n_final = pvalues_final.index(p) + pvalues_final.count(p)
158             n_negative = 0
159             for i in range(len(pvalues_negative)):
160                 if pvalues_negative[i] >= p:
161                     n_negative += 1
162                 else:
163                     break
164             pvalue2fdr[p] = 100.0 * n_negative / n_final
165         #print pvalue2fdr
166         new_info = {}
167         chrs = final.keys()
168         for chrom in chrs:
169             new_info[chrom] = []
170             for i in final[chrom]:
171                 tmp = list(i)
172                 tmp.append(pvalue2fdr[i[6]])
173                 new_info[chrom].append(tuple(tmp))      # i[6] is pvalue in peak info
174         return new_info
175
176     def __call_peaks_w_control (self):
177         """To call peaks with control data.
178
179         A peak info type is a: dictionary
180
181         key value: chromosome
182
183         items: (peak start,peak end, peak length, peak summit, peak
184         height, number of tags in peak region, peak pvalue, peak
185         foldchange) <-- tuple type
186         """
187         self.ratio_treat2control = float(self.treat.total)/self.control.total
188         logging.info("#3.1 shift treatment data")
189         self.__shift_trackI(self.treat)
190         logging.info("#3.1 merge +/- strand of treatment data")
191         self.treat.merge_plus_minus_ranges_w_duplicates ()
192         logging.debug("#3.1 after shift and merging, tags: %d" % (self.treat.total))
193         logging.info("#3.1 call peak candidates")
194         peak_candidates = self.__call_peaks_from_trackI (self.treat)
195
196         logging.info("#3.2 shift control data")
197         logging.info("#3.2 merge +/- strand of control data")
198         self.__shift_trackI(self.control)
199         self.control.merge_plus_minus_ranges_w_duplicates ()
200         logging.debug("#3.2 after shift and merging, tags: %d" % (self.control.total))
201         logging.info("#3.2 call negative peak candidates")
202         negative_peak_candidates = self.__call_peaks_from_trackI (self.control)
203         
204         
205         logging.info("#3.3 use control data to filter peak candidates...")
206         self.final_peaks = self.__filter_w_control(peak_candidates,self.control, self.ratio_treat2control)
207         logging.info("#3.3 find negative peaks by reversing treat and control")
208         self.final_negative_peaks = self.__filter_w_control(negative_peak_candidates,self.treat, 1.0/self.ratio_treat2control)
209         return self.__add_fdr (self.final_peaks, self.final_negative_peaks)
210
211     def __cal_wig_for_peaks (self):
212         loose_peak_regions = self.__call_peaks_from_trackI (self.treat,pvalue=1e-3)
213         
214
215
216     def __call_peaks_wo_control (self):
217         """To call peaks w/o control data.
218
219         """
220         logging.info("#3 shift treatment data")
221         self.__shift_trackI(self.treat)
222         logging.info("#3 merge +/- strand of treatment data")
223         self.treat.merge_plus_minus_ranges_w_duplicates ()
224         logging.debug("#3 after shift and merging, tags: %d" % (self.treat.total))
225         logging.info("#3 call peak candidates")
226         peak_candidates = self.__call_peaks_from_trackI (self.treat)
227         logging.info("#3 use self to calculate local lambda and  filter peak candidates...")
228         self.final_peaks = self.__filter_w_control(peak_candidates,self.treat,1,pass_1k=True)
229         return self.final_peaks
230
231     def __print_peak_info (self, peak_info):
232         """Print out peak information.
233
234         A peak info type is a: dictionary
235
236         key value: chromosome
237
238         items: (peak start,peak end, peak length, peak summit, peak
239         height, number of tags in peak region, peak pvalue, peak
240         foldchange) <-- tuple type
241         """
242         chrs = peak_info.keys()
243         chrs.sort()
244         for chrom in chrs:
245             peak_list = peak_info[chrom]
246             for peak in peak_list:
247                 print chrom+"\t"+"\t".join(map(str,peak))
248
249     def __filter_w_control (self, peak_info, control, treat2control_ratio, pass_1k=False, write2wig= False ):
250         """Use control data to calculate several lambda values around
251         1k, 5k and 10k region around peak summit. Choose the highest
252         one as local lambda, then calculate p-value in poisson
253         distribution.
254
255         Return value type in this format:
256         a dictionary
257         key value : chromosome
258         items : array of (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_foldchange)
259         """
260         final_peak_info = {}
261         chrs = peak_info.keys()
262         chrs.sort()
263         total = 0
264         for chrom in chrs:
265             logging.debug("#3 Chromosome %s" % (chrom))
266             n_chrom = 0
267             final_peak_info[chrom] = []
268             peak_list = peak_info[chrom]
269             (ctags,tmp) = control.get_ranges_by_chr(chrom)
270             index_ctag = 0
271             flag_find_ctag_in_peak = False
272             prev_index_ctag = 0
273             len_ctags =len(ctags)
274             for i in range(len(peak_list)):
275                 (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags) = peak_list[i]
276                 left_10k = peak_summit-5000
277                 left_5k = peak_summit-2500
278                 left_1k = peak_summit-500
279                 right_10k = peak_summit+5000
280                 right_5k = peak_summit+2500
281                 right_1k = peak_summit+500
282                 num_10k = 0
283                 num_5k = 0
284                 num_1k = 0
285                 while index_ctag < len_ctags:
286                     if ctags[index_ctag] < left_10k:
287                         # go to next control tag
288                         index_ctag+=1
289                     elif right_10k < ctags[index_ctag]:
290                         # finalize and go to next peak region
291                         flag_find_ctag_in_peak = False
292                         index_ctag = prev_index_ctag 
293                         break
294                     else:
295                         # in -10k ~ 10k region
296                         if not flag_find_ctag_in_peak:
297                             flag_find_ctag_in_peak = True
298                             prev_index_ctag = index_ctag
299                         num_10k +=1
300                         if left_1k <ctags[index_ctag]< right_1k:
301                             num_1k+=1
302                             num_5k+=1
303                         elif left_5k <ctags[index_ctag]< right_5k:
304                             num_5k+=1
305                         index_ctag += 1
306                 window_size_4_lambda = max(peak_length,self.scan_window)
307                 lambda_10k = float(num_10k)/10000*treat2control_ratio*window_size_4_lambda
308                 lambda_5k = float(num_5k)/5000*treat2control_ratio*window_size_4_lambda
309                 lambda_1k = float(num_1k)/1000*treat2control_ratio*window_size_4_lambda
310                 lambda_bg = self.lambda_bg/self.scan_window*window_size_4_lambda
311                 if pass_1k:
312                     local_lambda = max(lambda_bg,lambda_10k,lambda_5k)
313                 else:
314                     local_lambda = max(lambda_bg,lambda_10k,lambda_5k,lambda_1k)
315 #                 logging.debug("lambda report:")
316 #                 if local_lambda == lambda_1k:
317 #                     logging.debug("lambda_1k %f is used" % lambda_1k)
318 #                 if local_lambda == lambda_5k:
319 #                     logging.debug("lambda_5k %f is used" % lambda_5k)
320 #                 if local_lambda == lambda_10k:
321 #                     logging.debug("lambda_10k %f is used" % lambda_10k)
322 #                 if local_lambda == lambda_bg:
323 #                     logging.debug("lambda_bg %f is used" % lambda_bg)
324 #                 else:
325 #                     logging.debug("\t".join(map(str,(lambda_bg,lambda_10k,lambda_5k,lambda_1k))))
326
327                 p_tmp = poisson_cdf_Q(local_lambda,peak_num_tags)
328                 if p_tmp <= 0:
329                     peak_pvalue = 3100
330                 else:
331                     peak_pvalue = math.log(p_tmp,10) * -10
332                 if peak_pvalue > self.pvalue:
333                     n_chrom += 1
334                     total += 1
335                     peak_foldchange = float(peak_num_tags)/local_lambda
336                     final_peak_info[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_foldchange))
337 #                 else:
338 #                     logging.debug("lambda rules")
339
340             logging.debug("#3 peaks whose pvalue < cutoff: %d" % (n_chrom))
341         logging.info("#3 Finally, %d peaks are called!" % (total))
342         return final_peak_info
343
344     def __call_peaks_from_trackI (self, trackI):
345         """Call peak candidates from trackI data. Using every tag as
346         step and scan the self.scan_window region around the tag. If
347         tag number is greater than self.min_tags, then the position is
348         recorded.
349
350         Return: data in this format. (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags)
351         """
352         peak_candidates = {}
353         logging.debug("#3 search peak condidates...")
354         chrs = trackI.get_chr_names()
355         chrs.sort()
356         total = 0
357         for chrom in chrs:
358             logging.debug("#3 Chromosome %s" % (chrom))
359             n_chrom = 0
360             peak_candidates[chrom] = []
361             (tags,tmp) = trackI.get_ranges_by_chr(chrom)
362             len_t = len(tags)
363             cpr_tags = []       # Candidate Peak Region tags
364             cpr_tags.extend(tags[:self.min_tags-1])
365             number_cpr_tags = self.min_tags-1
366             p = self.min_tags-1 # Next Tag Index
367             while p < len_t:
368                 if number_cpr_tags >= self.min_tags:
369                     if tags[p] - cpr_tags[-1*self.min_tags+1] <= self.scan_window:
370                         # add next tag, if the new tag is less than self.scan_window away from previous no. self.min_tags tag
371                         cpr_tags.append(tags[p])
372                         number_cpr_tags += 1
373                         p+=1
374                     else:
375                         # candidate peak region is ready, call peak...
376                         (peak_start,peak_end,peak_length,peak_summit,peak_height) = self.__tags_call_peak (cpr_tags)
377                         peak_candidates[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags))
378                         cpr_tags = [tags[p]] # reset
379
380                         number_cpr_tags = 1
381                         total += 1
382                         n_chrom += 1
383                 else:
384                     # add next tag, but if the first one in cpr_tags
385                     # is more than self.scan_window away from this new
386                     # tag, remove the first one from the list
387                     if tags[p] - cpr_tags[0] >= self.scan_window:
388                         cpr_tags.pop(0)
389                         number_cpr_tags -= 1
390                     cpr_tags.append(tags[p])
391                     number_cpr_tags += 1
392                     p+=1
393             logging.debug("#3 peak candidates: %d" % (n_chrom))
394         logging.debug("#3 Total number of candidates: %d" % (total))
395         return peak_candidates
396                 
397     def __tags_call_peak (self, tags ):
398         """Project tags to a line. Then find the highest point.
399
400         """
401         start = tags[0]-self.frag_length/2
402         end = tags[-1]+self.frag_length/2+1
403         region_length = end - start
404         line= [0]*region_length
405         for tag in tags:
406             tag_projected_start = tag-start-self.frag_length/2
407             tag_projected_end = tag-start+self.frag_length/2
408             for i in range(tag_projected_start,tag_projected_end):
409                 line[i] += 1
410         tops = []
411         top_height = 0
412         for i in range(len(line)):
413             if line[i] > top_height:
414                 top_height = line[i]
415                 tops = [i]
416             elif line[i] == top_height:
417                 tops.append(i)
418         peak_summit = tops[len(tops)/2]+start
419         return (start,end,region_length,peak_summit,top_height)
420             
421
422     def __shift_trackI (self, trackI):
423         """Shift trackI data to right (for plus strand) or left (for
424         minus strand).
425
426         """
427         chrs = trackI.get_chr_names()
428         chrs.sort()
429         for chrom in chrs:
430             tags = trackI.get_ranges_by_chr(chrom)
431             # plus
432             for i in range(len(tags[0])):
433                 tags[0][i]=tags[0][i]+self.shift_size
434             # minus
435             for i in range(len(tags[1])):
436                 tags[1][i]=tags[1][i]-self.shift_size
437         return
438