htswanalysis/MACS/lib/PeakDetect.py

   1 # Time-stamp: <2008-05-28 13:38:06 Tao Liu>
   2
   3 """Module Description
   4
   5 Copyright (c) 2008 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
   6
   7 This code is free software; you can redistribute it and/or modify it
   8 under the terms of the Artistic License (see the file COPYING included
   9 with the distribution).
  10
  11 @status:  experimental
  12 @version: $Revision$
  13 @author:  Yong Zhang, Tao Liu
  14 @contact: taoliu@jimmy.harvard.edu
  15 """
  16
  17 import logging
  18 import math
  19 from MACS.gsl.cdf import poisson_cdf_Q
  20
  21 SAVE_WIGGLE = False
  22
  23 class PeakDetect:
  24     """Class to do the peak calling.
  25
  26     e.g:
  27     >>> from MACS.PeakDetect import PeakDetect
  28     >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, frag_length=100, scan_window=200, gsize=3000000000)
  29     >>> pd.call_peaks()
  30     >>> print pd.toxls()
  31     """
  32     def __init__ (self,treat=None, control=None, pvalue=None, frag_length=None, scan_window=None, gsize = None):
  33         """Initialize the PeakDetect object.
  34
  35         Parameters:
  36         1. treat   : treatment data which is a FWTrackI instance
  37         2. control : control data which is a FWTrackI instance
  38         3. pvalue  : pvalue cutoff, the pvalue is actually -10*log(10) real_pvalue
  39         4. frag_length : fragment length predicted by PeakModel process
  40         5. scan_window : scan window width predicted by PeakModel process
  41         6. gsize       : effective genome size
  42         """
  43         self.treat = treat
  44         self.control = control
  45         self.ratio_treat2control = None
  46         self.pvalue = pvalue
  47         self.frag_length = frag_length
  48         self.shift_size = frag_length/2
  49         self.scan_window = scan_window
  50         self.gsize = gsize
  51         self.gap = self.shift_size
  52         self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize
  53         logging.debug("#3 background lambda: %.2f " % (self.lambda_bg))
  54         self.min_tags = self.inverse_poisson_cdf(self.lambda_bg,self.pvalue)+1
  55         logging.debug("#3 min tags: %d" % (self.min_tags))
  56         self.peaks = None
  57         self.final_peaks = None
  58         self.final_negative_peaks = None
  59
  60     def inverse_poisson_cdf (self, lam, q ):
  61         """Very Naive inverse CDF (PMF) for poisson distribution.
  62
  63         The maximum return value is 1000, if it is over 1000, return
  64         None.
  65         """
  66         p = 10**(q/10*-1)
  67         for i in range(1,1000):
  68             if poisson_cdf_Q(lam,i) < p:
  69                 return i
  70         return None
  71
  72     def call_peaks (self):
  73         """Call peaks function.
  74
  75         Scan the whole genome for peaks. RESULTS WILL BE SAVED IN
  76         self.final_peaks and self.final_negative_peaks.
  77         """
  78         if self.control:
  79             self.peaks = self.__call_peaks_w_control ()
  80         else:
  81             self.peaks = self.__call_peaks_wo_control ()
  82
  83     def toxls (self):
  84         """
  85         """
  86         text = ""
  87         if self.control and self.peaks:
  88             text += "\t".join(("chr","start", "end",  "length",  "summit", "tags", "-10*log10(pvalue)", "fold_change", "FDR(%)"))+"\n"
  89         elif self.peaks:
  90             text += "\t".join(("chr","start", "end",  "length",  "summit", "tags", "-10*log10(pvalue)", "fold_change"))+"\n"
  91         else:
  92             return None
  93
  94         chrs = self.peaks.keys()
  95         chrs.sort()
  96         for chrom in chrs:
  97             for peak in self.peaks[chrom]:
  98                 text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2])
  99                 peak_summit_relative_pos = peak[3]-peak[0]+1
 100                 text += "\t%d" % (peak_summit_relative_pos)
 101                 text += "\t%d\t%.2f" % (peak[5],peak[6])
 102                 text += "\t%.2f" % (peak[7])
 103                 if self.control:
 104                     text += "\t%.2f" % (peak[8])
 105                 text+= "\n"
 106         return text
 107
 108     def neg_toxls (self):
 109         text = ""
 110         text += "\t".join(("chr","start", "end",  "length",  "summit", "tags", "-10*log10(pvalue)","fold_change"))+"\n"
 111         chrs = self.final_negative_peaks.keys()
 112         chrs.sort()
 113         for chrom in chrs:
 114             for peak in self.final_negative_peaks[chrom]:
 115                 text += "%s\t%d\t%d\t%d" % (chrom,peak[0]+1,peak[1],peak[2])
 116                 peak_summit_relative_pos = peak[3]-peak[0]+1
 117                 text += "\t%d" % (peak_summit_relative_pos)
 118                 text += "\t%d\t%.2f" % (peak[5],peak[6])
 119                 text += "\t%.2f" % (peak[7])
 120                 text+= "\n"
 121         return text
 122
 123     def tobed (self):
 124         text = ""
 125         chrs = self.peaks.keys()
 126         chrs.sort()
 127         for chrom in chrs:
 128             for peak in self.peaks[chrom]:
 129                 text+= "%s\t%d\t%d\n" % (chrom,peak[0],peak[1])
 130         return text
 131
 132     def __add_fdr (self, final, negative):
 133         """
 134         A peak info type is a: dictionary
 135
 136         key value: chromosome
 137
 138         items: (peak start,peak end, peak length, peak summit, peak
 139         height, number of tags in peak region, peak pvalue, peak
 140         foldchange, fdr) <-- tuple type
 141         """
 142         pvalue2fdr = {}
 143         pvalues_final = []
 144         pvalues_negative = []
 145         chrs = final.keys()
 146         for chrom in chrs:
 147             for i in final[chrom]:
 148                 pvalues_final.append(i[6]) # i[6] is pvalue in peak info
 149                 pvalue2fdr[i[6]]=None
 150         chrs = negative.keys()
 151         for chrom in chrs:
 152             for i in negative[chrom]:
 153                 pvalues_negative.append(i[6])
 154         pvalues_final.sort(reverse=True)
 155         pvalues_negative.sort(reverse=True)
 156         for p in pvalue2fdr.keys():
 157             n_final = pvalues_final.index(p) + pvalues_final.count(p)
 158             n_negative = 0
 159             for i in range(len(pvalues_negative)):
 160                 if pvalues_negative[i] >= p:
 161                     n_negative += 1
 162                 else:
 163                     break
 164             pvalue2fdr[p] = 100.0 * n_negative / n_final
 165         #print pvalue2fdr
 166         new_info = {}
 167         chrs = final.keys()
 168         for chrom in chrs:
 169             new_info[chrom] = []
 170             for i in final[chrom]:
 171                 tmp = list(i)
 172                 tmp.append(pvalue2fdr[i[6]])
 173                 new_info[chrom].append(tuple(tmp))      # i[6] is pvalue in peak info
 174         return new_info
 175
 176     def __call_peaks_w_control (self):
 177         """To call peaks with control data.
 178
 179         A peak info type is a: dictionary
 180
 181         key value: chromosome
 182
 183         items: (peak start,peak end, peak length, peak summit, peak
 184         height, number of tags in peak region, peak pvalue, peak
 185         foldchange) <-- tuple type
 186         """
 187         self.ratio_treat2control = float(self.treat.total)/self.control.total
 188         logging.info("#3.1 shift treatment data")
 189         self.__shift_trackI(self.treat)
 190         logging.info("#3.1 merge +/- strand of treatment data")
 191         self.treat.merge_plus_minus_ranges_w_duplicates ()
 192         logging.debug("#3.1 after shift and merging, tags: %d" % (self.treat.total))
 193         logging.info("#3.1 call peak candidates")
 194         peak_candidates = self.__call_peaks_from_trackI (self.treat)
 195
 196         logging.info("#3.2 shift control data")
 197         logging.info("#3.2 merge +/- strand of control data")
 198         self.__shift_trackI(self.control)
 199         self.control.merge_plus_minus_ranges_w_duplicates ()
 200         logging.debug("#3.2 after shift and merging, tags: %d" % (self.control.total))
 201         logging.info("#3.2 call negative peak candidates")
 202         negative_peak_candidates = self.__call_peaks_from_trackI (self.control)
 203
 204
 205         logging.info("#3.3 use control data to filter peak candidates...")
 206         self.final_peaks = self.__filter_w_control(peak_candidates,self.control, self.ratio_treat2control)
 207         logging.info("#3.3 find negative peaks by reversing treat and control")
 208         self.final_negative_peaks = self.__filter_w_control(negative_peak_candidates,self.treat, 1.0/self.ratio_treat2control)
 209         return self.__add_fdr (self.final_peaks, self.final_negative_peaks)
 210
 211     def __cal_wig_for_peaks (self):
 212         loose_peak_regions = self.__call_peaks_from_trackI (self.treat,pvalue=1e-3)
 213
 214
 215
 216     def __call_peaks_wo_control (self):
 217         """To call peaks w/o control data.
 218
 219         """
 220         logging.info("#3 shift treatment data")
 221         self.__shift_trackI(self.treat)
 222         logging.info("#3 merge +/- strand of treatment data")
 223         self.treat.merge_plus_minus_ranges_w_duplicates ()
 224         logging.debug("#3 after shift and merging, tags: %d" % (self.treat.total))
 225         logging.info("#3 call peak candidates")
 226         peak_candidates = self.__call_peaks_from_trackI (self.treat)
 227         logging.info("#3 use self to calculate local lambda and  filter peak candidates...")
 228         self.final_peaks = self.__filter_w_control(peak_candidates,self.treat,1,pass_1k=True)
 229         return self.final_peaks
 230
 231     def __print_peak_info (self, peak_info):
 232         """Print out peak information.
 233
 234         A peak info type is a: dictionary
 235
 236         key value: chromosome
 237
 238         items: (peak start,peak end, peak length, peak summit, peak
 239         height, number of tags in peak region, peak pvalue, peak
 240         foldchange) <-- tuple type
 241         """
 242         chrs = peak_info.keys()
 243         chrs.sort()
 244         for chrom in chrs:
 245             peak_list = peak_info[chrom]
 246             for peak in peak_list:
 247                 print chrom+"\t"+"\t".join(map(str,peak))
 248
 249     def __filter_w_control (self, peak_info, control, treat2control_ratio, pass_1k=False, write2wig= False ):
 250         """Use control data to calculate several lambda values around
 251         1k, 5k and 10k region around peak summit. Choose the highest
 252         one as local lambda, then calculate p-value in poisson
 253         distribution.
 254
 255         Return value type in this format:
 256         a dictionary
 257         key value : chromosome
 258         items : array of (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_foldchange)
 259         """
 260         final_peak_info = {}
 261         chrs = peak_info.keys()
 262         chrs.sort()
 263         total = 0
 264         for chrom in chrs:
 265             logging.debug("#3 Chromosome %s" % (chrom))
 266             n_chrom = 0
 267             final_peak_info[chrom] = []
 268             peak_list = peak_info[chrom]
 269             (ctags,tmp) = control.get_ranges_by_chr(chrom)
 270             index_ctag = 0
 271             flag_find_ctag_in_peak = False
 272             prev_index_ctag = 0
 273             len_ctags =len(ctags)
 274             for i in range(len(peak_list)):
 275                 (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags) = peak_list[i]
 276                 left_10k = peak_summit-5000
 277                 left_5k = peak_summit-2500
 278                 left_1k = peak_summit-500
 279                 right_10k = peak_summit+5000
 280                 right_5k = peak_summit+2500
 281                 right_1k = peak_summit+500
 282                 num_10k = 0
 283                 num_5k = 0
 284                 num_1k = 0
 285                 while index_ctag < len_ctags:
 286                     if ctags[index_ctag] < left_10k:
 287                         # go to next control tag
 288                         index_ctag+=1
 289                     elif right_10k < ctags[index_ctag]:
 290                         # finalize and go to next peak region
 291                         flag_find_ctag_in_peak = False
 292                         index_ctag = prev_index_ctag
 293                         break
 294                     else:
 295                         # in -10k ~ 10k region
 296                         if not flag_find_ctag_in_peak:
 297                             flag_find_ctag_in_peak = True
 298                             prev_index_ctag = index_ctag
 299                         num_10k +=1
 300                         if left_1k <ctags[index_ctag]< right_1k:
 301                             num_1k+=1
 302                             num_5k+=1
 303                         elif left_5k <ctags[index_ctag]< right_5k:
 304                             num_5k+=1
 305                         index_ctag += 1
 306                 window_size_4_lambda = max(peak_length,self.scan_window)
 307                 lambda_10k = float(num_10k)/10000*treat2control_ratio*window_size_4_lambda
 308                 lambda_5k = float(num_5k)/5000*treat2control_ratio*window_size_4_lambda
 309                 lambda_1k = float(num_1k)/1000*treat2control_ratio*window_size_4_lambda
 310                 lambda_bg = self.lambda_bg/self.scan_window*window_size_4_lambda
 311                 if pass_1k:
 312                     local_lambda = max(lambda_bg,lambda_10k,lambda_5k)
 313                 else:
 314                     local_lambda = max(lambda_bg,lambda_10k,lambda_5k,lambda_1k)
 315 #                 logging.debug("lambda report:")
 316 #                 if local_lambda == lambda_1k:
 317 #                     logging.debug("lambda_1k %f is used" % lambda_1k)
 318 #                 if local_lambda == lambda_5k:
 319 #                     logging.debug("lambda_5k %f is used" % lambda_5k)
 320 #                 if local_lambda == lambda_10k:
 321 #                     logging.debug("lambda_10k %f is used" % lambda_10k)
 322 #                 if local_lambda == lambda_bg:
 323 #                     logging.debug("lambda_bg %f is used" % lambda_bg)
 324 #                 else:
 325 #                     logging.debug("\t".join(map(str,(lambda_bg,lambda_10k,lambda_5k,lambda_1k))))
 326
 327                 p_tmp = poisson_cdf_Q(local_lambda,peak_num_tags)
 328                 if p_tmp <= 0:
 329                     peak_pvalue = 3100
 330                 else:
 331                     peak_pvalue = math.log(p_tmp,10) * -10
 332                 if peak_pvalue > self.pvalue:
 333                     n_chrom += 1
 334                     total += 1
 335                     peak_foldchange = float(peak_num_tags)/local_lambda
 336                     final_peak_info[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags,peak_pvalue,peak_foldchange))
 337 #                 else:
 338 #                     logging.debug("lambda rules")
 339
 340             logging.debug("#3 peaks whose pvalue < cutoff: %d" % (n_chrom))
 341         logging.info("#3 Finally, %d peaks are called!" % (total))
 342         return final_peak_info
 343
 344     def __call_peaks_from_trackI (self, trackI):
 345         """Call peak candidates from trackI data. Using every tag as
 346         step and scan the self.scan_window region around the tag. If
 347         tag number is greater than self.min_tags, then the position is
 348         recorded.
 349
 350         Return: data in this format. (peak_start,peak_end,peak_length,peak_summit,peak_height,peak_num_tags)
 351         """
 352         peak_candidates = {}
 353         logging.debug("#3 search peak condidates...")
 354         chrs = trackI.get_chr_names()
 355         chrs.sort()
 356         total = 0
 357         for chrom in chrs:
 358             logging.debug("#3 Chromosome %s" % (chrom))
 359             n_chrom = 0
 360             peak_candidates[chrom] = []
 361             (tags,tmp) = trackI.get_ranges_by_chr(chrom)
 362             len_t = len(tags)
 363             cpr_tags = []       # Candidate Peak Region tags
 364             cpr_tags.extend(tags[:self.min_tags-1])
 365             number_cpr_tags = self.min_tags-1
 366             p = self.min_tags-1 # Next Tag Index
 367             while p < len_t:
 368                 if number_cpr_tags >= self.min_tags:
 369                     if tags[p] - cpr_tags[-1*self.min_tags+1] <= self.scan_window:
 370                         # add next tag, if the new tag is less than self.scan_window away from previous no. self.min_tags tag
 371                         cpr_tags.append(tags[p])
 372                         number_cpr_tags += 1
 373                         p+=1
 374                     else:
 375                         # candidate peak region is ready, call peak...
 376                         (peak_start,peak_end,peak_length,peak_summit,peak_height) = self.__tags_call_peak (cpr_tags)
 377                         peak_candidates[chrom].append((peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags))
 378                         cpr_tags = [tags[p]] # reset
 379
 380                         number_cpr_tags = 1
 381                         total += 1
 382                         n_chrom += 1
 383                 else:
 384                     # add next tag, but if the first one in cpr_tags
 385                     # is more than self.scan_window away from this new
 386                     # tag, remove the first one from the list
 387                     if tags[p] - cpr_tags[0] >= self.scan_window:
 388                         cpr_tags.pop(0)
 389                         number_cpr_tags -= 1
 390                     cpr_tags.append(tags[p])
 391                     number_cpr_tags += 1
 392                     p+=1
 393             logging.debug("#3 peak candidates: %d" % (n_chrom))
 394         logging.debug("#3 Total number of candidates: %d" % (total))
 395         return peak_candidates
 396
 397     def __tags_call_peak (self, tags ):
 398         """Project tags to a line. Then find the highest point.
 399
 400         """
 401         start = tags[0]-self.frag_length/2
 402         end = tags[-1]+self.frag_length/2+1
 403         region_length = end - start
 404         line= [0]*region_length
 405         for tag in tags:
 406             tag_projected_start = tag-start-self.frag_length/2
 407             tag_projected_end = tag-start+self.frag_length/2
 408             for i in range(tag_projected_start,tag_projected_end):
 409                 line[i] += 1
 410         tops = []
 411         top_height = 0
 412         for i in range(len(line)):
 413             if line[i] > top_height:
 414                 top_height = line[i]
 415                 tops = [i]
 416             elif line[i] == top_height:
 417                 tops.append(i)
 418         peak_summit = tops[len(tops)/2]+start
 419         return (start,end,region_length,peak_summit,top_height)
 420
 421
 422     def __shift_trackI (self, trackI):
 423         """Shift trackI data to right (for plus strand) or left (for
 424         minus strand).
 425
 426         """
 427         chrs = trackI.get_chr_names()
 428         chrs.sort()
 429         for chrom in chrs:
 430             tags = trackI.get_ranges_by_chr(chrom)
 431             # plus
 432             for i in range(len(tags[0])):
 433                 tags[0][i]=tags[0][i]+self.shift_size
 434             # minus
 435             for i in range(len(tags[1])):
 436                 tags[1][i]=tags[1][i]-self.shift_size
 437         return
 438