htswanalysis/MACS/lib/TabIO/__init__.py

   1 # Time-stamp: <2008-05-28 13:25:06 Tao Liu>
   2
   3 """Module for FWTrackI, BEDParser and ELANDParser classes
   4
   5 Copyright (c) 2007 Tao Liu <taoliu@jimmy.harvard.edu>
   6
   7 This code is free software; you can redistribute it and/or modify it
   8 under the terms of the Artistic License (see the file COPYING included
   9 with the distribution).
  10
  11 @status:  experimental
  12 @version: $Revision$
  13 @author:  Tao Liu
  14 @contact: taoliu@jimmy.harvard.edu
  15 """
  16
  17 # ------------------------------------
  18 # python modules
  19 # ------------------------------------
  20 import re
  21 import sys
  22 import logging
  23 # ------------------------------------
  24 # constants
  25 # ------------------------------------
  26 __version__ = "TabIO $Revision$"
  27 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
  28 __doc__ = "FWTrackI, BEDParser and ELANDParser classes"
  29
  30 # ------------------------------------
  31 # Misc functions
  32 # ------------------------------------
  33
  34 # ------------------------------------
  35 # Classes
  36 # ------------------------------------
  37
  38 class FWTrackI:
  39     """Fixed Width Ranges along the whole genome (commonly with the
  40     same annotation type), which are stored in a dict.
  41
  42     Ranges are stored and organized by sequence names (chr names) in a
  43     dict. They can be sorted by calling self.sort() function.
  44
  45     Example:
  46        >>> tabfile = TabFile("tag.bed",format="bed",mode="r")
  47        >>> track = FWTrackI()
  48        >>> for (chromosome,rg) in tabfile:
  49        ...    track.add_range(chromosome,rg)
  50        >>> track.get_ranges_by_chr["chr1"] # all ranges in chr1
  51     """
  52     def __init__ (self,fw=0,anno=""):
  53         """fw is the fixed-width for all ranges
  54         """
  55         self.fw = fw
  56         self.__ranges = {}
  57         self.__comments = {}    # store the comments if available for each range
  58         self.__well_sorted = False
  59         self.__well_merged = False
  60         self.total = 0
  61         self.total_w_duplicates = 0
  62         self.annotation = anno   # need to be figured out
  63
  64     def add_range (self, chromosome, range):
  65         """Add a range to the list according to the sequence name.
  66
  67         chromosome -- mostly the chromosome name
  68         range -- RangeI objectleverage
  69         """
  70         if not isinstance(range,RangeI):
  71             raise Exception("Must add a RangeI object!")
  72         if not self.__ranges.has_key(chromosome):
  73             self.__ranges[chromosome] = [[],[]] # for (+strand, -strand)
  74             self.__comments[chromosome] = [[],[]] # for (+,-)
  75         #print "add: "+str(range)
  76         if range.strand == 1:
  77             self.__ranges[chromosome][0].append(range.start)
  78             self.__comments[chromosome][0].append(1)
  79         elif range.strand == -1:
  80             self.__ranges[chromosome][1].append(range.end)
  81             self.__comments[chromosome][1].append(1)
  82         self.total+=1
  83         self.__well_sorted = False
  84         self.__well_merged = False
  85
  86     def add_loc (self, chromosome, fiveendpos, strand):
  87         """Add a range to the list according to the sequence name.
  88
  89         chromosome -- mostly the chromosome name
  90         fiveendpos -- 5' end pos, left for plus strand, neg for neg strand
  91         strand     -- 0: plus, 1: minus
  92         """
  93         if not self.__ranges.has_key(chromosome):
  94             self.__ranges[chromosome] = [[],[]] # for (+strand, -strand)
  95             self.__comments[chromosome] = [[],[]] # for (+,-)
  96         #print "add: "+str(range)
  97         self.__ranges[chromosome][strand].append(fiveendpos)
  98         self.__comments[chromosome][strand].append(1)
  99         self.total+=1
 100         self.__well_sorted = False
 101         self.__well_merged = False
 102
 103     def get_ranges_by_chr (self, chromosome):
 104         """Return array of locations by chromosome.
 105
 106         Not recommanded! Use generate_rangeI_by_chr() instead.
 107         """
 108         if self.__ranges.has_key(chromosome):
 109             return self.__ranges[chromosome]
 110         else:
 111             raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
 112
 113     def get_comments_by_chr (self, chromosome):
 114         """Return array of comments by chromosome.
 115
 116         """
 117         if self.__comments.has_key(chromosome):
 118             return self.__comments[chromosome]
 119         else:
 120             raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
 121
 122
 123     def generate_rangeI_by_chr_strand (self, chromosome,strand):
 124         """A generator to return RangeI objects on given chromosome and given strand.
 125
 126         strand: 0 for plus, 1 for minus strand
 127
 128         Recommended function.
 129         """
 130         if self.__ranges.has_key(chromosome):
 131             for i in self.__ranges[chromosome][strand]:
 132                 yield RangeI(start=i,end=i+self.fw,strand=strand) # buggy!!
 133         else:
 134             raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
 135
 136     def get_chr_names (self):
 137         """Return all the chromosome names stored in this track object.
 138         """
 139         l = self.__ranges.keys()
 140         l.sort()
 141         return l
 142
 143     def length (self):
 144         return self.total*self.fw
 145
 146     def merge_overlap (self):
 147        """merge the SAME ranges. Record the duplicate number in self.__comments{}
 148
 149 *Note: different with the merge_overlap() in TrackI class, which merges the overlapped ranges.
 150
 151        """
 152        if not self.__well_sorted:
 153            self.sort()
 154        self.total = 0
 155        self.total_w_duplicates = 0
 156        for k in self.__ranges.keys(): # for each chromosome
 157            #logging.debug(" ... mergeing chr: %s" % (k))
 158            (plus,minus) = self.__ranges[k]
 159            (plus_c,minus_c) = self.__comments[k]
 160            (new_plus,new_minus,new_plus_c,new_minus_c) = ([],[],[],[])
 161            # + strand
 162            if len(plus)>0:
 163                new_plus.append(plus[0])
 164                new_plus_c.append(1)
 165                n = 0                # the position in new list
 166                for p in plus[1:]:
 167                    if p == new_plus[n]:
 168                        new_plus_c[n]+=1
 169                    else:
 170                        new_plus.append(p)
 171                        new_plus_c.append(1)
 172                        n += 1
 173                self.total +=  len(new_plus)
 174                self.total_w_duplicates += sum(new_plus_c)
 175
 176            # - strand
 177            if len(minus)>0:
 178                new_minus.append(minus[0])
 179                new_minus_c.append(1)
 180                n = 0                # the position in new list
 181                for p in minus[1:]:
 182                    if p == new_minus[n]:
 183                        new_minus_c[n]+=1
 184                    else:
 185                        new_minus.append(p)
 186                        new_minus_c.append(1)
 187                        n += 1
 188                self.total +=  len(new_minus)
 189                self.total_w_duplicates += sum(new_minus_c)
 190
 191            self.__ranges[k]=[new_plus,new_minus]
 192            self.__comments[k]=[new_plus_c,new_minus_c]
 193        self.__well_merged = True
 194
 195     def merge_plus_minus_ranges_w_duplicates (self):
 196         """Merge minus strand ranges to plus strand. Reset the comments. keep duplicates.
 197
 198         """
 199         self.total = 0
 200         self.total_w_duplicates = None
 201         for chrom in self.__ranges.keys():
 202             (plus_tags,minus_tags) = self.__ranges[chrom]
 203             new_plus_tags = []
 204             #reset comments
 205             self.__comments[chrom][0] = []
 206             self.__comments[chrom][1] = []
 207             ip = 0
 208             im = 0
 209             lenp = len(plus_tags)
 210             lenm = len(minus_tags)
 211             while ip < lenp and im < lenm:
 212                 if plus_tags[ip] < minus_tags[im]:
 213                     new_plus_tags.append(plus_tags[ip])
 214                     ip += 1
 215                 else:
 216                     new_plus_tags.append(minus_tags[im])
 217                     im += 1
 218             if im < lenm:
 219                 # add rest of minus tags
 220                 new_plus_tags.extend(minus_tags[im:])
 221             if ip < lenp:
 222                 # add rest of plus tags
 223                 new_plus_tags.extend(plus_tags[ip:])
 224
 225             self.__ranges[chrom] = [new_plus_tags,None]
 226             self.total += len(new_plus_tags)
 227
 228     def status (self):
 229         """Return the information whether or not the track is sorted and/or merged.
 230
 231         Return value: tuple (boolean,boolean) means (sorted?,merged?)
 232         """
 233         return (self.__well_sorted,self.__well_merged)
 234
 235     def sort (self):
 236         """Sort all ranges in list. Recommanded after you import all ranges.
 237         """
 238         if self.__well_sorted:
 239             pass
 240         for key in self.__ranges.keys():
 241             self.__ranges[key][0].sort(lambda x,y: cmp(x,y))
 242             self.__ranges[key][1].sort(lambda x,y: cmp(x,y))
 243         self.__well_sorted = True
 244
 245
 246     def __str__ (self):
 247         return self.__to_wiggle()
 248
 249     def __to_wiggle (self):
 250         t = "track type=wiggle_0 name=\"variableStep\" description=\"%s\"\n" % (self.annotation)
 251         for key in self.__ranges.keys():
 252             t += "variableStep chrom=%s span=%d\n" % (key,self.fw)
 253             for i in self.__ranges[key][0]:
 254                 t += str(i)+"\n"
 255             for i in self.__ranges[key][1]:
 256                 t += str(i)+"\n"
 257         return t
 258
 259 class BEDParser:
 260     """File Parser Class for tabular File.
 261
 262     """
 263     def __init__ (self):
 264         """All parameters except for format are the same as python's
 265         builtin file class.
 266
 267         Format parameter can be "bed","gff" or other particular string
 268         for other format of file. For example, if your file is '\\t'
 269         delimited, and the first column of the file is chromosome
 270         name, the second column is start position, third column is end
 271         position, fourth column is the strand, the coordinates are
 272         0-indexed and the range is closed, then you should write the
 273         format string as "123401\\t" (six numbers and the delimiter).
 274
 275         Note: Use the fifth and sixth number in format string to
 276         indicate whether the coordinates are 0-indexed (0) or
 277         1-indexed (1) and whether the range is opened (0) or closed
 278         (1) (i.e. whether the end position is included in the range
 279         or not)
 280         """
 281         self.__format__ = "123600\t"
 282         self.__delimiter__ = self.__format__[6:]
 283         self.__chr_col__ = int(self.__format__[0])-1
 284         self.__start_col__ = int(self.__format__[1])-1
 285         self.__end_col__ = int(self.__format__[2])-1
 286         self.__strand_col__ = int(self.__format__[3])-1
 287         self.__start_shift__ = int(self.__format__[4])
 288         self.__end_shift__ = int(self.__format__[5])
 289
 290     def build_fwtrack (self, fhd):
 291         """Build FWTrackI from all lines, return a FWTrackI object.
 292
 293         Note: All ranges will be merged (exclude the same
 294         range) then sorted after the track is built.
 295
 296         If both_strand is True, it will store strand information in
 297         FWTrackI object.
 298
 299         if do_merge is False, it will not merge the same range after
 300         the track is built.
 301         """
 302         fwtrack = FWTrackI()
 303         i = 0
 304         m = 0
 305         for thisline in fhd:
 306             (chromosome,fpos,strand) = self.__fw_parse_line(thisline)
 307             i+=1
 308             if i == 1000000:
 309                 m += 1
 310                 logging.info(" %d" % (m*1000000))
 311                 i=0
 312             if not fpos or not chromosome:
 313                 continue
 314             fwtrack.add_loc(chromosome,fpos,strand)
 315         return fwtrack
 316
 317     def __fw_parse_line (self, thisline ):
 318         if not thisline: return (None,None,None)
 319         thisline = thisline.rstrip()
 320         if not thisline: return ("blank",None,None)
 321
 322         if thisline.startswith("#") or thisline.startswith("track"):
 323             return ("comment line",None,None) # comment line is skipped
 324         thisfields = thisline.split("\t")
 325         if len(thisfields) > 5 :
 326             strand = thisfields[5]
 327         else:
 328             strand = "+"        # default pos strand if no strand info can be found
 329         if strand == "+":
 330             return (thisfields[0],
 331                     int(thisfields[1]),
 332                     0)
 333         elif strand == "-":
 334             return (thisfields[0],
 335                     int(thisfields[2]),
 336                     1)
 337         else:
 338             raise self.StrandFormatError(thisline,strand)
 339
 340     class StrandFormatError(Exception):
 341         def __init__ (self, string, strand):
 342             self.message = "Strand information can not be recognized in this line: \"%s\",\"%s\"" % (string,strand)
 343
 344         def __str__ (self):
 345             return repr(self.message)
 346
 347
 348 class ELANDParser:
 349     """File Parser Class for tabular File.
 350
 351     """
 352     def __init__ (self):
 353         """
 354         """
 355         pass
 356
 357     def build_fwtrack (self, fhd):
 358         """Build FWTrackI from all lines, return a FWTrackI object.
 359
 360         Note: All ranges will be merged (exclude the same
 361         range) then sorted after the track is built.
 362
 363         If both_strand is True, it will store strand information in
 364         FWTrackI object.
 365
 366         if do_merge is False, it will not merge the same range after
 367         the track is built.
 368         """
 369         fwtrack = FWTrackI()
 370         i = 0
 371         m = 0
 372         for thisline in fhd:
 373             (chromosome,fpos,strand) = self.__fw_parse_line(thisline)
 374             i+=1
 375             if i == 1000000:
 376                 m += 1
 377                 logging.info(" %d" % (m*1000000))
 378                 i=0
 379             if not fpos or not chromosome:
 380                 continue
 381             fwtrack.add_loc(chromosome,fpos,strand)
 382         return fwtrack
 383
 384     def __fw_parse_line (self, thisline ):
 385         if not thisline: return (None,None,None)
 386         thisline = thisline.rstrip()
 387         if not thisline: return ("blank",None,None)
 388
 389         if thisline[0] == "#": return ("comment line",None,None) # comment line is skipped
 390         thisfields = thisline.split("\t")
 391         thistaglength = len(thisfields[1])
 392
 393         if thisfields[2] == "U0" or thisfields[2]=="U1" or thisfields[2]=="U2":
 394             strand = thisfields[8]
 395             if strand == "F":
 396                 return (thisfields[6],
 397                         int(thisfields[7])-1,
 398                         0)
 399             elif strand == "R":
 400                 return (thisfields[6],
 401                         int(thisfields[7])+thistaglength-2,
 402                         1)
 403             else:
 404                 raise self.StrandFormatError(thisline,strand)
 405         else:
 406             return (None,None,None)
 407
 408     class StrandFormatError(Exception):
 409         def __init__ (self, string, strand):
 410             self.message = "Strand information can not be recognized in this line: \"%s\",\"%s\"" % (string,strand)
 411
 412         def __str__ (self):
 413             return repr(self.message)