1 # Time-stamp: <2008-05-28 13:25:06 Tao Liu>
3 """Module for FWTrackI, BEDParser and ELANDParser classes
5 Copyright (c) 2007 Tao Liu <taoliu@jimmy.harvard.edu>
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the Artistic License (see the file COPYING included
9 with the distribution).
14 @contact: taoliu@jimmy.harvard.edu
17 # ------------------------------------
19 # ------------------------------------
23 # ------------------------------------
25 # ------------------------------------
26 __version__ = "TabIO $Revision$"
27 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
28 __doc__ = "FWTrackI, BEDParser and ELANDParser classes"
30 # ------------------------------------
32 # ------------------------------------
34 # ------------------------------------
36 # ------------------------------------
39 """Fixed Width Ranges along the whole genome (commonly with the
40 same annotation type), which are stored in a dict.
42 Ranges are stored and organized by sequence names (chr names) in a
43 dict. They can be sorted by calling self.sort() function.
46 >>> tabfile = TabFile("tag.bed",format="bed",mode="r")
47 >>> track = FWTrackI()
48 >>> for (chromosome,rg) in tabfile:
49 ... track.add_range(chromosome,rg)
50 >>> track.get_ranges_by_chr["chr1"] # all ranges in chr1
52 def __init__ (self,fw=0,anno=""):
53 """fw is the fixed-width for all ranges
57 self.__comments = {} # store the comments if available for each range
58 self.__well_sorted = False
59 self.__well_merged = False
61 self.total_w_duplicates = 0
62 self.annotation = anno # need to be figured out
64 def add_range (self, chromosome, range):
65 """Add a range to the list according to the sequence name.
67 chromosome -- mostly the chromosome name
68 range -- RangeI objectleverage
70 if not isinstance(range,RangeI):
71 raise Exception("Must add a RangeI object!")
72 if not self.__ranges.has_key(chromosome):
73 self.__ranges[chromosome] = [[],[]] # for (+strand, -strand)
74 self.__comments[chromosome] = [[],[]] # for (+,-)
75 #print "add: "+str(range)
77 self.__ranges[chromosome][0].append(range.start)
78 self.__comments[chromosome][0].append(1)
79 elif range.strand == -1:
80 self.__ranges[chromosome][1].append(range.end)
81 self.__comments[chromosome][1].append(1)
83 self.__well_sorted = False
84 self.__well_merged = False
86 def add_loc (self, chromosome, fiveendpos, strand):
87 """Add a range to the list according to the sequence name.
89 chromosome -- mostly the chromosome name
90 fiveendpos -- 5' end pos, left for plus strand, neg for neg strand
91 strand -- 0: plus, 1: minus
93 if not self.__ranges.has_key(chromosome):
94 self.__ranges[chromosome] = [[],[]] # for (+strand, -strand)
95 self.__comments[chromosome] = [[],[]] # for (+,-)
96 #print "add: "+str(range)
97 self.__ranges[chromosome][strand].append(fiveendpos)
98 self.__comments[chromosome][strand].append(1)
100 self.__well_sorted = False
101 self.__well_merged = False
103 def get_ranges_by_chr (self, chromosome):
104 """Return array of locations by chromosome.
106 Not recommanded! Use generate_rangeI_by_chr() instead.
108 if self.__ranges.has_key(chromosome):
109 return self.__ranges[chromosome]
111 raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
113 def get_comments_by_chr (self, chromosome):
114 """Return array of comments by chromosome.
117 if self.__comments.has_key(chromosome):
118 return self.__comments[chromosome]
120 raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
123 def generate_rangeI_by_chr_strand (self, chromosome,strand):
124 """A generator to return RangeI objects on given chromosome and given strand.
126 strand: 0 for plus, 1 for minus strand
128 Recommended function.
130 if self.__ranges.has_key(chromosome):
131 for i in self.__ranges[chromosome][strand]:
132 yield RangeI(start=i,end=i+self.fw,strand=strand) # buggy!!
134 raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
136 def get_chr_names (self):
137 """Return all the chromosome names stored in this track object.
139 l = self.__ranges.keys()
144 return self.total*self.fw
146 def merge_overlap (self):
147 """merge the SAME ranges. Record the duplicate number in self.__comments{}
149 *Note: different with the merge_overlap() in TrackI class, which merges the overlapped ranges.
152 if not self.__well_sorted:
155 self.total_w_duplicates = 0
156 for k in self.__ranges.keys(): # for each chromosome
157 #logging.debug(" ... mergeing chr: %s" % (k))
158 (plus,minus) = self.__ranges[k]
159 (plus_c,minus_c) = self.__comments[k]
160 (new_plus,new_minus,new_plus_c,new_minus_c) = ([],[],[],[])
163 new_plus.append(plus[0])
165 n = 0 # the position in new list
173 self.total += len(new_plus)
174 self.total_w_duplicates += sum(new_plus_c)
178 new_minus.append(minus[0])
179 new_minus_c.append(1)
180 n = 0 # the position in new list
182 if p == new_minus[n]:
186 new_minus_c.append(1)
188 self.total += len(new_minus)
189 self.total_w_duplicates += sum(new_minus_c)
191 self.__ranges[k]=[new_plus,new_minus]
192 self.__comments[k]=[new_plus_c,new_minus_c]
193 self.__well_merged = True
195 def merge_plus_minus_ranges_w_duplicates (self):
196 """Merge minus strand ranges to plus strand. Reset the comments. keep duplicates.
200 self.total_w_duplicates = None
201 for chrom in self.__ranges.keys():
202 (plus_tags,minus_tags) = self.__ranges[chrom]
205 self.__comments[chrom][0] = []
206 self.__comments[chrom][1] = []
209 lenp = len(plus_tags)
210 lenm = len(minus_tags)
211 while ip < lenp and im < lenm:
212 if plus_tags[ip] < minus_tags[im]:
213 new_plus_tags.append(plus_tags[ip])
216 new_plus_tags.append(minus_tags[im])
219 # add rest of minus tags
220 new_plus_tags.extend(minus_tags[im:])
222 # add rest of plus tags
223 new_plus_tags.extend(plus_tags[ip:])
225 self.__ranges[chrom] = [new_plus_tags,None]
226 self.total += len(new_plus_tags)
229 """Return the information whether or not the track is sorted and/or merged.
231 Return value: tuple (boolean,boolean) means (sorted?,merged?)
233 return (self.__well_sorted,self.__well_merged)
236 """Sort all ranges in list. Recommanded after you import all ranges.
238 if self.__well_sorted:
240 for key in self.__ranges.keys():
241 self.__ranges[key][0].sort(lambda x,y: cmp(x,y))
242 self.__ranges[key][1].sort(lambda x,y: cmp(x,y))
243 self.__well_sorted = True
247 return self.__to_wiggle()
249 def __to_wiggle (self):
250 t = "track type=wiggle_0 name=\"variableStep\" description=\"%s\"\n" % (self.annotation)
251 for key in self.__ranges.keys():
252 t += "variableStep chrom=%s span=%d\n" % (key,self.fw)
253 for i in self.__ranges[key][0]:
255 for i in self.__ranges[key][1]:
260 """File Parser Class for tabular File.
264 """All parameters except for format are the same as python's
267 Format parameter can be "bed","gff" or other particular string
268 for other format of file. For example, if your file is '\\t'
269 delimited, and the first column of the file is chromosome
270 name, the second column is start position, third column is end
271 position, fourth column is the strand, the coordinates are
272 0-indexed and the range is closed, then you should write the
273 format string as "123401\\t" (six numbers and the delimiter).
275 Note: Use the fifth and sixth number in format string to
276 indicate whether the coordinates are 0-indexed (0) or
277 1-indexed (1) and whether the range is opened (0) or closed
278 (1) (i.e. whether the end position is included in the range
281 self.__format__ = "123600\t"
282 self.__delimiter__ = self.__format__[6:]
283 self.__chr_col__ = int(self.__format__[0])-1
284 self.__start_col__ = int(self.__format__[1])-1
285 self.__end_col__ = int(self.__format__[2])-1
286 self.__strand_col__ = int(self.__format__[3])-1
287 self.__start_shift__ = int(self.__format__[4])
288 self.__end_shift__ = int(self.__format__[5])
290 def build_fwtrack (self, fhd):
291 """Build FWTrackI from all lines, return a FWTrackI object.
293 Note: All ranges will be merged (exclude the same
294 range) then sorted after the track is built.
296 If both_strand is True, it will store strand information in
299 if do_merge is False, it will not merge the same range after
306 (chromosome,fpos,strand) = self.__fw_parse_line(thisline)
310 logging.info(" %d" % (m*1000000))
312 if not fpos or not chromosome:
314 fwtrack.add_loc(chromosome,fpos,strand)
317 def __fw_parse_line (self, thisline ):
318 if not thisline: return (None,None,None)
319 thisline = thisline.rstrip()
320 if not thisline: return ("blank",None,None)
322 if thisline.startswith("#") or thisline.startswith("track"):
323 return ("comment line",None,None) # comment line is skipped
324 thisfields = thisline.split("\t")
325 if len(thisfields) > 5 :
326 strand = thisfields[5]
328 strand = "+" # default pos strand if no strand info can be found
330 return (thisfields[0],
334 return (thisfields[0],
338 raise self.StrandFormatError(thisline,strand)
340 class StrandFormatError(Exception):
341 def __init__ (self, string, strand):
342 self.message = "Strand information can not be recognized in this line: \"%s\",\"%s\"" % (string,strand)
345 return repr(self.message)
349 """File Parser Class for tabular File.
357 def build_fwtrack (self, fhd):
358 """Build FWTrackI from all lines, return a FWTrackI object.
360 Note: All ranges will be merged (exclude the same
361 range) then sorted after the track is built.
363 If both_strand is True, it will store strand information in
366 if do_merge is False, it will not merge the same range after
373 (chromosome,fpos,strand) = self.__fw_parse_line(thisline)
377 logging.info(" %d" % (m*1000000))
379 if not fpos or not chromosome:
381 fwtrack.add_loc(chromosome,fpos,strand)
384 def __fw_parse_line (self, thisline ):
385 if not thisline: return (None,None,None)
386 thisline = thisline.rstrip()
387 if not thisline: return ("blank",None,None)
389 if thisline[0] == "#": return ("comment line",None,None) # comment line is skipped
390 thisfields = thisline.split("\t")
391 thistaglength = len(thisfields[1])
393 if thisfields[2] == "U0" or thisfields[2]=="U1" or thisfields[2]=="U2":
394 strand = thisfields[8]
396 return (thisfields[6],
397 int(thisfields[7])-1,
400 return (thisfields[6],
401 int(thisfields[7])+thistaglength-2,
404 raise self.StrandFormatError(thisline,strand)
406 return (None,None,None)
408 class StrandFormatError(Exception):
409 def __init__ (self, string, strand):
410 self.message = "Strand information can not be recognized in this line: \"%s\",\"%s\"" % (string,strand)
413 return repr(self.message)