Add support for converting mutli-eland files from pipeline 0.3 to
authorDiane Trout <diane@caltech.edu>
Fri, 29 Aug 2008 16:51:24 +0000 (16:51 +0000)
committerDiane Trout <diane@caltech.edu>
Fri, 29 Aug 2008 16:51:24 +0000 (16:51 +0000)
bedfiles

gaworkflow/util/makebed.py
gaworkflow/util/test/test_makebed.py [new file with mode: 0644]
scripts/makebed

index 738b815b270d2d8352d6157a7b5e1948b437e3a3..6f1511ca974baaa0da9c9c55dad268a1deca7e86 100755 (executable)
@@ -2,6 +2,22 @@
 Utility functions to make bedfiles.
 """
 import os
+import re
+
+# map eland_result.txt sense 
+sense_map = { 'F': '+', 'R': '-'}
+sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
+
+def write_bed_header(outstream, name, description):
+  """
+  Produce the headerline for a bedfile
+  """
+  # provide default track names
+  if name is None: name = "track"
+  if description is None: description = "eland result file"
+  bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
+  bed_header += os.linesep
+  outstream.write(bed_header % (name, description))
 
 def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
   """
@@ -12,15 +28,8 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom
   CHR = 6
   START = 7
   SENSE = 8
-  # map eland_result.txt sense 
-  sense_map = { 'F': '+', 'R': '-'}
-  sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
-  # provide default track names
-  if name is None: name = "track"
-  if description is None: description = "eland result file"
-  bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
-  bed_header += os.linesep
-  outstream.write(bed_header % (name, description))
+
+  write_bed_header(outstream, name, description)
 
   for line in instream:
     fields = line.split()
@@ -42,6 +51,72 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom
       os.linesep  
     ))
 
+
+def make_bed_from_multi_eland_stream(
+  instream, 
+  outstream, 
+  name, 
+  description, 
+  chr_prefix='chr', 
+  max_reads=255
+  ):
+  """
+  read a multi eland stream and write a bedfile
+  """
+  write_bed_header(outstream, name, description)
+  parse_multi_eland(instream, outstream, chr_prefix, max_reads)
+
+def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
+
+  loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9]+))'
+  other_pattern = '(?P<chr>[^:,]+)'
+  split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
+
+  for line in instream:
+    rec = line.split()
+    if len(rec) > 3:
+      # colony_id = rec[0]
+      seq = rec[1]
+      # number of matches for 0, 1, and 2 mismatches
+      # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
+      compressed_reads = rec[3]
+      cur_chr = ""
+      reads = {0: [], 1: [], 2:[]}
+
+      for token in split_re.finditer(compressed_reads):
+        if token.group('chr') is not None:
+          cur_chr =  token.group('chr')[:-3] # strip off .fa
+        elif token.group('fullloc') is not None:
+          matches = int(token.group('count'))
+          # only emit a bed line if 
+          #  our current chromosome starts with chromosome pattern
+          if chr_prefix is None or cur_chr.startswith(chr_prefix):
+            start = int(token.group('start'))
+            stop = start + len(seq)
+            orientation = token.group('dir')
+            strand = sense_map[orientation]
+            color = sense_color[orientation]
+            # build up list of reads for this record
+            reads[matches].append((cur_chr, start, stop, strand, color))
+
+      # report up to our max_read threshold reporting the fewer-mismatch
+      # matches first
+      reported_reads = 0
+      keys = [0,1,2]
+      for mismatch, read_list in ((k, reads[k]) for k in keys): 
+        reported_reads += len(read_list)
+        if reported_reads <= max_reads:
+          for cur_chr, start, stop, strand, color in read_list:
+            reported_reads += 1
+            outstream.write('%s %d %d read 0 %s - - %s%s' % (
+                cur_chr,
+                start,
+                stop,
+                sense_map[orientation],
+                sense_color[orientation],
+                os.linesep
+            ))
+
 def make_description(database, flowcell_id, lane):
     """
     compute a bedfile name and description from the fctracker database
diff --git a/gaworkflow/util/test/test_makebed.py b/gaworkflow/util/test/test_makebed.py
new file mode 100644 (file)
index 0000000..e96f29b
--- /dev/null
@@ -0,0 +1,51 @@
+import os
+from StringIO import StringIO
+import unittest
+
+from gaworkflow.util import makebed
+
+class testMakeBed(unittest.TestCase):
+    def test_multi_1_0_0_limit_1(self):
+      instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383    TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0   mm9_chr13_random.fa:1240R0')
+      out = StringIO()
+
+      makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+      self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+
+    def test_multi_1_0_0_limit_255(self):
+      instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383    TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0   mm9_chr13_random.fa:1240R0')
+      out = StringIO()
+
+      makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
+      self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+
+
+    def test_multi_2_0_0_limit_1(self):
+      instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586    GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0   mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
+      out = StringIO()
+
+      makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+      self.failUnlessEqual(out.len, 0)
+
+    def test_multi_2_0_0_limit_255(self):
+      instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586    GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0   mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
+      out = StringIO()
+
+      makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
+      self.failUnlessEqual(out.len, 98)
+
+    def test_multi_0_2_0_limit_1(self):
+      instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:115:495    TCTCCCTGAAAAATANAAGTGNTGTTGGTGAG        0:2:1   mm9_chr14.fa:104434729F2,mm9_chr16.fa:63263818R1,mm9_chr2.fa:52265438R1')
+      out = StringIO()
+
+      makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+      print out.getvalue()
+      self.failUnlessEqual(out.len, 0)
+
+def suite():
+    return unittest.makeSuite(testMakeBed, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
+
index 4ffa39fe231c95a344efbddf2a23bde544203026..a4a414b27d57608d89cc5dca4b9b93f629c2fb62 100755 (executable)
@@ -3,7 +3,7 @@ import optparse
 import sys
 import os
 
-from gaworkflow.util.makebed import make_bed_from_eland_stream, make_description
+from gaworkflow.util.makebed import make_bed_from_eland_stream, make_bed_from_multi_eland_stream, make_description
 
 def make_parser():
   parser = optparse.OptionParser()
@@ -29,6 +29,19 @@ def make_parser():
   parser.add_option("--lane", dest='lane',
                     help='specify which lane to use when retrieving description from database',
                     default=None)
+
+  multi = optparse.OptionGroup(parser, 'Multi-read ELAND support')
+
+  multi.add_option('-m', '--multi', action='store_true',
+                    help='Enable parsing multi-read eland files',
+                    default=False)
+  multi.add_option('--reads', type='int',
+                   help='limit reporting multi reads to this many reads'
+                        '(most usefully --reads=1 will turn a multi-read '
+                        'file into a single read file)',
+                   default=255)
+  parser.add_option_group(multi)
+
   return parser
 
 def main(command_line=None):
@@ -42,17 +55,26 @@ def main(command_line=None):
     parser.error("Need eland input file name")
     return 1
 
-  if options.outname is None:
-    options.outname = os.path.splitext(options.inname)[0]+'.bed'
-    print >>sys.stderr, "defaulting to outputname", options.outname
-
-  if os.path.exists(options.inname):
+  if options.inname == '-':
+    instream = sys.stdin
+  elif os.path.exists(options.inname):
     instream = open(options.inname, 'r')
   else:
     parser.error('%s was not found' % (options.inname))
     return 1
 
-  if os.path.exists(options.outname):
+  if options.outname is None:
+      # if outname wasn't defined, and we're reading from stdout
+      if instream is sys.stdin:
+          # write to stdout
+          outstream = sys.stdout
+      else:
+          # if there's a name write to name.bde
+          options.outname = os.path.splitext(options.inname)[0]+'.bed'
+          print >>sys.stderr, "defaulting to outputname", options.outname
+  elif options.outname == '-':
+      outstream = sys.stdout
+  elif os.path.exists(options.outname):
       parser.error("not overwriting %s" % (options.outname))
       return 1
   else:
@@ -67,7 +89,16 @@ def main(command_line=None):
     name = options.name
     description = options.description
 
-  make_bed_from_eland_stream(instream, outstream, name, description, options.prefix)
+  if options.multi:
+    make_bed_from_multi_eland_stream(instream, outstream, 
+                                     name, description, 
+                                     options.prefix,
+                                     options.reads)
+
+  else:
+    make_bed_from_eland_stream(instream, outstream, 
+                               name, description, 
+                               options.prefix)
   return 0
 
 if __name__ == "__main__":