Add support for converting mutli-eland files from pipeline 0.3 to
[htsworkflow.git] / gaworkflow / util / makebed.py
index 738b815b270d2d8352d6157a7b5e1948b437e3a3..6f1511ca974baaa0da9c9c55dad268a1deca7e86 100755 (executable)
@@ -2,6 +2,22 @@
 Utility functions to make bedfiles.
 """
 import os
+import re
+
+# map eland_result.txt sense 
+sense_map = { 'F': '+', 'R': '-'}
+sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
+
+def write_bed_header(outstream, name, description):
+  """
+  Produce the headerline for a bedfile
+  """
+  # provide default track names
+  if name is None: name = "track"
+  if description is None: description = "eland result file"
+  bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
+  bed_header += os.linesep
+  outstream.write(bed_header % (name, description))
 
 def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
   """
@@ -12,15 +28,8 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom
   CHR = 6
   START = 7
   SENSE = 8
-  # map eland_result.txt sense 
-  sense_map = { 'F': '+', 'R': '-'}
-  sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
-  # provide default track names
-  if name is None: name = "track"
-  if description is None: description = "eland result file"
-  bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
-  bed_header += os.linesep
-  outstream.write(bed_header % (name, description))
+
+  write_bed_header(outstream, name, description)
 
   for line in instream:
     fields = line.split()
@@ -42,6 +51,72 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom
       os.linesep  
     ))
 
+
+def make_bed_from_multi_eland_stream(
+  instream, 
+  outstream, 
+  name, 
+  description, 
+  chr_prefix='chr', 
+  max_reads=255
+  ):
+  """
+  read a multi eland stream and write a bedfile
+  """
+  write_bed_header(outstream, name, description)
+  parse_multi_eland(instream, outstream, chr_prefix, max_reads)
+
+def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
+
+  loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9]+))'
+  other_pattern = '(?P<chr>[^:,]+)'
+  split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
+
+  for line in instream:
+    rec = line.split()
+    if len(rec) > 3:
+      # colony_id = rec[0]
+      seq = rec[1]
+      # number of matches for 0, 1, and 2 mismatches
+      # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
+      compressed_reads = rec[3]
+      cur_chr = ""
+      reads = {0: [], 1: [], 2:[]}
+
+      for token in split_re.finditer(compressed_reads):
+        if token.group('chr') is not None:
+          cur_chr =  token.group('chr')[:-3] # strip off .fa
+        elif token.group('fullloc') is not None:
+          matches = int(token.group('count'))
+          # only emit a bed line if 
+          #  our current chromosome starts with chromosome pattern
+          if chr_prefix is None or cur_chr.startswith(chr_prefix):
+            start = int(token.group('start'))
+            stop = start + len(seq)
+            orientation = token.group('dir')
+            strand = sense_map[orientation]
+            color = sense_color[orientation]
+            # build up list of reads for this record
+            reads[matches].append((cur_chr, start, stop, strand, color))
+
+      # report up to our max_read threshold reporting the fewer-mismatch
+      # matches first
+      reported_reads = 0
+      keys = [0,1,2]
+      for mismatch, read_list in ((k, reads[k]) for k in keys): 
+        reported_reads += len(read_list)
+        if reported_reads <= max_reads:
+          for cur_chr, start, stop, strand, color in read_list:
+            reported_reads += 1
+            outstream.write('%s %d %d read 0 %s - - %s%s' % (
+                cur_chr,
+                start,
+                stop,
+                sense_map[orientation],
+                sense_color[orientation],
+                os.linesep
+            ))
+
 def make_description(database, flowcell_id, lane):
     """
     compute a bedfile name and description from the fctracker database