htsworkflow/util/makebed.py

   1 """
   2 Utility functions to make bedfiles.
   3 """
   4 import os
   5 import re
   6
   7 # map eland_result.txt sense
   8 sense_map = { 'F': '+', 'R': '-'}
   9 sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
  10
  11 def write_bed_header(outstream, name, description):
  12   """
  13   Produce the headerline for a bedfile
  14   """
  15   # provide default track names
  16   if name is None: name = "track"
  17   if description is None: description = "eland result file"
  18   bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
  19   bed_header += os.linesep
  20   outstream.write(bed_header % (name, description))
  21
  22 def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
  23   """
  24   read an eland result file from instream and write a bedfile to outstream
  25   """
  26   # indexes into fields in eland_result.txt file
  27   SEQ = 1
  28   CHR = 6
  29   START = 7
  30   SENSE = 8
  31
  32   write_bed_header(outstream, name, description)
  33   prefix_len = len(chromosome_prefix)
  34
  35   for line in instream:
  36     fields = line.split()
  37     # we need more than the CHR field, and it needs to match a chromosome
  38     if len(fields) <= CHR or fields[CHR][:prefix_len] != chromosome_prefix:
  39       continue
  40     start = fields[START]
  41     stop = int(start) + len(fields[SEQ])
  42     # strip off filename extension
  43     chromosome = fields[CHR].split('.')[0]
  44
  45     outstream.write('%s %s %d read 0 %s - - %s%s' % (
  46       chromosome,
  47       start,
  48       stop,
  49       sense_map[fields[SENSE]],
  50       sense_color[fields[SENSE]],
  51       os.linesep
  52     ))
  53
  54
  55 def make_bed_from_multi_eland_stream(
  56   instream,
  57   outstream,
  58   name,
  59   description,
  60   chr_prefix='chr',
  61   max_reads=255
  62   ):
  63   """
  64   read a multi eland stream and write a bedfile
  65   """
  66   write_bed_header(outstream, name, description)
  67   parse_multi_eland(instream, outstream, chr_prefix, max_reads)
  68
  69 def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
  70
  71   loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9]+))'
  72   other_pattern = '(?P<chr>[^:,]+)'
  73   split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
  74
  75   for line in instream:
  76     rec = line.split()
  77     if len(rec) > 3:
  78       # colony_id = rec[0]
  79       seq = rec[1]
  80       # number of matches for 0, 1, and 2 mismatches
  81       # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
  82       compressed_reads = rec[3]
  83       cur_chr = ""
  84       reads = {0: [], 1: [], 2:[]}
  85
  86       for token in split_re.finditer(compressed_reads):
  87         if token.group('chr') is not None:
  88           cur_chr =  token.group('chr')
  89           # strip off extension if present
  90           cur_chr = os.path.splitext(cur_chr)[0]
  91         elif token.group('fullloc') is not None:
  92           matches = int(token.group('count'))
  93           # only emit a bed line if
  94           #  our current chromosome starts with chromosome pattern
  95           if chr_prefix is None or cur_chr.startswith(chr_prefix):
  96             start = int(token.group('start'))
  97             stop = start + len(seq)
  98             orientation = token.group('dir')
  99             strand = sense_map[orientation]
 100             color = sense_color[orientation]
 101             # build up list of reads for this record
 102             reads[matches].append((cur_chr, start, stop, strand, color))
 103
 104       # report up to our max_read threshold reporting the fewer-mismatch
 105       # matches first
 106       reported_reads = 0
 107       keys = [0,1,2]
 108       for mismatch, read_list in ((k, reads[k]) for k in keys):
 109         reported_reads += len(read_list)
 110         if reported_reads <= max_reads:
 111           for cur_chr, start, stop, strand, color in read_list:
 112             reported_reads += 1
 113             outstream.write('%s %d %d read 0 %s - - %s%s' % (
 114                 cur_chr,
 115                 start,
 116                 stop,
 117                 sense_map[orientation],
 118                 sense_color[orientation],
 119                 os.linesep
 120             ))
 121
 122 def make_description(database, flowcell_id, lane):
 123     """
 124     compute a bedfile name and description from the fctracker database
 125     """
 126     from htsworkflow.util.fctracker import fctracker
 127
 128     fc = fctracker(database)
 129     cells = fc._get_flowcells("where flowcell_id='%s'" % (flowcell_id))
 130     if len(cells) != 1:
 131       raise RuntimeError("couldn't find flowcell id %s" % (flowcell_id))
 132     lane = int(lane)
 133     if lane < 1 or lane > 8:
 134       raise RuntimeError("flowcells only have lanes 1-8")
 135
 136     name = "%s-%s" % (flowcell_id, lane)
 137
 138     cell_id, cell = cells.items()[0]
 139     assert cell_id == flowcell_id
 140
 141     cell_library_id = cell['lane_%d_library_id' %(lane,)]
 142     cell_library = cell['lane_%d_library' %(lane,)]
 143     description = "%s-%s" % (cell_library['library_name'], cell_library_id)
 144     return name, description