htsworkflow/util/makebed.py

   1 """
   2 Utility functions to make bedfiles.
   3 """
   4 import os
   5 import re
   6
   7 __docformat__ = "restructredtext en"
   8
   9 # map eland_result.txt sense
  10 sense_map = { 'F': '+', 'R': '-'}
  11 sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
  12
  13 def create_bed_header(name, description):
  14   """
  15   Produce the headerline for a bedfile
  16   """
  17   # provide default track names
  18   if name is None: name = "track"
  19   if description is None: description = "eland result file"
  20   bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"' % (name, description)
  21   bed_header += os.linesep
  22   return bed_header
  23
  24 def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
  25   """
  26   read an eland result file from instream and write a bedfile to outstream
  27
  28   :Parameters:
  29     - `instream`: stream containing the output from eland
  30     - `outstream`: stream to write the bed file too
  31     - `name`: name of bed-file (must be unique)
  32     - `description`: longer description of the bed file
  33     - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
  34   """
  35   for line in make_bed_from_eland_generator(instream, name, description, chromosome_prefix):
  36       outstream.write(line)
  37
  38 def make_bed_from_eland_generator(instream, name, description, chromosome_prefix='chr'):
  39   """
  40   read an eland result file from instream and write a bedfile to outstream
  41
  42   :Parameters:
  43     - `instream`: stream containing the output from eland
  44     - `name`: name of bed-file (must be unique)
  45     - `description`: longer description of the bed file
  46     - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
  47
  48   :Return: generator which yields lines of bedfile
  49   """
  50   # indexes into fields in eland_result.txt file
  51   SEQ = 1
  52   CHR = 6
  53   START = 7
  54   SENSE = 8
  55
  56   yield create_bed_header(name, description)
  57   prefix_len = len(chromosome_prefix)
  58
  59   for line in instream:
  60     fields = line.split()
  61     # we need more than the CHR field, and it needs to match a chromosome
  62     if len(fields) <= CHR or fields[CHR][:prefix_len] != chromosome_prefix:
  63       continue
  64     start = fields[START]
  65     stop = int(start) + len(fields[SEQ])
  66     # strip off filename extension
  67     chromosome = fields[CHR].split('.')[0]
  68
  69     yield '%s %s %d read 0 %s - - %s%s' % (
  70       chromosome,
  71       start,
  72       stop,
  73       sense_map[fields[SENSE]],
  74       sense_color[fields[SENSE]],
  75       os.linesep
  76     )
  77
  78 def make_bed_from_multi_eland_stream(
  79   instream,
  80   outstream,
  81   name,
  82   description,
  83   chr_prefix='chr',
  84   max_reads=255
  85   ):
  86   """
  87   read a multi eland result file from instream and write the bedfile to outstream
  88
  89   :Parameters:
  90     - `instream`: stream containing the output from eland
  91     - `outstream`: stream to write the bed file too
  92     - `name`: name of bed-file (must be unique)
  93     - `description`: longer description of the bed file
  94     - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
  95     - `max_reads`: maximum number of reads to write to bed stream
  96   """
  97   for lane in make_bed_from_multi_eland_generator(instream, name, description, chr_prefix, max_reads):
  98       outstream.write(lane)
  99
 100 def make_bed_from_multi_eland_generator(instream, name, description, chr_prefix, max_reads=255):
 101   loc_pattern = '(?P<fullloc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9AGCT]+))'
 102   other_pattern = '(?P<chr>[^:,]+)'
 103   split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
 104
 105   yield create_bed_header(name, description)
 106   for line in instream:
 107     rec = line.split()
 108     if len(rec) > 3:
 109       # colony_id = rec[0]
 110       seq = rec[1]
 111       # number of matches for 0, 1, and 2 mismatches
 112       # m0, m1, m2 = [int(x) for x in rec[2].split(':')]
 113       compressed_reads = rec[3]
 114       cur_chr = ""
 115       reads = {0: [], 1: [], 2:[]}
 116
 117       for token in split_re.finditer(compressed_reads):
 118         if token.group('chr') is not None:
 119           cur_chr =  token.group('chr')
 120           # strip off extension if present
 121           cur_chr = os.path.splitext(cur_chr)[0]
 122         elif token.group('fullloc') is not None:
 123           matches = int(token.group('count'))
 124           # only emit a bed line if
 125           #  our current chromosome starts with chromosome pattern
 126           if chr_prefix is None or cur_chr.startswith(chr_prefix):
 127             start = int(token.group('start'))
 128             stop = start + len(seq)
 129             orientation = token.group('dir')
 130             strand = sense_map[orientation]
 131             color = sense_color[orientation]
 132             # build up list of reads for this record
 133             reads[matches].append((cur_chr, start, stop, strand, color))
 134
 135       # report up to our max_read threshold reporting the fewer-mismatch
 136       # matches first
 137       reported_reads = 0
 138       keys = [0,1,2]
 139       for mismatch, read_list in ((k, reads[k]) for k in keys):
 140         reported_reads += len(read_list)
 141         if reported_reads <= max_reads:
 142           for cur_chr, start, stop, strand, color in read_list:
 143             reported_reads += 1
 144             yield '%s %d %d read 0 %s - - %s%s' % (
 145                 cur_chr,
 146                 start,
 147                 stop,
 148                 sense_map[orientation],
 149                 sense_color[orientation],
 150                 os.linesep
 151             )
 152
 153 def make_description(flowcell_id, lane):
 154     """
 155     compute a bedfile name and description from the django database
 156     """
 157     from htsworkflow.frontend.experiments import models as experiments
 158
 159     lane = int(lane)
 160     if lane < 1 or lane > 8:
 161       raise RuntimeError("flowcells only have lanes 1-8")
 162
 163     cell = experiments.FlowCell.objects.get(flowcell_id=flowcell_id)
 164
 165     name = "%s-%s" % (flowcell_id, lane)
 166
 167     cell_library = getattr(cell, 'lane_%d_library' %(lane,))
 168     cell_library_id = cell_library.library_id
 169     description = "%s-%s" % (cell_library.library_name, cell_library_id)
 170     return name, description