Loading really old run xml files caused the website to crash
[htsworkflow.git] / htsworkflow / pipelines / eland.py
index a010e1de069d179180fe1e2dc2d6dc905215733f..2e0d9dce3a907f97974c13a7f3edf49b07b85bd8 100644 (file)
@@ -8,36 +8,84 @@ import os
 import re
 import stat
 
-from htsworkflow.pipelines.runfolder import ElementTree
+from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
 from htsworkflow.util.ethelp import indent, flatten
 from htsworkflow.util.opener import autoopen
 
-class ElandLane(object):
+SAMPLE_NAME = 'SampleName'
+LANE_ID = 'LaneID'
+END = 'End'
+READS = 'Reads'
+
+GENOME_MAP = 'GenomeMap'
+GENOME_ITEM = 'GenomeItem'
+MAPPED_READS = 'MappedReads'
+MAPPED_ITEM = 'MappedItem'
+MATCH_CODES = 'MatchCodes'
+MATCH_ITEM = 'Code'
+READS = 'Reads'
+
+ELAND_SINGLE = 0
+ELAND_MULTI = 1
+ELAND_EXTENDED = 2
+ELAND_EXPORT = 3
+
+
+class ResultLane(object):
     """
-    Process an eland result file
+    Base class for result lanes
     """
-    XML_VERSION = 1
-    LANE = 'ElandLane'
-    SAMPLE_NAME = 'SampleName'
-    LANE_ID = 'LaneID'
-    GENOME_MAP = 'GenomeMap'
-    GENOME_ITEM = 'GenomeItem'
-    MAPPED_READS = 'MappedReads'
-    MAPPED_ITEM = 'MappedItem'
-    MATCH_CODES = 'MatchCodes'
-    MATCH_ITEM = 'Code'
-    READS = 'Reads'
-
-    ELAND_SINGLE = 0
-    ELAND_MULTI = 1
-    ELAND_EXTENDED = 2
-    ELAND_EXPORT = 3
-
-    def __init__(self, pathname=None, genome_map=None, eland_type=None, xml=None):
+    XML_VERSION = 2
+    LANE = 'ResultLane'
+
+    def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
         self.pathname = pathname
         self._sample_name = None
-        self._lane_id = None
+        self.lane_id = lane_id
+        self.end = end
         self._reads = None
+        
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _update(self):
+        """
+        Actually read the file and actually count the reads
+        """
+        raise NotImplementedError("Can't count abstract classes")
+
+    def _update_name(self):
+        # extract the sample name
+        if self.pathname is None:
+            return
+
+        path, name = os.path.split(self.pathname)
+        split_name = name.split('_')
+        self._sample_name = split_name[0]
+
+    def _get_sample_name(self):
+        if self._sample_name is None:
+            self._update_name()
+        return self._sample_name
+    sample_name = property(_get_sample_name)
+
+    def _get_reads(self):
+        if self._reads is None:
+            self._update()
+        return self._reads
+    reads = property(_get_reads)
+
+
+class ElandLane(ResultLane):
+    """
+    Process an eland result file
+    """
+    XML_VERSION = 2
+    LANE = "ElandLane"
+
+    def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
+        super(ElandLane, self).__init__(pathname, lane_id, end)
+
         self._mapped_reads = None
         self._match_codes = None
         if genome_map is None:
@@ -53,15 +101,15 @@ class ElandLane(object):
           # attempt autodetect eland file type
           pathn, name = os.path.split(pathname)
           if re.search('result', name):
-            self.eland_type = ElandLane.ELAND_SINGLE
+            self.eland_type = ELAND_SINGLE
           elif re.search('multi', name):
-            self.eland_type = ElandLane.ELAND_MULTI
+            self.eland_type = ELAND_MULTI
           elif re.search('extended', name):
-            self.eland_type = ElandLane.ELAND_EXTENDED
+            self.eland_type = ELAND_EXTENDED
           elif re.search('export', name):
-            self.eland_type = ElandLane.ELAND_EXPORT
+            self.eland_type = ELAND_EXPORT
           else:
-            self.eland_type = ElandLane.ELAND_SINGLE
+            self.eland_type = ELAND_SINGLE
 
     def _update(self):
         """
@@ -77,10 +125,10 @@ class ElandLane(object):
 
         logging.info("summarizing results for %s" % (self.pathname))
 
-        if self.eland_type == ElandLane.ELAND_SINGLE:
+        if self.eland_type == ELAND_SINGLE:
           result = self._update_eland_result(self.pathname)
-        elif self.eland_type == ElandLane.ELAND_MULTI or \
-             self.eland_type == ElandLane.ELAND_EXTENDED:
+        elif self.eland_type == ELAND_MULTI or \
+             self.eland_type == ELAND_EXTENDED:
           result = self._update_eland_multi(self.pathname)
         else:
           raise NotImplementedError("Only support single/multi/extended eland files")
@@ -125,9 +173,11 @@ class ElandLane(object):
             if groups is None:
                 match_codes[fields[2]] += 1
             else:
-                # when there are too many hit, eland writes a - where
-                # it would have put the list of hits
-                if fields[3] == '-':
+                # when there are too many hit, eland  writes a - where
+                # it would have put the list of hits.
+                # or in a different version of eland, it just leaves
+                # that column blank, and only outputs 3 fields.     
+                if len(fields) < 4 or fields[3] == '-':
                   continue
                 zero_mismatches = int(groups.group(1))
                 if zero_mismatches == 1:
@@ -159,34 +209,6 @@ class ElandLane(object):
                   mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
         return match_codes, mapped_reads, reads
 
-    def _update_name(self):
-        # extract the sample name
-        if self.pathname is None:
-            return
-
-        path, name = os.path.split(self.pathname)
-        split_name = name.split('_')
-        self._sample_name = split_name[0]
-        self._lane_id = int(split_name[1])
-
-    def _get_sample_name(self):
-        if self._sample_name is None:
-            self._update_name()
-        return self._sample_name
-    sample_name = property(_get_sample_name)
-
-    def _get_lane_id(self):
-        if self._lane_id is None:
-            self._update_name()
-        return self._lane_id
-    lane_id = property(_get_lane_id)
-
-    def _get_reads(self):
-        if self._reads is None:
-            self._update()
-        return self._reads
-    reads = property(_get_reads)
-
     def _get_mapped_reads(self):
         if self._mapped_reads is None:
             self._update()
@@ -199,30 +221,77 @@ class ElandLane(object):
         return self._match_codes
     match_codes = property(_get_match_codes)
 
+    def _get_no_match(self):
+        if self._mapped_reads is None:
+            self._update()  
+        return self._match_codes['NM']
+    no_match = property(_get_no_match, 
+                        doc="total reads that didn't match the target genome.")
+
+    def _get_no_match_percent(self):
+        return float(self.no_match)/self.reads * 100 
+    no_match_percent = property(_get_no_match_percent,
+                                doc="no match reads as percent of total")
+
+    def _get_qc_failed(self):
+        if self._mapped_reads is None:
+            self._update()  
+        return self._match_codes['QC']
+    qc_failed = property(_get_qc_failed,
+                        doc="total reads that didn't match the target genome.")
+
+    def _get_qc_failed_percent(self):
+        return float(self.qc_failed)/self.reads * 100 
+    qc_failed_percent = property(_get_qc_failed_percent,
+                                 doc="QC failed reads as percent of total")
+
+    def _get_unique_reads(self):
+        if self._mapped_reads is None:
+           self._update()
+        sum = 0
+        for code in ['U0','U1','U2']:
+            sum += self._match_codes[code]
+        return sum
+    unique_reads = property(_get_unique_reads,
+                            doc="total unique reads")
+
+    def _get_repeat_reads(self):
+        if self._mapped_reads is None:
+           self._update()
+        sum = 0
+        for code in ['R0','R1','R2']:
+            sum += self._match_codes[code]
+        return sum
+    repeat_reads = property(_get_repeat_reads,
+                            doc="total repeat reads")
+    
     def get_elements(self):
         lane = ElementTree.Element(ElandLane.LANE,
                                    {'version':
                                     unicode(ElandLane.XML_VERSION)})
-        sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
+        sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
         sample_tag.text = self.sample_name
-        lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
+        lane_tag = ElementTree.SubElement(lane, LANE_ID)
         lane_tag.text = str(self.lane_id)
-        genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
+        if self.end is not None:
+            end_tag = ElementTree.SubElement(lane, END)
+            end_tag.text = str(self.end)
+        genome_map = ElementTree.SubElement(lane, GENOME_MAP)
         for k, v in self.genome_map.items():
             item = ElementTree.SubElement(
-                genome_map, ElandLane.GENOME_ITEM,
+                genome_map, GENOME_ITEM,
                 {'name':k, 'value':unicode(v)})
-        mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
+        mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
         for k, v in self.mapped_reads.items():
             item = ElementTree.SubElement(
-                mapped_reads, ElandLane.MAPPED_ITEM,
+                mapped_reads, MAPPED_ITEM,
                 {'name':k, 'value':unicode(v)})
-        match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
+        match_codes = ElementTree.SubElement(lane, MATCH_CODES)
         for k, v in self.match_codes.items():
             item = ElementTree.SubElement(
-                match_codes, ElandLane.MATCH_ITEM,
+                match_codes, MATCH_ITEM,
                 {'name':k, 'value':unicode(v)})
-        reads = ElementTree.SubElement(lane, ElandLane.READS)
+        reads = ElementTree.SubElement(lane, READS)
         reads.text = unicode(self.reads)
 
         return lane
@@ -237,69 +306,160 @@ class ElandLane(object):
 
         for element in tree:
             tag = element.tag.lower()
-            if tag == ElandLane.SAMPLE_NAME.lower():
+            if tag == SAMPLE_NAME.lower():
                 self._sample_name = element.text
-            elif tag == ElandLane.LANE_ID.lower():
-                self._lane_id = int(element.text)
-            elif tag == ElandLane.GENOME_MAP.lower():
+            elif tag == LANE_ID.lower():
+                self.lane_id = int(element.text)
+            elif tag == END.lower():
+                self.end = int(element.text)
+            elif tag == GENOME_MAP.lower():
                 for child in element:
                     name = child.attrib['name']
                     value = child.attrib['value']
                     self.genome_map[name] = value
-            elif tag == ElandLane.MAPPED_READS.lower():
+            elif tag == MAPPED_READS.lower():
                 for child in element:
                     name = child.attrib['name']
                     value = child.attrib['value']
                     self._mapped_reads[name] = int(value)
-            elif tag == ElandLane.MATCH_CODES.lower():
+            elif tag == MATCH_CODES.lower():
                 for child in element:
                     name = child.attrib['name']
                     value = int(child.attrib['value'])
                     self._match_codes[name] = value
-            elif tag == ElandLane.READS.lower():
+            elif tag == READS.lower():
                 self._reads = int(element.text)
             else:
                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
 
+class SequenceLane(ResultLane):
+    XML_VERSION=1
+    LANE = 'SequenceLane'
+    SEQUENCE_TYPE = 'SequenceType'
+
+    NONE_TYPE = None
+    SCARF_TYPE = 1
+    FASTQ_TYPE = 2
+    SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
+
+    def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
+        self.sequence_type = None
+        super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
+
+    def _guess_sequence_type(self, pathname):
+        """
+        Determine if we have a scarf or fastq sequence file
+        """
+        f = open(pathname,'r')
+        l = f.readline()
+        f.close()
+
+        if l[0] == '@':
+            # fastq starts with a @
+            self.sequence_type = SequenceLane.FASTQ_TYPE
+        else:
+            self.sequence_type = SequenceLane.SCARF_TYPE
+        return self.sequence_type
+
+    def _update(self):
+        """
+        Actually read the file and actually count the reads
+        """
+        # can't do anything if we don't have a file to process
+        if self.pathname is None:
+            return
+
+        if os.stat(self.pathname)[stat.ST_SIZE] == 0:
+            raise RuntimeError("Sequencing isn't done, try again later.")
+
+        self._guess_sequence_type(self.pathname)
+
+        logging.info("summarizing results for %s" % (self.pathname))
+        lines = 0
+        f = open(self.pathname)
+        for l in f.xreadlines():
+            lines += 1
+        f.close()
+
+        if self.sequence_type == SequenceLane.SCARF_TYPE:
+            self._reads = lines
+        elif self.sequence_type == SequenceLane.FASTQ_TYPE:
+            self._reads = lines / 4
+        else:
+          raise NotImplementedError("This only supports scarf or fastq squence files")
+
+    def get_elements(self):
+        lane = ElementTree.Element(SequenceLane.LANE,
+                                   {'version':
+                                    unicode(SequenceLane.XML_VERSION)})
+        sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
+        sample_tag.text = self.sample_name
+        lane_tag = ElementTree.SubElement(lane, LANE_ID)
+        lane_tag.text = str(self.lane_id)
+        if self.end is not None:
+            end_tag = ElementTree.SubElement(lane, END)
+            end_tag.text = str(self.end)
+        reads = ElementTree.SubElement(lane, READS)
+        reads.text = unicode(self.reads)
+        sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
+        sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
+
+        return lane
+
+    def set_elements(self, tree):
+        if tree.tag != SequenceLane.LANE:
+            raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
+        lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
+
+        for element in tree:
+            tag = element.tag.lower()
+            if tag == SAMPLE_NAME.lower():
+                self._sample_name = element.text
+            elif tag == LANE_ID.lower():
+                self.lane_id = int(element.text)
+            elif tag == END.lower():
+                self.end = int(element.text)
+            elif tag == READS.lower():
+                self._reads = int(element.text)
+            elif tag == SequenceLane.SEQUENCE_TYPE.lower():
+                self.sequence_type = lookup_sequence_type.get(element.text, None)
+            else:
+                logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
+
 class ELAND(object):
     """
     Summarize information from eland files
     """
-    XML_VERSION = 1
+    XML_VERSION = 3
 
     ELAND = 'ElandCollection'
     LANE = 'Lane'
     LANE_ID = 'id'
+    END = 'end'
 
     def __init__(self, xml=None):
         # we need information from the gerald config.xml
-        self.results = {}
+        self.results = [{},{}]
 
         if xml is not None:
             self.set_elements(xml)
 
-    def __len__(self):
-        return len(self.results)
+        if len(self.results[0]) == 0:
+            # Initialize our eland object with meaningless junk
+            for l in  LANE_LIST:
+                self.results[0][l] = ResultLane(lane_id=l, end=0)
 
-    def keys(self):
-        return self.results.keys()
-
-    def values(self):
-        return self.results.values()
-
-    def items(self):
-        return self.results.items()
-
-    def __getitem__(self, key):
-        return self.results[key]
 
     def get_elements(self):
         root = ElementTree.Element(ELAND.ELAND,
                                    {'version': unicode(ELAND.XML_VERSION)})
-        for lane_id, lane in self.results.items():
-            eland_lane = lane.get_elements()
-            eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
-            root.append(eland_lane)
+        for end in range(len(self.results)):
+           end_results = self.results[end]
+           for lane_id, lane in end_results.items():
+                eland_lane = lane.get_elements()
+                eland_lane.attrib[ELAND.END] = unicode (end)
+                eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
+                root.append(eland_lane)
         return root
 
     def set_elements(self, tree):
@@ -307,64 +467,114 @@ class ELAND(object):
             raise ValueError('Expecting %s', ELAND.ELAND)
         for element in list(tree):
             lane_id = int(element.attrib[ELAND.LANE_ID])
-            lane = ElandLane(xml=element)
-            self.results[lane_id] = lane
+            end = int(element.attrib.get(ELAND.END, 0)) 
+            if element.tag.lower() == ElandLane.LANE.lower():
+                lane = ElandLane(xml=element)
+            elif element.tag.lower() == SequenceLane.LANE.lower():
+                lane = SequenceLane(xml=element)
+
+            self.results[end][lane_id] = lane
 
-def check_for_eland_file(basedir, lane_id, pattern):
-   basename = pattern % (lane_id,)
+def check_for_eland_file(basedir, pattern, lane_id, end):
+   if end is None:
+      full_lane_id = lane_id
+   else:
+      full_lane_id = "%d_%d" % ( lane_id, end )
+
+   basename = pattern % (full_lane_id,)
    pathname = os.path.join(basedir, basename)
    if os.path.exists(pathname):
+       logging.info('found eland file in %s' % (pathname,))
        return pathname
    else:
        return None
 
-def eland(basedir, gerald=None, genome_maps=None):
+def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
+    # yes the lane_id is also being computed in ElandLane._update
+    # I didn't want to clutter up my constructor
+    # but I needed to persist the sample_name/lane_id for
+    # runfolder summary_report
+    path, name = os.path.split(pathname)
+    logging.info("Adding eland file %s" %(name,))
+    # split_name = name.split('_')
+    # lane_id = int(split_name[1])
+
+    if genome_maps is not None:
+        genome_map = genome_maps[lane_id]
+    elif gerald is not None:
+        genome_dir = gerald.lanes[lane_id].eland_genome
+        genome_map = build_genome_fasta_map(genome_dir)
+    else:
+        genome_map = {}
+
+    lane = ElandLane(pathname, lane_id, end, genome_map)
+    
+    if end is None:
+        effective_end =  0
+    else:
+        effective_end = end - 1
+
+    results[effective_end][lane_id] = lane
+
+def update_result_with_sequence(gerald, results, lane_id, end, pathname):
+    result = SequenceLane(pathname, lane_id, end)
+
+    if end is None:
+        effective_end =  0
+    else:
+        effective_end = end - 1
+
+    results[effective_end][lane_id] = result
+
+
+def eland(gerald_dir, gerald=None, genome_maps=None):
     e = ELAND()
 
-    file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
-    if len(file_list) == 0:
-        # lets handle compressed eland files too
-        file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
-
     lane_ids = range(1,9)
+    ends = [None, 1, 2]
+
+    basedirs = [gerald_dir]
+
+    # if there is a basedir/Temp change basedir to point to the temp
+    # directory, as 1.1rc1 moves most of the files we've historically
+    # cared about to that subdirectory.
+    # we should look into what the official 'result' files are.
+    # and 1.3 moves them back
+    basedir_temp = os.path.join(gerald_dir, 'Temp')
+    if os.path.isdir(basedir_temp):
+        basedirs.append(basedir_temp)
+
+   
     # the order in patterns determines the preference for what
     # will be found.
-    patterns = ['s_%d_eland_result.txt',
-                's_%d_eland_result.txt.bz2',
-                's_%d_eland_result.txt.gz',
-                's_%d_eland_extended.txt',
-                's_%d_eland_extended.txt.bz2',
-                's_%d_eland_extended.txt.gz',
-                's_%d_eland_multi.txt',
-                's_%d_eland_multi.txt.bz2',
-                's_%d_eland_multi.txt.gz',]
-
-    for lane_id in lane_ids:
-        for p in patterns:
-            pathname = check_for_eland_file(basedir, lane_id, p)
-            if pathname is not None:
-              break
-        else:
-            continue
-        # yes the lane_id is also being computed in ElandLane._update
-        # I didn't want to clutter up my constructor
-        # but I needed to persist the sample_name/lane_id for
-        # runfolder summary_report
-        path, name = os.path.split(pathname)
-        logging.info("Adding eland file %s" %(name,))
-        split_name = name.split('_')
-        lane_id = int(split_name[1])
-
-        if genome_maps is not None:
-            genome_map = genome_maps[lane_id]
-        elif gerald is not None:
-            genome_dir = gerald.lanes[lane_id].eland_genome
-            genome_map = build_genome_fasta_map(genome_dir)
-        else:
-            genome_map = {}
+    MAPPED_ELAND = 0
+    SEQUENCE = 1
+    patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
+                ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
+                ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
+                ('s_%s_eland_extended.txt', MAPPED_ELAND),
+                ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
+                ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
+                ('s_%s_eland_multi.txt', MAPPED_ELAND),
+                ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
+                ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
+                ('s_%s_sequence.txt', SEQUENCE),]
+
+    for basedir in basedirs:
+        for end in ends:
+            for lane_id in lane_ids:
+                for p in patterns:
+                    pathname = check_for_eland_file(basedir, p[0], lane_id, end)
+                    if pathname is not None:
+                        if p[1] == MAPPED_ELAND:
+                            update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
+                        elif p[1] == SEQUENCE:
+                            update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
+                        break
+                else:
+                    logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
+                    continue
 
-        eland_result = ElandLane(pathname, genome_map)
-        e.results[lane_id] = eland_result
     return e
 
 def build_genome_fasta_map(genome_dir):