Handle lanes that were only sequenced.
authorDiane Trout <diane@caltech.edu>
Tue, 5 May 2009 18:02:20 +0000 (18:02 +0000)
committerDiane Trout <diane@caltech.edu>
Tue, 5 May 2009 18:02:20 +0000 (18:02 +0000)
So the report needs to be a bit smaller, and I need to archive a different file
Also this version only counts the number of records in the "sequence" file
which is raw sequence that passed the QC filter, I don't have a uniform way
of determining how many total sequences thre were.

htsworkflow/pipelines/eland.py
htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/test/simulate_runfolder.py
htsworkflow/pipelines/test/test_runfolder_ipar130.py

index f9adcd85bbdad0f82cf33672498e4a254b66cfd6..bd6864e984ce3328534611ebf427b906f5bc0e6a 100644 (file)
@@ -12,34 +12,80 @@ from htsworkflow.pipelines.runfolder import ElementTree
 from htsworkflow.util.ethelp import indent, flatten
 from htsworkflow.util.opener import autoopen
 
-class ElandLane(object):
+SAMPLE_NAME = 'SampleName'
+LANE_ID = 'LaneID'
+END = 'End'
+READS = 'Reads'
+
+GENOME_MAP = 'GenomeMap'
+GENOME_ITEM = 'GenomeItem'
+MAPPED_READS = 'MappedReads'
+MAPPED_ITEM = 'MappedItem'
+MATCH_CODES = 'MatchCodes'
+MATCH_ITEM = 'Code'
+READS = 'Reads'
+
+ELAND_SINGLE = 0
+ELAND_MULTI = 1
+ELAND_EXTENDED = 2
+ELAND_EXPORT = 3
+
+
+class ResultLane(object):
     """
-    Process an eland result file
+    Base class for result lanes
     """
     XML_VERSION = 2
-    LANE = 'ElandLane'
-    SAMPLE_NAME = 'SampleName'
-    LANE_ID = 'LaneID'
-    END = 'End'
-    GENOME_MAP = 'GenomeMap'
-    GENOME_ITEM = 'GenomeItem'
-    MAPPED_READS = 'MappedReads'
-    MAPPED_ITEM = 'MappedItem'
-    MATCH_CODES = 'MatchCodes'
-    MATCH_ITEM = 'Code'
-    READS = 'Reads'
-
-    ELAND_SINGLE = 0
-    ELAND_MULTI = 1
-    ELAND_EXTENDED = 2
-    ELAND_EXPORT = 3
+    LANE = 'ResultLane'
 
-    def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
+    def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
         self.pathname = pathname
         self._sample_name = None
         self.lane_id = lane_id
         self.end = end
         self._reads = None
+        
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _update(self):
+        """
+        Actually read the file and actually count the reads
+        """
+        raise NotImplementedError("Can't count abstract classes")
+
+    def _update_name(self):
+        # extract the sample name
+        if self.pathname is None:
+            return
+
+        path, name = os.path.split(self.pathname)
+        split_name = name.split('_')
+        self._sample_name = split_name[0]
+
+    def _get_sample_name(self):
+        if self._sample_name is None:
+            self._update_name()
+        return self._sample_name
+    sample_name = property(_get_sample_name)
+
+    def _get_reads(self):
+        if self._reads is None:
+            self._update()
+        return self._reads
+    reads = property(_get_reads)
+
+
+class ElandLane(ResultLane):
+    """
+    Process an eland result file
+    """
+    XML_VERSION = 2
+    LANE = "ElandLane"
+
+    def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
+        super(ElandLane, self).__init__(pathname, lane_id, xml)
+
         self._mapped_reads = None
         self._match_codes = None
         if genome_map is None:
@@ -55,15 +101,15 @@ class ElandLane(object):
           # attempt autodetect eland file type
           pathn, name = os.path.split(pathname)
           if re.search('result', name):
-            self.eland_type = ElandLane.ELAND_SINGLE
+            self.eland_type = ELAND_SINGLE
           elif re.search('multi', name):
-            self.eland_type = ElandLane.ELAND_MULTI
+            self.eland_type = ELAND_MULTI
           elif re.search('extended', name):
-            self.eland_type = ElandLane.ELAND_EXTENDED
+            self.eland_type = ELAND_EXTENDED
           elif re.search('export', name):
-            self.eland_type = ElandLane.ELAND_EXPORT
+            self.eland_type = ELAND_EXPORT
           else:
-            self.eland_type = ElandLane.ELAND_SINGLE
+            self.eland_type = ELAND_SINGLE
 
     def _update(self):
         """
@@ -79,10 +125,10 @@ class ElandLane(object):
 
         logging.info("summarizing results for %s" % (self.pathname))
 
-        if self.eland_type == ElandLane.ELAND_SINGLE:
+        if self.eland_type == ELAND_SINGLE:
           result = self._update_eland_result(self.pathname)
-        elif self.eland_type == ElandLane.ELAND_MULTI or \
-             self.eland_type == ElandLane.ELAND_EXTENDED:
+        elif self.eland_type == ELAND_MULTI or \
+             self.eland_type == ELAND_EXTENDED:
           result = self._update_eland_multi(self.pathname)
         else:
           raise NotImplementedError("Only support single/multi/extended eland files")
@@ -163,27 +209,6 @@ class ElandLane(object):
                   mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
         return match_codes, mapped_reads, reads
 
-    def _update_name(self):
-        # extract the sample name
-        if self.pathname is None:
-            return
-
-        path, name = os.path.split(self.pathname)
-        split_name = name.split('_')
-        self._sample_name = split_name[0]
-
-    def _get_sample_name(self):
-        if self._sample_name is None:
-            self._update_name()
-        return self._sample_name
-    sample_name = property(_get_sample_name)
-
-    def _get_reads(self):
-        if self._reads is None:
-            self._update()
-        return self._reads
-    reads = property(_get_reads)
-
     def _get_mapped_reads(self):
         if self._mapped_reads is None:
             self._update()
@@ -244,29 +269,29 @@ class ElandLane(object):
         lane = ElementTree.Element(ElandLane.LANE,
                                    {'version':
                                     unicode(ElandLane.XML_VERSION)})
-        sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
+        sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
         sample_tag.text = self.sample_name
-        lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
+        lane_tag = ElementTree.SubElement(lane, LANE_ID)
         lane_tag.text = str(self.lane_id)
         if self.end is not None:
-            end_tag = ElementTree.SubElement(lane, ElandLane.END)
+            end_tag = ElementTree.SubElement(lane, END)
             end_tag.text = str(self.end)
-        genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
+        genome_map = ElementTree.SubElement(lane, GENOME_MAP)
         for k, v in self.genome_map.items():
             item = ElementTree.SubElement(
-                genome_map, ElandLane.GENOME_ITEM,
+                genome_map, GENOME_ITEM,
                 {'name':k, 'value':unicode(v)})
-        mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
+        mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
         for k, v in self.mapped_reads.items():
             item = ElementTree.SubElement(
-                mapped_reads, ElandLane.MAPPED_ITEM,
+                mapped_reads, MAPPED_ITEM,
                 {'name':k, 'value':unicode(v)})
-        match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
+        match_codes = ElementTree.SubElement(lane, MATCH_CODES)
         for k, v in self.match_codes.items():
             item = ElementTree.SubElement(
-                match_codes, ElandLane.MATCH_ITEM,
+                match_codes, MATCH_ITEM,
                 {'name':k, 'value':unicode(v)})
-        reads = ElementTree.SubElement(lane, ElandLane.READS)
+        reads = ElementTree.SubElement(lane, READS)
         reads.text = unicode(self.reads)
 
         return lane
@@ -281,37 +306,132 @@ class ElandLane(object):
 
         for element in tree:
             tag = element.tag.lower()
-            if tag == ElandLane.SAMPLE_NAME.lower():
+            if tag == SAMPLE_NAME.lower():
                 self._sample_name = element.text
-            elif tag == ElandLane.LANE_ID.lower():
+            elif tag == LANE_ID.lower():
                 self.lane_id = int(element.text)
-            elif tag == ElandLane.END.lower():
+            elif tag == END.lower():
                 self.end = int(element.text)
-            elif tag == ElandLane.GENOME_MAP.lower():
+            elif tag == GENOME_MAP.lower():
                 for child in element:
                     name = child.attrib['name']
                     value = child.attrib['value']
                     self.genome_map[name] = value
-            elif tag == ElandLane.MAPPED_READS.lower():
+            elif tag == MAPPED_READS.lower():
                 for child in element:
                     name = child.attrib['name']
                     value = child.attrib['value']
                     self._mapped_reads[name] = int(value)
-            elif tag == ElandLane.MATCH_CODES.lower():
+            elif tag == MATCH_CODES.lower():
                 for child in element:
                     name = child.attrib['name']
                     value = int(child.attrib['value'])
                     self._match_codes[name] = value
-            elif tag == ElandLane.READS.lower():
+            elif tag == READS.lower():
                 self._reads = int(element.text)
             else:
                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
 
+class SequenceLane(ResultLane):
+    XML_VERSION=1
+    LANE = 'SequenceLane'
+    SEQUENCE_TYPE = 'SequenceType'
+
+    NONE_TYPE = None
+    SCARF_TYPE = 1
+    FASTQ_TYPE = 2
+    SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
+
+    def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
+        self.sequence_type = None
+        super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
+
+    def _guess_sequence_type(self, pathname):
+        """
+        Determine if we have a scarf or fastq sequence file
+        """
+        f = open(pathname,'r')
+        l = f.readline()
+        f.close()
+
+        if l[0] == '@':
+            # fastq starts with a @
+            self.sequence_type = SequenceLane.FASTQ_TYPE
+        else:
+            self.sequence_type = SequenceLane.SCARF_TYPE
+        return self.sequence_type
+
+    def _update(self):
+        """
+        Actually read the file and actually count the reads
+        """
+        # can't do anything if we don't have a file to process
+        if self.pathname is None:
+            return
+
+        if os.stat(self.pathname)[stat.ST_SIZE] == 0:
+            raise RuntimeError("Sequencing isn't done, try again later.")
+
+        self._guess_sequence_type(self.pathname)
+
+        logging.info("summarizing results for %s" % (self.pathname))
+        lines = 0
+        f = open(self.pathname)
+        for l in f.xreadlines():
+            lines += 1
+        f.close()
+
+        if self.sequence_type == SequenceLane.SCARF_TYPE:
+            self._reads = lines
+        elif self.sequence_type == SequenceLane.FASTQ_TYPE:
+            self._reads = lines / 4
+        else:
+          raise NotImplementedError("This only supports scarf or fastq squence files")
+
+    def get_elements(self):
+        lane = ElementTree.Element(SequenceLane.LANE,
+                                   {'version':
+                                    unicode(SequenceLane.XML_VERSION)})
+        sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
+        sample_tag.text = self.sample_name
+        lane_tag = ElementTree.SubElement(lane, LANE_ID)
+        lane_tag.text = str(self.lane_id)
+        if self.end is not None:
+            end_tag = ElementTree.SubElement(lane, END)
+            end_tag.text = str(self.end)
+        reads = ElementTree.SubElement(lane, READS)
+        reads.text = unicode(self.reads)
+        sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
+        sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
+
+        return lane
+
+    def set_elements(self, tree):
+        if tree.tag != SequenceLane.LANE:
+            raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
+        lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
+
+        for element in tree:
+            tag = element.tag.lower()
+            if tag == SAMPLE_NAME.lower():
+                self._sample_name = element.text
+            elif tag == LANE_ID.lower():
+                self.lane_id = int(element.text)
+            elif tag == END.lower():
+                self.end = int(element.text)
+            elif tag == READS.lower():
+                self._reads = int(element.text)
+            elif tag == SequenceLane.SEQUENCE_TYPE.lower():
+                self.sequence_type = lookup_sequence_type.get(element.text, None)
+                print self.sequence_type
+            else:
+                logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
+
 class ELAND(object):
     """
     Summarize information from eland files
     """
-    XML_VERSION = 2
+    XML_VERSION = 3
 
     ELAND = 'ElandCollection'
     LANE = 'Lane'
@@ -343,7 +463,11 @@ class ELAND(object):
         for element in list(tree):
             lane_id = int(element.attrib[ELAND.LANE_ID])
             end = int(element.attrib.get(ELAND.END, 0)) 
-            lane = ElandLane(xml=element)
+            if element.tag.lower() == ElandLane.LANE.lower():
+                lane = ElandLane(xml=element)
+            elif element.tag.lower() == SequenceLane.LANE.lower():
+                lane = SequenceLane(xml=element)
+
             self.results[end][lane_id] = lane
 
 def check_for_eland_file(basedir, pattern, lane_id, end):
@@ -360,6 +484,44 @@ def check_for_eland_file(basedir, pattern, lane_id, end):
    else:
        return None
 
+def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
+    # yes the lane_id is also being computed in ElandLane._update
+    # I didn't want to clutter up my constructor
+    # but I needed to persist the sample_name/lane_id for
+    # runfolder summary_report
+    path, name = os.path.split(pathname)
+    logging.info("Adding eland file %s" %(name,))
+    # split_name = name.split('_')
+    # lane_id = int(split_name[1])
+
+    if genome_maps is not None:
+        genome_map = genome_maps[lane_id]
+    elif gerald is not None:
+        genome_dir = gerald.lanes[lane_id].eland_genome
+        genome_map = build_genome_fasta_map(genome_dir)
+    else:
+        genome_map = {}
+
+    lane = ElandLane(pathname, lane_id, end, genome_map)
+    
+    if end is None:
+        effective_end =  0
+    else:
+        effective_end = end - 1
+
+    results[effective_end][lane_id] = lane
+
+def update_result_with_sequence(gerald, results, lane_id, end, pathname):
+    result = SequenceLane(pathname, lane_id, end)
+
+    if end is None:
+        effective_end =  0
+    else:
+        effective_end = end - 1
+
+    results[effective_end][lane_id] = result
+
+
 def eland(gerald_dir, gerald=None, genome_maps=None):
     e = ELAND()
 
@@ -380,50 +542,34 @@ def eland(gerald_dir, gerald=None, genome_maps=None):
    
     # the order in patterns determines the preference for what
     # will be found.
-    patterns = ['s_%s_eland_result.txt',
-                's_%s_eland_result.txt.bz2',
-                's_%s_eland_result.txt.gz',
-                's_%s_eland_extended.txt',
-                's_%s_eland_extended.txt.bz2',
-                's_%s_eland_extended.txt.gz',
-                's_%s_eland_multi.txt',
-                's_%s_eland_multi.txt.bz2',
-                's_%s_eland_multi.txt.gz',]
+    MAPPED_ELAND = 0
+    SEQUENCE = 1
+    patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
+                ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
+                ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
+                ('s_%s_eland_extended.txt', MAPPED_ELAND),
+                ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
+                ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
+                ('s_%s_eland_multi.txt', MAPPED_ELAND),
+                ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
+                ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
+                ('s_%s_sequence.txt', SEQUENCE),]
 
     for basedir in basedirs:
         for end in ends:
             for lane_id in lane_ids:
                 for p in patterns:
-                    pathname = check_for_eland_file(basedir, p, lane_id, end)
+                    pathname = check_for_eland_file(basedir, p[0], lane_id, end)
                     if pathname is not None:
-                      break
+                        if p[1] == MAPPED_ELAND:
+                            update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
+                        elif p[1] == SEQUENCE:
+                            update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
+                        break
                 else:
                     logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
                     continue
 
-                # yes the lane_id is also being computed in ElandLane._update
-                # I didn't want to clutter up my constructor
-                # but I needed to persist the sample_name/lane_id for
-                # runfolder summary_report
-                path, name = os.path.split(pathname)
-                logging.info("Adding eland file %s" %(name,))
-                # split_name = name.split('_')
-                # lane_id = int(split_name[1])
-    
-                if genome_maps is not None:
-                    genome_map = genome_maps[lane_id]
-                elif gerald is not None:
-                    genome_dir = gerald.lanes[lane_id].eland_genome
-                    genome_map = build_genome_fasta_map(genome_dir)
-                else:
-                    genome_map = {}
-
-                eland_result = ElandLane(pathname, lane_id, end, genome_map)
-                if end is None:
-                    effective_end =  0
-                else:
-                    effective_end = end - 1
-                e.results[effective_end][lane_id] = eland_result
     return e
 
 def build_genome_fasta_map(genome_dir):
index 56453aff13bfc1441d16d130d08c12f28fdf5914..11795c20054af2504db7f2129b36363970f93d40 100644 (file)
@@ -313,22 +313,27 @@ def summarize_lane(gerald, lane_id):
       cluster = summary_results[end][eland_result.lane_id].cluster
       report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
       report.append("Total Reads: %d" % (eland_result.reads))
-      mc = eland_result._match_codes
-      nm = mc['NM']
-      nm_percent = float(nm)/eland_result.reads  * 100
-      qc = mc['QC']
-      qc_percent = float(qc)/eland_result.reads * 100
-
-      report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
-      report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
-      report.append('Unique (0,1,2 mismatches) %d %d %d' % \
-                    (mc['U0'], mc['U1'], mc['U2']))
-      report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
-                    (mc['R0'], mc['R1'], mc['R2']))
-      report.append("Mapped Reads")
-      mapped_reads = summarize_mapped_reads(eland_result.genome_map, eland_result.mapped_reads)
-      for name, counts in mapped_reads.items():
-        report.append("  %s: %d" % (name, counts))
+
+      if hasattr(eland_result, 'match_codes'):
+          mc = eland_result.match_codes
+          nm = mc['NM']
+          nm_percent = float(nm)/eland_result.reads  * 100
+          qc = mc['QC']
+          qc_percent = float(qc)/eland_result.reads * 100
+
+          report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
+          report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
+          report.append('Unique (0,1,2 mismatches) %d %d %d' % \
+                        (mc['U0'], mc['U1'], mc['U2']))
+          report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
+                        (mc['R0'], mc['R1'], mc['R2']))
+
+      if hasattr(eland_result, 'genome_map'):
+          report.append("Mapped Reads")
+          mapped_reads = summarize_mapped_reads(eland_result.genome_map, eland_result.mapped_reads)
+          for name, counts in mapped_reads.items():
+            report.append("  %s: %d" % (name, counts))
+
       report.append('')
     return report
 
@@ -415,7 +420,8 @@ def extract_results(runs, output_base_dir=None):
       logging.info("Running bzip2: " + " ".join(bzip_cmd))
       logging.info("Writing to %s" %(tar_dest_name))
 
-      tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, 
+      env = {'BZIP': '-9'}
+      tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
                              cwd=scores_path)
       bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
       tar.wait()
index 455d8c1b73bdb537372d710c50f4ca3f5ef980a3..2e340d22da29f890d31554f3d8a6823c559e8980 100644 (file)
@@ -7,7 +7,8 @@ import shutil
 
 TEST_CODE_DIR = os.path.split(__file__)[0]
 TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')
+LANE_LIST = range(1,9)
+
 def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
     firecrest_dir = os.path.join(data_dir, 
                                  'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
@@ -118,7 +119,7 @@ def make_eland_results(gerald_dir):
         f.write(eland_result)
         f.close()
 
-def make_eland_multi(gerald_dir, paired=False):
+def make_eland_multi(gerald_dir, paired=False, lane_list=LANE_LIST):
     eland_multi = [""">HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0
@@ -135,16 +136,51 @@ def make_eland_multi(gerald_dir, paired=False):
 """]
     if paired:
         for e in [1,2]:
-            for i in range(1,9):
+            for i in lane_list:
                 pathname = os.path.join(gerald_dir,
                                         's_%d_%d_eland_multi.txt' % (i,e))
                 f = open(pathname, 'w')
                 f.write(eland_multi[e-1])
                 f.close()
     else:
-        for i in range(1,9):
+        for i in lane_list:
             pathname = os.path.join(gerald_dir,
                                     's_%d_eland_multi.txt' % (i,))
             f = open(pathname, 'w')
             f.write(eland_multi[0])
             f.close()
+
+def make_scarf(gerald_dir, lane_list=LANE_LIST):
+    seq = """HWI-EAS229_92_30VNBAAXX:1:1:0:161:NCAATTACACGACGCTAGCCCTAAAGCTATTTCGAGG:E[aaaabb^a\a_^^a[S`ba_WZUXaaaaaaUKPER
+HWI-EAS229_92_30VNBAAXX:1:1:0:447:NAGATGCGCATTTGAAGTAGGAGCAAAAGATCAAGGT:EUabaab^baabaaaaaaaa^^Uaaaaa\aaaa__`a
+HWI-EAS229_92_30VNBAAXX:1:1:0:1210:NATAGCCTCTATAGAAGCCACTATTATTTTTTTCTTA:EUa`]`baaaaa^XQU^a`S``S_`J_aaaaaabb^V
+HWI-EAS229_92_30VNBAAXX:1:1:0:1867:NTGGAGCAGATATAAAAACAGATGGTGACGTTGAAGT:E[^UaaaUaba^aaa^aa^XV\baaLaLaaaaQVXV^
+HWI-EAS229_92_30VNBAAXX:1:1:0:1898:NAGCTCGTGTCGTGAGATGTTAGGTTAAGTCCTGCAA:EK_aaaaaaaaaaaUZaaZaXM[aaaXSM\aaZ]URE
+"""
+    for l in lane_list:
+        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,))
+        f = open(pathname,'w')
+        f.write(seq)
+        f.close()
+
+def make_fastq(gerald_dir, lane_list=LANE_LIST):
+    seq = """@HWI-EAS229:1:2:182:712#0/1
+AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
++HWI-EAS229:1:2:182:712#0/1
+\bab_bbaabbababbaaa]]D]bb_baabbab\baa
+@HWI-EAS229:1:2:198:621#0/1
+CCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCCCCCCC
++HWI-EAS229:1:2:198:621#0/1
+[aaaaaaa`_`aaaaaaa[`ZDZaaaaaaaaaaaaaa
+@HWI-EAS229:1:2:209:1321#0/1
+AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
++HWI-EAS229:1:2:209:1321#0/1
+_bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa
+"""
+    for l in lane_list:
+        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,))
+        f = open(pathname,'w')
+        f.write(seq)
+        f.close()
+
+
index c782e4f642920f01ee0cdeac830cb99f6c140101..9e6ac22de3efff7551c40d736adc303a699b7434 100644 (file)
@@ -6,6 +6,7 @@ import tempfile
 import shutil
 import unittest
 
+from htsworkflow.pipelines import eland
 from htsworkflow.pipelines import ipar
 from htsworkflow.pipelines import bustard
 from htsworkflow.pipelines import gerald
@@ -42,7 +43,9 @@ def make_runfolder(obj=None):
     os.mkdir(gerald_dir)
     make_gerald_config_100(gerald_dir)
     make_summary_ipar130_htm(gerald_dir)
-    make_eland_multi(gerald_dir)
+    make_eland_multi(gerald_dir, lane_list=[1,2,3,4,5,6,])
+    make_scarf(gerald_dir, lane_list=[7,])
+    make_fastq(gerald_dir, lane_list=[8,])
 
     if obj is not None:
         obj.temp_dir = temp_dir
@@ -222,17 +225,18 @@ class RunfolderTests(unittest.TestCase):
                 g2_results = g2_eland.results[0][lane]
                 self.failUnlessEqual(g_results.reads,
                                      g2_results.reads)
-                self.failUnlessEqual(len(g_results.mapped_reads),
-                                     len(g2_results.mapped_reads))
-                for k in g_results.mapped_reads.keys():
-                    self.failUnlessEqual(g_results.mapped_reads[k],
-                                         g2_results.mapped_reads[k])
+                if isinstance(g_results, eland.ElandLane):
+                  self.failUnlessEqual(len(g_results.mapped_reads),
+                                       len(g2_results.mapped_reads))
+                  for k in g_results.mapped_reads.keys():
+                      self.failUnlessEqual(g_results.mapped_reads[k],
+                                           g2_results.mapped_reads[k])
 
-                self.failUnlessEqual(len(g_results.match_codes),
-                                     len(g2_results.match_codes))
-                for k in g_results.match_codes.keys():
-                    self.failUnlessEqual(g_results.match_codes[k],
-                                         g2_results.match_codes[k])
+                  self.failUnlessEqual(len(g_results.match_codes),
+                                       len(g2_results.match_codes))
+                  for k in g_results.match_codes.keys():
+                      self.failUnlessEqual(g_results.match_codes[k],
+                                           g2_results.match_codes[k])
 
 
     def test_eland(self):
@@ -244,10 +248,11 @@ class RunfolderTests(unittest.TestCase):
 
         genome_maps = { 1:hg_map, 2:hg_map, 3:hg_map, 4:hg_map,
                         5:hg_map, 6:hg_map, 7:hg_map, 8:hg_map }
-        eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
+        eland_container = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
 
-        for i in range(1,9):
-            lane = eland.results[0][i]
+        # I added sequence lanes to the last 2 lanes of this test case
+        for i in range(1,7):
+            lane = eland_container.results[0][i]
             self.failUnlessEqual(lane.reads, 6)
             self.failUnlessEqual(lane.sample_name, "s")
             self.failUnlessEqual(lane.lane_id, i)
@@ -262,28 +267,47 @@ class RunfolderTests(unittest.TestCase):
             self.failUnlessEqual(lane.match_codes['NM'], 1)
             self.failUnlessEqual(lane.match_codes['QC'], 0)
 
-        xml = eland.get_elements()
+        # test scarf
+        lane = eland_container.results[0][7]
+        self.failUnlessEqual(lane.reads, 5)
+        self.failUnlessEqual(lane.sample_name, 's')
+        self.failUnlessEqual(lane.lane_id, 7)
+        self.failUnlessEqual(lane.sequence_type, eland.SequenceLane.SCARF_TYPE)
+
+        # test fastq
+        lane = eland_container.results[0][8]
+        self.failUnlessEqual(lane.reads, 3)
+        self.failUnlessEqual(lane.sample_name, 's')
+        self.failUnlessEqual(lane.lane_id, 8)
+        self.failUnlessEqual(lane.sequence_type, eland.SequenceLane.FASTQ_TYPE)
+
+        xml = eland_container.get_elements()
         # just make sure that element tree can serialize the tree
         xml_str = ElementTree.tostring(xml)
         e2 = gerald.ELAND(xml=xml)
 
         for i in range(1,9):
-            l1 = eland.results[0][i]
+            l1 = eland_container.results[0][i]
             l2 = e2.results[0][i]
             self.failUnlessEqual(l1.reads, l2.reads)
             self.failUnlessEqual(l1.sample_name, l2.sample_name)
             self.failUnlessEqual(l1.lane_id, l2.lane_id)
-            self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
-            self.failUnlessEqual(len(l1.mapped_reads), 17)
-            for k in l1.mapped_reads.keys():
-                self.failUnlessEqual(l1.mapped_reads[k],
-                                     l2.mapped_reads[k])
-
-            self.failUnlessEqual(len(l1.match_codes), 9)
-            self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
-            for k in l1.match_codes.keys():
-                self.failUnlessEqual(l1.match_codes[k],
-                                     l2.match_codes[k])
+            if isinstance(l1, eland.ElandLane):
+              self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
+              self.failUnlessEqual(len(l1.mapped_reads), 17)
+              for k in l1.mapped_reads.keys():
+                  self.failUnlessEqual(l1.mapped_reads[k],
+                                       l2.mapped_reads[k])
+
+              self.failUnlessEqual(len(l1.match_codes), 9)
+              self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
+              for k in l1.match_codes.keys():
+                  self.failUnlessEqual(l1.match_codes[k],
+                                       l2.match_codes[k])
+            elif isinstance(l1, eland.SequenceLane):
+                print 'l1', l1.__dict__
+                print 'l2', l2.__dict__
+                self.failUnlessEqual(l1.sequence_type, l2.sequence_type)
 
     def test_runfolder(self):
         runs = runfolder.get_runs(self.runfolder_dir)