From: Diane Trout <diane@caltech.edu>
Date: Mon, 12 Sep 2011 22:29:20 +0000 (-0700)
Subject: Add support for first-gen HiSeq flowcells (e.g. ABXX)
X-Git-Tag: 0.5.4~1
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=0123277f86af843d47170d56d217f45e7d7cb4a6

Add support for first-gen HiSeq flowcells (e.g. ABXX)
---

diff --git a/htsworkflow/pipelines/eland.py b/htsworkflow/pipelines/eland.py
index 271a651..bd478aa 100644
--- a/htsworkflow/pipelines/eland.py
+++ b/htsworkflow/pipelines/eland.py
@@ -44,7 +44,7 @@ class ResultLane(object):
         self.lane_id = lane_id
         self.end = end
         self._reads = None
-        
+
         if xml is not None:
             self.set_elements(xml)
 
@@ -182,13 +182,13 @@ class ElandLane(ResultLane):
             reads += 1
             fields = line.split()
             # fields[2] = QC/NM/or number of matches
-            score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], 
+            score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
                                                        match_codes)
             if score_type == ElandLane.SCORE_READ:
                 # when there are too many hits, eland  writes a - where
                 # it would have put the list of hits.
                 # or in a different version of eland, it just leaves
-                # that column blank, and only outputs 3 fields.     
+                # that column blank, and only outputs 3 fields.
                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
                   continue
 
@@ -213,16 +213,16 @@ class ElandLane(ResultLane):
             reads += 1
             fields = line.split()
             # fields[2] = QC/NM/or number of matches
-            score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], 
+            score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
                                                        match_codes)
             if score_type == ElandLane.SCORE_UNRECOGNIZED:
                 # export files have three states for the match field
-                # QC code, count of multi-reads, or a single 
+                # QC code, count of multi-reads, or a single
                 # read location. The score_mapped_mismatches function
                 # only understands the first two types.
                 # if we get unrecognized, that implies the field is probably
                 # a location.
-                code = self._count_mapped_export(mapped_reads, 
+                code = self._count_mapped_export(mapped_reads,
                                                  fields[LOCATION_INDEX],
                                                  fields[DESCRIPTOR_INDEX])
                 match_codes[code] += 1
@@ -232,7 +232,7 @@ class ElandLane(ResultLane):
 
     def _score_mapped_mismatches(self, match, match_codes):
         """Update match_codes with eland map counts, or failure code.
-        
+
         Returns True if the read mapped, false if it was an error code.
         """
         groups = ElandLane.MATCH_COUNTS_RE.match(match)
@@ -260,12 +260,12 @@ class ElandLane(ResultLane):
                 match_codes['U1'] += 1
             elif one_mismatches < 255:
                 match_codes['R1'] += one_mismatches
-    
+
             if two_mismatches == 1:
                 match_codes['U2'] += 1
             elif two_mismatches < 255:
                 match_codes['R2'] += two_mismatches
-                
+
             return ElandLane.SCORE_READ
 
 
@@ -284,9 +284,9 @@ class ElandLane(ResultLane):
 
     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
         """Count a read as defined in an export file
-        
+
         match_string contains the chromosome
-        descriptor contains the an ecoding of bases that match, mismatch, 
+        descriptor contains the an ecoding of bases that match, mismatch,
                    and have indels.
         returns the "best" match code
 
@@ -320,25 +320,25 @@ class ElandLane(ResultLane):
 
     def _get_no_match(self):
         if self._mapped_reads is None:
-            self._update()  
+            self._update()
         return self._match_codes['NM']
-    no_match = property(_get_no_match, 
+    no_match = property(_get_no_match,
                         doc="total reads that didn't match the target genome.")
 
     def _get_no_match_percent(self):
-        return float(self.no_match)/self.reads * 100 
+        return float(self.no_match)/self.reads * 100
     no_match_percent = property(_get_no_match_percent,
                                 doc="no match reads as percent of total")
 
     def _get_qc_failed(self):
         if self._mapped_reads is None:
-            self._update()  
+            self._update()
         return self._match_codes['QC']
     qc_failed = property(_get_qc_failed,
                         doc="total reads that didn't match the target genome.")
 
     def _get_qc_failed_percent(self):
-        return float(self.qc_failed)/self.reads * 100 
+        return float(self.qc_failed)/self.reads * 100
     qc_failed_percent = property(_get_qc_failed_percent,
                                  doc="QC failed reads as percent of total")
 
@@ -361,7 +361,7 @@ class ElandLane(ResultLane):
         return sum
     repeat_reads = property(_get_repeat_reads,
                             doc="total repeat reads")
-    
+
     def get_elements(self):
         lane = ElementTree.Element(ElandLane.LANE,
                                    {'version':
@@ -565,7 +565,7 @@ class ELAND(object):
             raise ValueError('Expecting %s', ELAND.ELAND)
         for element in list(tree):
             lane_id = int(element.attrib[ELAND.LANE_ID])
-            end = int(element.attrib.get(ELAND.END, 0)) 
+            end = int(element.attrib.get(ELAND.END, 0))
             if element.tag.lower() == ElandLane.LANE.lower():
                 lane = ElandLane(xml=element)
             elif element.tag.lower() == SequenceLane.LANE.lower():
@@ -598,16 +598,16 @@ def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_map
     # split_name = name.split('_')
     # lane_id = int(split_name[1])
 
+    genome_map = {}
     if genome_maps is not None:
         genome_map = genome_maps[lane_id]
     elif gerald is not None:
         genome_dir = gerald.lanes[lane_id].eland_genome
-        genome_map = build_genome_fasta_map(genome_dir)
-    else:
-        genome_map = {}
+        if genome_dir is not None:
+            genome_map = build_genome_fasta_map(genome_dir)
 
     lane = ElandLane(pathname, lane_id, end, genome_map)
-    
+
     if end is None:
         effective_end =  0
     else:
@@ -643,7 +643,7 @@ def eland(gerald_dir, gerald=None, genome_maps=None):
     if os.path.isdir(basedir_temp):
         basedirs.append(basedir_temp)
 
-   
+
     # the order in patterns determines the preference for what
     # will be found.
     MAPPED_ELAND = 0
@@ -723,7 +723,7 @@ def main(cmdline=None):
         e = eland(a)
         print e.get_elements()
 
-    return 
+    return
 
 
 if __name__ == "__main__":
diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py
index 58f3081..f62b4f2 100644
--- a/htsworkflow/pipelines/gerald.py
+++ b/htsworkflow/pipelines/gerald.py
@@ -56,14 +56,18 @@ class Gerald(object):
             # default to the chipwide parameters if there isn't an
             # entry in the lane specific paramaters
             if genome is None:
-                subtree = self._gerald.tree.find('ChipWideRunParameters')
-                container = subtree.find('ELAND_GENOME')
-                genome = container.text
+                genome = self._gerald._get_chip_attribute('ELAND_GENOME')
+            # ignore flag value
+            if genome == 'Need_to_specify_ELAND_genome_directory':
+                genome = None
             return genome
         eland_genome = property(_get_eland_genome)
 
         def _get_read_length(self):
-            return self.__get_attribute('READ_LENGTH')
+            read_length = self.__get_attribute('READ_LENGTH')
+            if read_length is None:
+                read_length = self._gerald._get_chip_attribute('READ_LENGTH')
+            return read_length
         read_length = property(_get_read_length)
 
         def _get_use_bases(self):
@@ -154,7 +158,7 @@ class Gerald(object):
             root = ''
         else:
             root = os.path.join(root,'')
-            
+
         experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
         if experiment_dir is None:
             return None
@@ -165,13 +169,16 @@ class Gerald(object):
         dirnames = experiment_dir.split(os.path.sep)
         return dirnames[0]
     runfolder_name = property(_get_runfolder_name)
-    
+
     def _get_version(self):
         if self.tree is None:
             return None
         return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
     version = property(_get_version)
 
+    def _get_chip_attribute(self, value):
+        return self.tree.findtext('ChipWideRunParameters/%s' % (value,))
+
     def dump(self):
         """
         Debugging function, report current object
diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py
index 5b79c2e..0b06c0b 100644
--- a/htsworkflow/pipelines/test/simulate_runfolder.py
+++ b/htsworkflow/pipelines/test/simulate_runfolder.py
@@ -9,14 +9,18 @@ TEST_CODE_DIR = os.path.split(__file__)[0]
 TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')
 LANE_LIST = range(1,9)
 TILE_LIST = range(1,101)
+HISEQ_TILE_LIST = [1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108,
+                   1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208,
+                   2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108,
+                   2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208,]
 
 def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
-    firecrest_dir = os.path.join(data_dir, 
+    firecrest_dir = os.path.join(data_dir,
                                  'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
                                  )
     os.mkdir(firecrest_dir)
     return firecrest_dir
-    
+
 def make_ipar_dir(data_dir, version='1.01'):
     """
     Construct an artificial ipar parameter file and directory
@@ -58,7 +62,7 @@ def make_rta_intensities_1460(data_dir, version='1.4.6.0'):
     intensities_dir = os.path.join(data_dir, 'Intensities')
     if not os.path.exists(intensities_dir):
       os.mkdir(intensities_dir)
- 
+
     param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml')
     shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
 
@@ -71,7 +75,7 @@ def make_rta_basecalls_1460(intensities_dir):
     basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
     if not os.path.exists(basecalls_dir):
       os.mkdir(basecalls_dir)
- 
+
     param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml')
     shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
 
@@ -84,12 +88,25 @@ def make_rta_intensities_1870(data_dir, version='1.8.70.0'):
     intensities_dir = os.path.join(data_dir, 'Intensities')
     if not os.path.exists(intensities_dir):
       os.mkdir(intensities_dir)
- 
+
     param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1870.xml')
     shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
 
     return intensities_dir
 
+def make_rta_intensities_1_10(data_dir, version='1.10.36.0'):
+    """
+    Construct an artificial RTA Intensities parameter file and directory
+    """
+    intensities_dir = os.path.join(data_dir, 'Intensities')
+    if not os.path.exists(intensities_dir):
+      os.mkdir(intensities_dir)
+
+    param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.10.xml')
+    shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
+
+    return intensities_dir
+
 def make_rta_basecalls_1870(intensities_dir):
     """
     Construct an artificial RTA Intensities parameter file and directory
@@ -97,35 +114,58 @@ def make_rta_basecalls_1870(intensities_dir):
     basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
     if not os.path.exists(basecalls_dir):
       os.mkdir(basecalls_dir)
- 
+
     param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1870.xml')
     shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
 
     return basecalls_dir
 
-def make_qseqs(bustard_dir, in_temp=True):
+def make_rta_basecalls_1_10(intensities_dir):
+    """
+    Construct an artificial RTA Intensities parameter file and directory
+    """
+    basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
+    if not os.path.exists(basecalls_dir):
+        os.mkdir(basecalls_dir)
+
+    make_qseqs(basecalls_dir, basecall_info=ABXX_BASE_CALL_INFO)
+    param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.10.xml')
+    shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
+
+    return basecalls_dir
+
+def make_qseqs(bustard_dir, in_temp=True, basecall_info=None):
     """
     Fill gerald directory with qseq files
     """
+    if basecall_info is None:
+        qseq_file = '42BRJAAXX_8_1_0039_qseq.txt'
+        tile_list = TILE_LIST
+        summary_file = '42BRJAAXX_BustardSummary.xml'
+    else:
+        qseq_file = basecall_info.qseq_file
+        tile_list = basecall_info.tile_list
+        summary_file = basecall_info.basecall_summary
+
     # 42BRJ 8 1 0039 happened to be a better than usual tile, in that there
     # was actually sequence at the start
-    source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_1_0039_qseq.txt')
+    source = os.path.join(TESTDATA_DIR, qseq_file)
     destdir = bustard_dir
     if not os.path.isdir(destdir):
         os.mkdir(destdir)
-        
+
     for lane in LANE_LIST:
-        for tile in TILE_LIST:
+        for tile in tile_list:
             destination = os.path.join(bustard_dir, 's_%d_1_%04d_qseq.txt' % (lane, tile))
             shutil.copy(source, destination)
 
     make_matrix_dir(bustard_dir)
     make_phasing_dir(bustard_dir)
 
-    summary_source = os.path.join(TESTDATA_DIR, '42BRJAAXX_BustardSummary.xml')
+    summary_source = os.path.join(TESTDATA_DIR, summary_file)
     summary_dest = os.path.join(bustard_dir, 'BustardSummary.xml')
     shutil.copy(summary_source, summary_dest)
-    
+
     return destdir
 
 def make_scores(gerald_dir, in_temp=True):
@@ -139,29 +179,29 @@ def make_scores(gerald_dir, in_temp=True):
         destdir = os.path.join(destdir, 'Temp')
     if not os.path.isdir(destdir):
         os.mkdir(destdir)
-        
+
     for lane in LANE_LIST:
         for tile in TILE_LIST:
             destination = os.path.join(destdir, 's_%d_%04d_score.txt' % (lane, tile))
             shutil.copy(source, destination)
-            
+
     return destdir
 
 def make_matrix_dir(bustard_dir):
     """
     Create several matrix files in <bustard_dir>/Matrix/
 
-    from pipeline 1.4    
+    from pipeline 1.4
     """
     destdir = os.path.join(bustard_dir, 'Matrix')
     if not os.path.isdir(destdir):
         os.mkdir(destdir)
-        
+
     source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_02_matrix.txt')
     for lane in LANE_LIST:
         destination = os.path.join(destdir, 's_%d_02_matrix.txt' % ( lane, ))
         shutil.copy(source, destination)
-        
+
 def make_matrix(matrix_filename):
     contents = """# Auto-generated frequency response matrix
 > A
@@ -184,13 +224,13 @@ def make_matrix_dir_rta160(bustard_dir):
     destdir = os.path.join(bustard_dir, 'Matrix')
     if not os.path.isdir(destdir):
         os.mkdir(destdir)
-        
+
     source = os.path.join(TESTDATA_DIR, '61MMFAAXX_4_1_matrix.txt')
     lane_fragments = [ "_%d" % (l,) for l in LANE_LIST]
     for fragment in lane_fragments:
         destination = os.path.join(destdir, 's%s_1_matrix.txt' % ( fragment, ))
         shutil.copy(source, destination)
-        
+
 def make_phasing_dir(bustard_dir):
     """
     Create several phasing files in <bustard_dir>/Phasing/
@@ -200,12 +240,12 @@ def make_phasing_dir(bustard_dir):
     destdir = os.path.join(bustard_dir, 'Phasing')
     if not os.path.isdir(destdir):
         os.mkdir(destdir)
-        
+
     source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_01_phasing.xml')
     for lane in LANE_LIST:
         destination = os.path.join(destdir, 's_%d_01_phasing.xml' % ( lane, ))
         shutil.copy(source, destination)
-    
+
 def make_phasing_params(bustard_dir):
     for lane in LANE_LIST:
         pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
@@ -227,6 +267,11 @@ def make_gerald_config_100(gerald_dir):
     destination = os.path.join(gerald_dir, 'config.xml')
     shutil.copy(source, destination)
 
+def make_gerald_config_1_10(gerald_dir):
+    source = os.path.join(TESTDATA_DIR, 'gerald_config_1.10.xml')
+    destination = os.path.join(gerald_dir, 'config.xml')
+    shutil.copy(source, destination)
+
 def make_summary_htm_100(gerald_dir):
     source = os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm')
     destination = os.path.join(gerald_dir, 'Summary.htm')
@@ -252,6 +297,13 @@ def make_summary_rta160_xml(gerald_dir):
     destination = os.path.join(gerald_dir, 'Summary.xml')
     shutil.copy(source, destination)
 
+
+def make_summary_rta1_10_xml(gerald_dir):
+    source = os.path.join(TESTDATA_DIR, 'Summary-rta1.10.xml')
+    destination = os.path.join(gerald_dir, 'Summary.xml')
+    shutil.copy(source, destination)
+
+
 def make_eland_results(gerald_dir):
     eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
 >HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
@@ -273,7 +325,7 @@ def make_eland_multi(gerald_dir, paired=False, lane_list=LANE_LIST):
 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
 """, """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
->HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC 
+>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC
 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R
 >HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1
@@ -339,3 +391,16 @@ _bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa
         f.close()
 
 
+class BaseCallInfo(object):
+    """Provide customization for how to setup the base call mock data
+    """
+    def __init__(self, qseq_file, tile_list, basecall_summary):
+        self.qseq_file = qseq_file
+        self.tile_list = tile_list
+        self.basecall_summary = basecall_summary
+
+# First generation HiSeq Flowcell
+ABXX_BASE_CALL_INFO = BaseCallInfo(
+    qseq_file='AA01CCABXX_8_2_2207_qseq.txt',
+    tile_list = HISEQ_TILE_LIST,
+    basecall_summary = 'AA01CCABXX_BustardSummary.xml')