Improve support for eland searching a single fasta containing multiple records.
authorDiane Trout <diane@caltech.edu>
Tue, 10 Mar 2009 01:11:22 +0000 (01:11 +0000)
committerDiane Trout <diane@caltech.edu>
Tue, 10 Mar 2009 01:11:22 +0000 (01:11 +0000)
the problem was that I was assuming / was a path seperator between genome
directory name and chromosome, but eland was also reporting it as
fasta file name / fasta record.

By happy accident in genome map, the fasta file with multiple records would
be stored in the GenomeMap dictionary as having the same name, value pair
while things that had the genome encoded would be fasta filename mapping to
genome/fasta filename.

as a result it appears that splitting a mapped item on the path seperator /
and then looking the "base path" up in the genome map will allow me to
determine if an element is a genome directory "path" or a multi record
fasta file by its absence (for genome dirs) or presence (for multi
fasta records)

htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/test/simulate_runfolder.py
htsworkflow/pipelines/test/test_runfolder110.py
htsworkflow/pipelines/test/test_runfolder_ipar100.py
htsworkflow/pipelines/test/test_runfolder_pair.py

index 3c765ae8ab25b148bc37238ad621f0f9b148711f..e1072400716b1bb3affd5c65158baf63701feadb 100644 (file)
@@ -181,7 +181,7 @@ def get_runs(runfolder):
                     p.gerald = g
                     runs.append(p)
                 except IOError, e:
-                    print "Ignoring", str(e)
+                    logging.error("Ignoring " + str(e))
 
     datadir = os.path.join(runfolder, 'Data')
 
@@ -222,7 +222,7 @@ def extract_run_parameters(runs):
     for run in runs:
       run.save()
 
-def summarize_mapped_reads(mapped_reads):
+def summarize_mapped_reads(genome_map, mapped_reads):
     """
     Summarize per chromosome reads into a genome count
     But handle spike-in/contamination symlinks seperately.
@@ -232,7 +232,7 @@ def summarize_mapped_reads(mapped_reads):
     genome = 'unknown'
     for k, v in mapped_reads.items():
         path, k = os.path.split(k)
-        if len(path) > 0:
+        if len(path) > 0 and not genome_map.has_key(path):
             genome = path
             genome_reads += v
         else:
@@ -263,7 +263,7 @@ def summarize_lane(gerald, lane_id):
       report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
                     (mc['R0'], mc['R1'], mc['R2']))
       report.append("Mapped Reads")
-      mapped_reads = summarize_mapped_reads(eland_result.mapped_reads)
+      mapped_reads = summarize_mapped_reads(eland_result.genome_map, eland_result.mapped_reads)
       for name, counts in mapped_reads.items():
         report.append("  %s: %d" % (name, counts))
       report.append('')
index cc596027bd74c0df627bcbef7d7852c89a292e77..72c1a1a5958ffc9dd93034a0434f13938954d8e6 100644 (file)
@@ -1917,11 +1917,16 @@ def make_eland_multi(gerald_dir, paired=False):
 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0
 >HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R
+>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
+>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
 """, """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
 >HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC 
 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R
->HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1"""]
+>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1
+>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
+>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
+"""]
     if paired:
         for e in [1,2]:
             for i in range(1,9):
index 1c42a78df9694ad2a93cbf4887d7b6acdd2b1f7c..fc96bc8a821f74a826a40a9fbbf1e45367326c56 100644 (file)
@@ -225,12 +225,12 @@ class RunfolderTests(unittest.TestCase):
 
         for i in range(1,9):
             lane = eland.results[0][i]
-            self.failUnlessEqual(lane.reads, 4)
+            self.failUnlessEqual(lane.reads, 6)
             self.failUnlessEqual(lane.sample_name, "s")
             self.failUnlessEqual(lane.lane_id, i)
-            self.failUnlessEqual(len(lane.mapped_reads), 15)
+            self.failUnlessEqual(len(lane.mapped_reads), 17)
             self.failUnlessEqual(lane.mapped_reads['hg18/chr5.fa'], 4)
-            self.failUnlessEqual(lane.match_codes['U0'], 1)
+            self.failUnlessEqual(lane.match_codes['U0'], 3)
             self.failUnlessEqual(lane.match_codes['R0'], 2)
             self.failUnlessEqual(lane.match_codes['U1'], 1)
             self.failUnlessEqual(lane.match_codes['R1'], 9)
@@ -251,7 +251,7 @@ class RunfolderTests(unittest.TestCase):
             self.failUnlessEqual(l1.sample_name, l2.sample_name)
             self.failUnlessEqual(l1.lane_id, l2.lane_id)
             self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
-            self.failUnlessEqual(len(l1.mapped_reads), 15)
+            self.failUnlessEqual(len(l1.mapped_reads), 17)
             for k in l1.mapped_reads.keys():
                 self.failUnlessEqual(l1.mapped_reads[k],
                                      l2.mapped_reads[k])
index 4f07effdae3bce929935ddfa53daa484ca1d7c72..bf1b520124ab11c01c0ce2a39f39a8d9e83e8cd4 100644 (file)
@@ -220,12 +220,14 @@ class RunfolderTests(unittest.TestCase):
 
         for i in range(1,9):
             lane = eland.results[0][i]
-            self.failUnlessEqual(lane.reads, 4)
+            self.failUnlessEqual(lane.reads, 6)
             self.failUnlessEqual(lane.sample_name, "s")
             self.failUnlessEqual(lane.lane_id, i)
-            self.failUnlessEqual(len(lane.mapped_reads), 15)
+            self.failUnlessEqual(len(lane.mapped_reads), 17)
             self.failUnlessEqual(lane.mapped_reads['hg18/chr5.fa'], 4)
-            self.failUnlessEqual(lane.match_codes['U0'], 1)
+            self.failUnlessEqual(lane.mapped_reads['spike.fa/sample1'], 1)
+            self.failUnlessEqual(lane.mapped_reads['spike.fa/sample2'], 1)
+            self.failUnlessEqual(lane.match_codes['U0'], 3)
             self.failUnlessEqual(lane.match_codes['R0'], 2)
             self.failUnlessEqual(lane.match_codes['U1'], 1)
             self.failUnlessEqual(lane.match_codes['R1'], 9)
@@ -246,7 +248,7 @@ class RunfolderTests(unittest.TestCase):
             self.failUnlessEqual(l1.sample_name, l2.sample_name)
             self.failUnlessEqual(l1.lane_id, l2.lane_id)
             self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
-            self.failUnlessEqual(len(l1.mapped_reads), 15)
+            self.failUnlessEqual(len(l1.mapped_reads), 17)
             for k in l1.mapped_reads.keys():
                 self.failUnlessEqual(l1.mapped_reads[k],
                                      l2.mapped_reads[k])
index 9d8530e6e33f46c018b51b8d8363302231f51a2f..ab38d973d161fd55ef8536945cd72485b2a4712b 100644 (file)
@@ -229,12 +229,12 @@ class RunfolderTests(unittest.TestCase):
         # check first end
         for i in range(1,9):
             lane = eland.results[0][i]
-            self.failUnlessEqual(lane.reads, 4)
+            self.failUnlessEqual(lane.reads, 6)
             self.failUnlessEqual(lane.sample_name, "s")
             self.failUnlessEqual(lane.lane_id, i)
-            self.failUnlessEqual(len(lane.mapped_reads), 15)
+            self.failUnlessEqual(len(lane.mapped_reads), 17)
             self.failUnlessEqual(lane.mapped_reads['hg18/chr5.fa'], 4)
-            self.failUnlessEqual(lane.match_codes['U0'], 1)
+            self.failUnlessEqual(lane.match_codes['U0'], 3)
             self.failUnlessEqual(lane.match_codes['R0'], 2)
             self.failUnlessEqual(lane.match_codes['U1'], 1)
             self.failUnlessEqual(lane.match_codes['R1'], 9)
@@ -246,12 +246,12 @@ class RunfolderTests(unittest.TestCase):
         # check second end
         for i in range(1,9):
             lane = eland.results[1][i]
-            self.failUnlessEqual(lane.reads, 5)
+            self.failUnlessEqual(lane.reads, 7)
             self.failUnlessEqual(lane.sample_name, "s")
             self.failUnlessEqual(lane.lane_id, i)
-            self.failUnlessEqual(len(lane.mapped_reads), 15)
+            self.failUnlessEqual(len(lane.mapped_reads), 17)
             self.failUnlessEqual(lane.mapped_reads['hg18/chr5.fa'], 4)
-            self.failUnlessEqual(lane.match_codes['U0'], 1)
+            self.failUnlessEqual(lane.match_codes['U0'], 3)
             self.failUnlessEqual(lane.match_codes['R0'], 2)
             self.failUnlessEqual(lane.match_codes['U1'], 1)
             self.failUnlessEqual(lane.match_codes['R1'], 9)
@@ -273,7 +273,7 @@ class RunfolderTests(unittest.TestCase):
                 self.failUnlessEqual(l1.sample_name, l2.sample_name)
                 self.failUnlessEqual(l1.lane_id, l2.lane_id)
                 self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
-                self.failUnlessEqual(len(l1.mapped_reads), 15)
+                self.failUnlessEqual(len(l1.mapped_reads), 17)
                 for k in l1.mapped_reads.keys():
                     self.failUnlessEqual(l1.mapped_reads[k],
                                          l2.mapped_reads[k])