X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=htsworkflow%2Fsubmission%2Ftest%2Ftest_condorfastq.py;h=1e1c2d9bc56677cdef09b5b2efca48d296e5fdb3;hb=6b0392dd1788ee645fe4a467fce28d8640d2c8f2;hp=cdc9cc707ed3db691270b28defb7d920c9c6f253;hpb=0fa45b5ddb25418323977f154161d5f180236a79;p=htsworkflow.git diff --git a/htsworkflow/submission/test/test_condorfastq.py b/htsworkflow/submission/test/test_condorfastq.py index cdc9cc7..1e1c2d9 100644 --- a/htsworkflow/submission/test/test_condorfastq.py +++ b/htsworkflow/submission/test/test_condorfastq.py @@ -5,9 +5,18 @@ import os from pprint import pprint import shutil import tempfile -import unittest -from htsworkflow.submission import condorfastq +from django.test import TestCase +from django.test.utils import setup_test_environment, \ + teardown_test_environment +from django.db import connection +from django.conf import settings + +from htsworkflow.submission.condorfastq import CondorFastqExtract +from htsworkflow.submission.results import ResultMap +from htsworkflow.util.rdfhelp import \ + add_default_schemas, load_string_into_model, dump_model +from htsworkflow.util.rdfinfer import Infer FCDIRS = [ 'C02F9ACXX', @@ -15,10 +24,13 @@ FCDIRS = [ 'C02F9ACXX/C1-202/Project_11154', 'C02F9ACXX/C1-202/Project_12342_Index1', 'C02F9ACXX/C1-202/Project_12342_Index2', + 'C02F9ACXX/C1-202/Project_12345', '42JUYAAXX', '42JUYAAXX/C1-76', '30221AAXX', '30221AAXX/C1-33', + '30DY0AAXX', + '30DY0AAXX/C1-151', '61MJTAAXX', '61MJTAAXX/C1-76', ] @@ -28,9 +40,18 @@ DATAFILES = [ 'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz', 'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz', 'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz', - 'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz', - 'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz', - 'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz', '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2', '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2', '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2', @@ -56,6 +77,14 @@ DATAFILES = [ '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf', '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf', '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf', + '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf', '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2', '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2', '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2', @@ -66,67 +95,331 @@ DATAFILES = [ '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2', ] -LIBDATA = { - '11154':{u'antibody_id': None, - u'cell_line': u'Unknown', - u'cell_line_id': 1, - u'experiment_type': u'RNA-seq', - u'experiment_type_id': 4, - u'gel_cut_size': 300, - u'hidden': False, - u'id': u'11154', - u'insert_size': 200, - u'lane_set': [{u'flowcell': u'30221AAXX', - u'lane_number': 4, - u'paired_end': False, - u'read_length': 33, - u'status': u'Unknown', - u'status_code': None}, - {u'flowcell': u'42JUYAAXX', - u'lane_number': 5, - u'paired_end': True, - u'read_length': 76, - u'status': u'Unknown', - u'status_code': None}, - {u'flowcell': u'61MJTAAXX', - u'lane_number': 6, - u'paired_end': False, - u'read_length': 76, - u'status': u'Unknown', - u'status_code': None}, - {u'flowcell': u'C02F9ACXX', - u'lane_number': 3, - u'paired_end': True, - u'read_length': 101, - u'status': u'Unknown', - u'status_code': None}], - u'library_id': u'11154', - u'library_name': u'Paired ends ASDF ', - u'library_species': u'Mus musculus', - u'library_species_id': 9, - u'library_type': u'Paired End (non-multiplexed)', - u'library_type_id': 2, - u'made_by': u'Gary Gygax', - u'made_for': u'TSR', - u'notes': u'300 bp gel fragment', - u'replicate': 1, - u'stopping_point': u'1Aa', - u'successful_pM': None, - u'undiluted_concentration': u'29.7'} - } - -FAKE_APIDATA = {'apiid':0, 'apikey': 'foo'} - -class FakeApi(object): - def __init__(self, *args, **kwargs): - pass - - def get_library(self, libid): - lib_data = LIBDATA[libid] - return copy.deepcopy(lib_data) - -class TestCondorFastq(unittest.TestCase): +lib_turtle = """@prefix : . +@prefix rdfs: . +@prefix dc: . +@prefix xsd: . +@prefix libns: . +@prefix seqns: . +@prefix invns: . + + a libns:Library . + a libns:Library . + a libns:Library . + a libns:Library . + + + a libns:IlluminaFlowcell ; + libns:read_length 33 ; + libns:flowcell_type "Single"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "30221AAXX"@en . + + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "1" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "2" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "3" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "4" . + # paired_end 1; + # read_length 33; + # status "Unknown"@en . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "5" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "6" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "7" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "8" . + + + a libns:IlluminaFlowcell ; + libns:read_length 76 ; + libns:flowcell_type "Paired"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "42JUYAAXX"@en . + + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "1" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "2" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "3" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "4" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "5" . + # paired_end 1; + # read_length 76; + # status "Unknown"@en . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "6" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "7" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "8" . + + + a libns:IlluminaFlowcell ; + libns:read_length 76 ; + libns:flowcell_type "Single"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "61MJTAAXX"@en . + + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "1" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "2" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "3" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "4" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "5" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "6" . + # paired_end 1; + # read_length 76; + # status "Unknown"@en . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "7" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "8" . + + + a libns:IlluminaFlowcell ; + libns:read_length 76 ; + libns:flowcell_type "Paired"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "30DY0AAXX"@en . + + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "1" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "2" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "3" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "4" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "5" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "6" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "7" . + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "8" . + # paired_end 1; + # read_length 76; + # status "Unknown"@en . + + + a libns:IlluminaFlowcell ; + libns:read_length 101 ; + libns:flowcell_type "Paired"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "C02F9ACXX"@en . + + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "3" . + # paired_end 1; + # read_length 101; + # status "Unknown"@en . + + + a libns:IlluminaLane ; + libns:flowcell ; + libns:library ; + libns:lane_number "3" . + # paired_end 1; + # read_length 101; + # status "Unknown"@en . + + + a libns:Library ; + libns:affiliation "TSR"@en; + libns:concentration "29.7"; + libns:date "2012-12-28T00:00:00"^^xsd:dateTime ; + libns:experiment_type "RNA-seq"@en ; + libns:gel_cut 300 ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:insert_size 2000 ; + libns:library_id "11154"@en ; + libns:library_type "Paired End (Multiplexed)"@en ; + libns:made_by "Gary Gygax"@en ; + libns:name "Paired Ends ASDF"@en ; + libns:replicate "1"@en; + libns:species_name "Mus musculus"@en ; + libns:stopping_point "Completed"@en ; + libns:total_unique_locations 8841201 . + # cell_line + + + a libns:Library ; + libns:affiliation "TSR"@en; + libns:concentration "12.345"; + libns:cell_line "Unknown"@en ; + libns:date "2012-12-28T00:00:00"^^xsd:dateTime ; + libns:experiment_type "RNA-seq"@en ; + libns:gel_cut 300 ; + libns:has_lane ; + libns:insert_size 2000 ; + libns:library_id "12345"@en ; + libns:library_type "Paired End (Multiplexed)"@en ; + libns:made_by "Gary Gygax"@en ; + libns:name "Paired Ends THING"@en ; + libns:replicate "1"@en; + libns:species_name "Mus musculus"@en ; + libns:stopping_point "Completed"@en ; + libns:total_unique_locations 8841201 . + # cell_line +""" +HOST = "http://localhost" + +class TestCondorFastq(TestCase): def setUp(self): + self.cwd = os.getcwd() + self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test') self.flowcelldir = os.path.join(self.tempdir, 'flowcells') os.mkdir(self.flowcelldir) @@ -142,134 +435,257 @@ class TestCondorFastq(unittest.TestCase): with open(filename, 'w') as stream: stream.write('testfile') + self.result_map = ResultMap() + for lib_id in [u'11154', u'12345']: + subname = 'sub-%s' % (lib_id,) + sub_dir = os.path.join(self.tempdir, subname) + os.mkdir(sub_dir) + self.result_map[lib_id] = sub_dir + + self.extract = CondorFastqExtract(HOST, + self.flowcelldir, + self.logdir) + load_string_into_model(self.extract.model, 'turtle', lib_turtle) + add_default_schemas(self.extract.model) + inference = Infer(self.extract.model) + errmsgs = list(inference.run_validation()) + self.assertEqual(len(errmsgs), 0) + os.chdir(self.tempdir) + def tearDown(self): shutil.rmtree(self.tempdir) + os.chdir(self.cwd) + + def test_find_relevant_flowcell_ids(self): + expected = set(('30221AAXX', + '42JUYAAXX', + '61MJTAAXX', + '30DY0AAXX', + 'C02F9ACXX')) + flowcell_ids = self.extract.find_relevant_flowcell_ids() + self.assertEqual(flowcell_ids, expected) def test_find_archive_sequence(self): - extract = condorfastq.CondorFastqExtract('host', - FAKE_APIDATA, - self.tempdir, - self.logdir) - extract.api = FakeApi() - result_map = [('11154', '/notarealplace')] - lib_db = extract.find_archive_sequence_files(result_map) - - self.failUnlessEqual(len(lib_db['11154']['lanes']), 4) - lanes = [ - lib_db['11154']['lanes'][(u'30221AAXX', 4)], - lib_db['11154']['lanes'][(u'42JUYAAXX', 5)], - lib_db['11154']['lanes'][(u'61MJTAAXX', 6)], - lib_db['11154']['lanes'][(u'C02F9ACXX', 3)], - ] - self.failUnlessEqual(len(lanes[0]), 1) - self.failUnlessEqual(len(lanes[1]), 2) - self.failUnlessEqual(len(lanes[2]), 1) - self.failUnlessEqual(len(lanes[3]), 4) + seqs = self.extract.find_archive_sequence_files(self.result_map) + + expected = set([ + (u'11154', u'42JUYAAXX', '5', 1, 76, True, 'qseq'), + (u'11154', u'42JUYAAXX', '5', 2, 76, True, 'qseq'), + (u'11154', u'61MJTAAXX', '6', 1, 76, False, 'qseq'), + (u'11154', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'), + (u'11154', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'), + (u'11154', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'), + (u'11154', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'), + (u'11154', u'30221AAXX', '4', 1, 33, False, 'srf'), + (u'11154', u'30DY0AAXX', '8', 1, 151, True, 'srf') + ]) + found = set([(l.library_id, l.flowcell_id, l.lane_number, l.read, l.cycle, l.ispaired, l.filetype) for l in seqs]) + self.assertEqual(expected, found) def test_find_needed_targets(self): + lib_db = self.extract.find_archive_sequence_files(self.result_map) - extract = condorfastq.CondorFastqExtract('host', - FAKE_APIDATA, - self.tempdir, - self.logdir) - extract.api = FakeApi() - result_map = [('11154', '/notarealplace')] - lib_db = extract.find_archive_sequence_files(result_map) - - needed_targets = extract.find_missing_targets(result_map, - lib_db) - self.failUnlessEqual(len(needed_targets), 6) + needed_targets = self.extract.update_fastq_targets(self.result_map, + lib_db) + self.assertEqual(len(needed_targets), 9) srf_30221 = needed_targets[ - u'/notarealplace/11154_30221AAXX_c33_l4.fastq'] + self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq'] qseq_42JUY_r1 = needed_targets[ - u'/notarealplace/11154_42JUYAAXX_c76_l5_r1.fastq'] + self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq'] qseq_42JUY_r2 = needed_targets[ - u'/notarealplace/11154_42JUYAAXX_c76_l5_r2.fastq'] + self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq'] qseq_61MJT = needed_targets[ - u'/notarealplace/11154_61MJTAAXX_c76_l6.fastq'] + self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq'] split_C02F9_r1 = needed_targets[ - u'/notarealplace/11154_C02F9ACXX_c202_l3_r1.fastq'] + self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq'] split_C02F9_r2 = needed_targets[ - u'/notarealplace/11154_C02F9ACXX_c202_l3_r2.fastq'] - - self.failUnlessEqual(len(srf_30221['srf']), 1) - self.failUnlessEqual(len(qseq_42JUY_r1['qseq']), 1) - self.failUnlessEqual(len(qseq_42JUY_r2['qseq']), 1) - self.failUnlessEqual(len(qseq_61MJT['qseq']), 1) - self.failUnlessEqual(len(split_C02F9_r1['split_fastq']), 2) - self.failUnlessEqual(len(split_C02F9_r2['split_fastq']), 2) + self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq'] - #print '-------needed targets---------' - #pprint(needed_targets) + self.assertEqual(len(srf_30221['srf']), 1) + self.assertEqual(len(qseq_42JUY_r1['qseq']), 1) + self.assertEqual(len(qseq_42JUY_r2['qseq']), 1) + self.assertEqual(len(qseq_61MJT['qseq']), 1) + self.assertEqual(len(split_C02F9_r1['split_fastq']), 2) + self.assertEqual(len(split_C02F9_r2['split_fastq']), 2) def test_generate_fastqs(self): - extract = condorfastq.CondorFastqExtract('host', - FAKE_APIDATA, - self.tempdir, - self.logdir) - extract.api = FakeApi() - result_map = [('11154', '/notarealplace')] - commands = extract.build_condor_arguments(result_map) + commands = self.extract.build_condor_arguments(self.result_map) srf = commands['srf'] qseq = commands['qseq'] split = commands['split_fastq'] - self.failUnlessEqual(len(srf), 1) - self.failUnlessEqual(len(qseq), 3) - self.failUnlessEqual(len(split), 2) + self.assertEqual(len(srf), 2) + self.assertEqual(len(qseq), 3) + self.assertEqual(len(split), 4) - srf_data = {u'/notarealplace/11154_30221AAXX_c33_l4.fastq': - [u'30221AAXX', - u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'], - } + srf_data = { + os.path.join(self.result_map['11154'], + '11154_30221AAXX_c33_l4.fastq'): { + 'mid': None, + 'ispaired': False, + 'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'], + 'flowcell': u'30221AAXX', + 'target': os.path.join(self.result_map['11154'], + u'11154_30221AAXX_c33_l4.fastq'), + }, + os.path.join(self.result_map['11154'], + '11154_30DY0AAXX_c151_l8_r1.fastq'): { + 'mid': None, + 'ispaired': True, + 'flowcell': u'30DY0AAXX', + 'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'], + 'mid': 76, + 'target': + os.path.join(self.result_map['11154'], + u'11154_30DY0AAXX_c151_l8_r1.fastq'), + 'target_right': + os.path.join(self.result_map['11154'], + u'11154_30DY0AAXX_c151_l8_r2.fastq'), + } + } for args in srf: - args = extract_argument_list(args) - expected = srf_data[args[3]] - self.failUnless(expected[0] in args[5]) - self.failUnless(expected[1] in args[0]) - - qseq_data = {u'/notarealplace/11154_42JUYAAXX_c76_l5_r1.fastq': - [u'42JUYAAXX', - u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'], - u'/notarealplace/11154_61MJTAAXX_c76_l6.fastq': - ['61MJTAAXX', - 'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'], - u'/notarealplace/11154_42JUYAAXX_c76_l5_r2.fastq': - ['42JUYAAXX', - 'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'], - } + expected = srf_data[args['target']] + self.assertEqual(args['ispaired'], expected['ispaired']) + self.assertEqual(len(args['sources']), 1) + _, source_filename = os.path.split(args['sources'][0]) + self.assertEqual(source_filename, expected['sources'][0]) + self.assertEqual(args['target'], expected['target']) + if args['ispaired']: + self.assertEqual(args['target_right'], + expected['target_right']) + if 'mid' in expected: + self.assertEqual(args['mid'], expected['mid']) + + qseq_data = { + os.path.join(self.result_map['11154'], + '11154_42JUYAAXX_c76_l5_r1.fastq'): { + 'istar': True, + 'ispaired': True, + 'sources': [ + u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'] + }, + os.path.join(self.result_map['11154'], + '11154_42JUYAAXX_c76_l5_r2.fastq'): { + 'istar': True, + 'ispaired': True, + 'sources': [ + u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'] + }, + os.path.join(self.result_map['11154'], + '11154_61MJTAAXX_c76_l6.fastq'): { + 'istar': True, + 'ispaired': False, + 'sources': [ + u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'], + }, + } for args in qseq: - args = extract_argument_list(args) - expected = qseq_data[args[1]] - self.failUnless(expected[0] in args[3]) - self.failUnless(expected[1] in args[5]) - - split_data ={u'/notarealplace/11154_C02F9ACXX_c202_l3_r2.fastq': - [u'11154_NoIndex_L003_R2_001.fastq.gz', - u'11154_NoIndex_L003_R2_002.fastq.gz'], - u'/notarealplace/11154_C02F9ACXX_c202_l3_r1.fastq': - [u'11154_NoIndex_L003_R1_001.fastq.gz', - u'11154_NoIndex_L003_R1_002.fastq.gz'], - } - for args in split: - args = extract_argument_list(args) - expected = split_data[args[1]] - self.failUnless(expected[0] in args[2]) - self.failUnless(expected[1] in args[3]) - - #print '-------commands---------' - #pprint (commands) - -def extract_argument_list(condor_argument): - args = condor_argument.split() - # eat the command name, and the trailing queue - return args[1:-1] + expected = qseq_data[args['target']] + self.assertEqual(args['istar'], expected['istar']) + self.assertEqual(args['ispaired'], expected['ispaired']) + for i in range(len(expected['sources'])): + _, filename = os.path.split(args['sources'][i]) + self.assertEqual(filename, expected['sources'][i]) + + + split_test = dict((( x['target'], x) for x in + [{'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz', + u'11154_NoIndex_L003_R1_002.fastq.gz'], + 'pyscript': 'desplit_fastq.pyc', + 'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'}, + {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz', + u'11154_NoIndex_L003_R2_002.fastq.gz'], + 'pyscript': 'desplit_fastq.pyc', + 'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}, + {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz', + u'12345_CGATGT_L003_R1_002.fastq.gz', + u'12345_CGATGT_L003_R1_003.fastq.gz', + ], + 'pyscript': 'desplit_fastq.pyc', + 'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'}, + {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz', + u'12345_CGATGT_L003_R2_002.fastq.gz', + u'12345_CGATGT_L003_R2_003.fastq.gz', + ], + 'pyscript': 'desplit_fastq.pyc', + 'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'} + ] + )) + for arg in split: + _, target = os.path.split(arg['target']) + pyscript = split_test[target]['pyscript'] + self.assertTrue(arg['pyscript'].endswith(pyscript)) + filename = split_test[target]['target'] + self.assertTrue(arg['target'].endswith(filename)) + for s_index in range(len(arg['sources'])): + s1 = arg['sources'][s_index] + s2 = split_test[target]['sources'][s_index] + self.assertTrue(s1.endswith(s2)) + + def test_create_scripts(self): + self.extract.create_scripts(self.result_map) + + self.assertTrue(os.path.exists('srf.condor')) + with open('srf.condor', 'r') as srf: + arguments = [ l for l in srf if l.startswith('argument') ] + arguments.sort() + self.assertEqual(len(arguments), 2) + self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq' + in arguments[0]) + self.assertTrue( + 'sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in + arguments[1]) + + self.assertTrue(os.path.exists('qseq.condor')) + with open('qseq.condor', 'r') as srf: + arguments = [ l for l in srf if l.startswith('argument') ] + arguments.sort() + self.assertEqual(len(arguments), 3) + self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in + arguments[0]) + self.assertTrue( + 'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in + arguments[1]) + self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in + arguments[2]) + + self.assertTrue(os.path.exists('split_fastq.condor')) + with open('split_fastq.condor', 'r') as split: + arguments = [ l for l in split if l.startswith('argument') ] + arguments.sort() + self.assertEqual(len(arguments), 4) + # Lane 3 Read 1 + self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in \ + arguments[0]) + # Lane 3 Read 2 + self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in \ + arguments[1]) + # Lane 3 Read 1 + self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2]) + self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2]) + self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2]) + self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2]) + + # Lane 3 Read 2 + self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3]) + self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3]) + self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3]) + self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3]) + def suite(): - suite = unittest.makeSuite(TestCondorFastq, 'test') + from unittest2 import TestSuite, defaultTestLoader + suite = TestSuite() + suite.addTests(defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)) return suite if __name__ == "__main__": - unittest.main(defaultTest='suite') - + from unittest2 import main + main(defaultTest='suite')