Fix matching scanned sequence files to library IDs work for hiseq runs.

author Diane Trout <diane@caltech.edu>

Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)

committer Diane Trout <diane@caltech.edu>

Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
author Diane Trout <diane@caltech.edu>
Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
committer Diane Trout <diane@caltech.edu>
Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
diff --git a/htsworkflow/submission/condorfastq.py b/htsworkflow/submission/condorfastq.py

index 64eb6a1892a93ab086adb377552db4a828a2a974..5ed9243096b9061c5ec789651fca472d0268fdf9 100644 (file)
--- a/htsworkflow/submission/condorfastq.py
+++ b/htsworkflow/submission/condorfastq.py
@@ -2,26 +2,34 @@
  """
  import logging
  import os
-from pprint import pformat
+from pprint import pformat,pprint
  import sys
  import types
+from urlparse import urljoin, urlparse
  
-from htsworkflow.pipelines.sequences import scan_for_sequences
+from htsworkflow.pipelines.sequences import scan_for_sequences, \
+     update_model_sequence_library
  from htsworkflow.pipelines.samplekey import SampleKey
  from htsworkflow.pipelines import qseq2fastq
  from htsworkflow.pipelines import srf2fastq
  from htsworkflow.pipelines import desplit_fastq
-from htsworkflow.util.api import HtswApi
+from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
+     fromTypedNode, \
+     libraryOntology, \
+     stripNamespace, \
+     rdfNS
  from htsworkflow.util.conversion import parse_flowcell_id
  
  from django.conf import settings
  from django.template import Context, loader
  
+import RDF
+
  LOGGER = logging.getLogger(__name__)
  
  
  class CondorFastqExtract(object):
-    def __init__(self, host, apidata, sequences_path,
+    def __init__(self, host, sequences_path,
                   log_path='log',
                   force=False):
          """Extract fastqs from results archive
@@ -33,7 +41,8 @@ class CondorFastqExtract(object):
            log_path (str): where to put condor log files
            force (bool): do we force overwriting current files?
          """
-        self.api = HtswApi(host, apidata)
+        self.host = host
+        self.model = get_model()
          self.sequences_path = sequences_path
          self.log_path = log_path
          self.force = force
@@ -62,7 +71,7 @@ class CondorFastqExtract(object):
                           'logdir': self.log_path,
                           'env': env,
                           'args': condor_entries[script_type],
-                         'root_url': self.api.root_url,
+                         'root_url': self.host,
                           }
              context = Context(variables)
  
@@ -79,8 +88,8 @@ class CondorFastqExtract(object):
                              'split_fastq': self.condor_desplit_fastq
                              }
          by_sample = {}
-        lib_db = self.find_archive_sequence_files(result_map)
-        needed_targets = self.find_missing_targets(result_map, lib_db)
+        sequences = self.find_archive_sequence_files(result_map)
+        needed_targets = self.find_missing_targets(result_map, sequences)
  
          for target_pathname, available_sources in needed_targets.items():
              LOGGER.debug(' target : %s' % (target_pathname,))
@@ -98,7 +107,7 @@ class CondorFastqExtract(object):
                      condor_entries.setdefault(condor_type, []).append(
                          conversion(sources, target_pathname))
                      for s in sources:
-                        by_sample.setdefault(s.lane_id,[]).append(
+                        by_sample.setdefault(s.lane_number,[]).append(
                              target_pathname)
              else:
                  print " need file", target_pathname
@@ -110,41 +119,83 @@ class CondorFastqExtract(object):
          """
          Find archived sequence files associated with our results.
          """
-        LOGGER.debug("Searching for sequence files in: %s" %(self.sequences_path,))
-
-        lib_db = {}
-        seq_dirs = set()
-        candidate_lanes = {}
+        self.import_libraries(result_map)
+        flowcell_ids = self.find_relavant_flowcell_ids()
+        self.import_sequences(flowcell_ids)
+
+        query = RDF.SPARQLQuery("""
+        prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+
+        select ?filenode ?filetype ?cycle ?lane_number ?read
+               ?library  ?library_id
+               ?flowcell ?flowcell_id ?read_length
+               ?flowcell_type ?flowcell_status
+        where {
+            ?filenode libns:cycle ?cycle ;
+                      libns:lane_number ?lane_number ;
+                      libns:read ?read ;
+                      libns:flowcell ?flowcell ;
+                      libns:flowcell_id ?flowcell_id ;
+                      libns:library ?library ;
+                      libns:library_id ?library_id ;
+                      rdf:type ?filetype ;
+                      a libns:raw_file .
+            ?flowcell libns:read_length ?read_length ;
+                      libns:flowcell_type ?flowcell_type .
+            OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status }
+            FILTER(?filetype != libns:raw_file)
+        }
+        """)
+        results = []
+        for r in query.execute(self.model):
+            library_id = fromTypedNode(r['library_id'])
+            if library_id in result_map:
+                results.append(SequenceResult(r))
+        return results
+
+    def import_libraries(self, result_map):
          for lib_id in result_map.keys():
-            lib_info = self.api.get_library(lib_id)
-            lib_info['lanes'] = {}
-            lib_db[lib_id] = lib_info
-
-            for lane in lib_info['lane_set']:
-                lane_key = (lane['flowcell'], lane['lane_number'])
-                candidate_lanes[lane_key] = (lib_id, lane['lane_id'])
-                seq_dirs.add(os.path.join(self.sequences_path,
-                                             'flowcells',
-                                             lane['flowcell']))
-        LOGGER.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
-        candidate_seq_list = scan_for_sequences(seq_dirs)
-
-        # at this point we have too many sequences as scan_for_sequences
-        # returns all the sequences in a flowcell directory
-        # so lets filter out the extras
-
-        for seq in candidate_seq_list:
-            lane_key = (seq.flowcell, seq.lane)
-            candidate_key = candidate_lanes.get(lane_key, None)
-            if candidate_key is not None:
-                lib_id, lane_id = candidate_key
-                seq.lane_id = lane_id
-                lib_info = lib_db[lib_id]
-                lib_info['lanes'].setdefault(lane_key, set()).add(seq)
-
-        return lib_db
-
-    def find_missing_targets(self, result_map, lib_db):
+            liburl = urljoin(self.host, 'library/%s/' % (lib_id,))
+            library = RDF.Node(RDF.Uri(liburl))
+            self.import_library(library)
+
+    def import_library(self, library):
+        """Import library data into our model if we don't have it already
+        """
+        q = RDF.Statement(library, rdfNS['type'], libraryOntology['library'])
+        if not self.model.contains_statement(q):
+            load_into_model(self.model, 'rdfa', library)
+
+    def find_relavant_flowcell_ids(self):
+        """Generate set of flowcell ids that had samples of interest on them
+        """
+        flowcell_query =RDF.SPARQLQuery("""
+prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+select distinct ?library ?flowcell ?flowcell_id
+WHERE {
+  ?library a libns:library ;
+           libns:has_lane ?lane .
+  ?lane libns:flowcell ?flowcell .
+  ?flowcell libns:flowcell_id ?flowcell_id .
+}""")
+        flowcell_ids = set()
+        for r in flowcell_query.execute(self.model):
+            flowcell_ids.add( fromTypedNode(r['flowcell_id']) )
+        LOGGER.debug("Flowcells = %s" %(unicode(flowcell_ids)))
+        return flowcell_ids
+
+    def import_sequences(self, flowcell_ids):
+        seq_dirs = ( os.path.join(self.sequences_path, f)
+                     for f in flowcell_ids )
+        sequences = scan_for_sequences(seq_dirs)
+        for seq in sequences:
+            seq.save_to_model(self.model)
+        update_model_sequence_library(self.model, self.host)
+
+    def find_missing_targets(self, result_map, raw_files):
          """
          Check if the sequence file exists.
          This requires computing what the sequence name is and checking
@@ -156,61 +207,48 @@ class CondorFastqExtract(object):
          fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
          # find what targets we're missing
          needed_targets = {}
-        for lib_id in result_map.keys():
-            result_dir = result_map[lib_id]
-            lib = lib_db[lib_id]
-            lane_dict = make_lane_dict(lib_db, lib_id)
-
-            for lane_key, sequences in lib['lanes'].items():
-                for seq in sequences:
-                    seq.paired = lane_dict[seq.flowcell]['paired_end']
-                    lane_status = lane_dict[seq.flowcell]['status']
-
-                    if seq.paired and seq.read is None:
-                        seq.read = 1
-                    filename_attributes = {
-                        'flowcell': seq.flowcell,
-                        'lib_id': lib_id,
-                        'lane': seq.lane,
-                        'read': seq.read,
-                        'cycle': seq.cycle
-                        }
-                    # skip bad runs
-                    if lane_status == 'Failed':
-                        continue
-                    if seq.flowcell == '30DY0AAXX':
-                        # 30DY0 only ran for 151 bases instead of 152
-                        # it is actually 76 1st read, 75 2nd read
-                        seq.mid_point = 76
-
-                    # end filters
-                    if seq.paired:
-                        target_name = fastq_paired_template % \
-                                      filename_attributes
-                    else:
-                        target_name = fastq_single_template % \
-                                      filename_attributes
-
-                    target_pathname = os.path.join(result_dir, target_name)
-                    if self.force or not os.path.exists(target_pathname):
-                        t = needed_targets.setdefault(target_pathname, {})
-                        t.setdefault(seq.filetype, []).append(seq)
+        for seq in raw_files:
+            if not seq.isgood:
+                continue
+            filename_attributes = {
+                'flowcell': seq.flowcell_id,
+                'lib_id': seq.library_id,
+                'lane': seq.lane_number,
+                'read': seq.read,
+                'cycle': seq.cycle
+            }
+
+            if seq.ispaired:
+                target_name = fastq_paired_template % \
+                              filename_attributes
+            else:
+                target_name = fastq_single_template % \
+                              filename_attributes
  
-        return needed_targets
+            result_dir = result_map[seq.library_id]
+            target_pathname = os.path.join(result_dir, target_name)
+            if self.force or not os.path.exists(target_pathname):
+                t = needed_targets.setdefault(target_pathname, {})
+                t.setdefault(seq.filetype, []).append(seq)
  
+        return needed_targets
  
      def condor_srf_to_fastq(self, sources, target_pathname):
          if len(sources) > 1:
              raise ValueError("srf to fastq can only handle one file")
  
+        mid_point = None
+        if sources[0].flowcell_id == '30DY0AAXX':
+            mid_point = 76
+
          return {
-            'sources': [os.path.abspath(sources[0].path)],
+            'sources': [sources[0].path],
              'pyscript': srf2fastq.__file__,
-            'flowcell': sources[0].flowcell,
-            'ispaired': sources[0].paired,
+            'flowcell': sources[0].flowcell_id,
+            'ispaired': sources[0].ispaired,
              'target': target_pathname,
              'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'),
-            'mid': getattr(sources[0], 'mid_point', None),
+            'mid': mid_point,
              'force': self.force,
          }
  
@@ -221,10 +259,10 @@ class CondorFastqExtract(object):
          paths.sort()
          return {
              'pyscript': qseq2fastq.__file__,
-            'flowcell': sources[0].flowcell,
+            'flowcell': sources[0].flowcell_id,
              'target': target_pathname,
              'sources': paths,
-            'ispaired': sources[0].paired,
+            'ispaired': sources[0].ispaired,
              'istar': len(sources) == 1,
          }
  
@@ -237,11 +275,9 @@ class CondorFastqExtract(object):
              'pyscript': desplit_fastq.__file__,
              'target': target_pathname,
              'sources': paths,
-            'ispaired': sources[0].paired,
+            'ispaired': sources[0].ispaired,
          }
  
-    def lane_rdf(self, sources, target_pathname):
-        pass
  
  def make_lane_dict(lib_db, lib_id):
      """
@@ -255,3 +291,48 @@ def make_lane_dict(lib_db, lib_id):
          result.append((lane['flowcell'], lane))
      return dict(result)
  
+class SequenceResult(object):
+    """Convert the sparql query result from find_archive_sequence_files
+    """
+    def __init__(self, result):
+        self.filenode = result['filenode']
+        self._filetype = result['filetype']
+        self.cycle = fromTypedNode(result['cycle'])
+        self.lane_number = fromTypedNode(result['lane_number'])
+        self.read = fromTypedNode(result['read'])
+        if type(self.read) in types.StringTypes:
+            self.read = 1
+        self.library = result['library']
+        self.library_id = fromTypedNode(result['library_id'])
+        self.flowcell = result['flowcell']
+        self.flowcell_id = fromTypedNode(result['flowcell_id'])
+        self.flowcell_type = fromTypedNode(result['flowcell_type'])
+        self.flowcell_status = fromTypedNode(result['flowcell_status'])
+
+    def _is_good(self):
+        """is this sequence / flowcell 'good enough'"""
+        if self.flowcell_status is not None and \
+           self.flowcell_status.lower() == "failed":
+            return False
+        return True
+    isgood = property(_is_good)
+
+    def _get_ispaired(self):
+        if self.flowcell_type.lower() == "paired":
+            return True
+        else:
+            return False
+    ispaired = property(_get_ispaired)
+
+    def _get_filetype(self):
+        return stripNamespace(libraryOntology, self._filetype)
+    filetype = property(_get_filetype)
+
+    def _get_path(self):
+        url = urlparse(str(self.filenode.uri))
+        if url.scheme == 'file':
+            return url.path
+        else:
+            errmsg = u"Unsupported scheme {0} for {1}"
+            raise ValueError(errmsg.format(url.scheme, unicode(url)))
+    path = property(_get_path)
diff --git a/htsworkflow/submission/test/test_condorfastq.py b/htsworkflow/submission/test/test_condorfastq.py

index 6aaf5846de8f3eb86e4f73145e487c73bc2ae914..24d6e4982018dd23ac8e8746d7ade1856ae3d4e7 100644 (file)
--- a/htsworkflow/submission/test/test_condorfastq.py
+++ b/htsworkflow/submission/test/test_condorfastq.py
@@ -7,8 +7,9 @@ import shutil
  import tempfile
  import unittest
  
-from htsworkflow.submission import condorfastq
+from htsworkflow.submission.condorfastq import CondorFastqExtract
  from htsworkflow.submission.results import ResultMap
+from htsworkflow.util.rdfhelp import load_string_into_model, dump_model
  
  FCDIRS = [
      'C02F9ACXX',
@@ -16,6 +17,7 @@ FCDIRS = [
      'C02F9ACXX/C1-202/Project_11154',
      'C02F9ACXX/C1-202/Project_12342_Index1',
      'C02F9ACXX/C1-202/Project_12342_Index2',
+    'C02F9ACXX/C1-202/Project_12345',
      '42JUYAAXX',
      '42JUYAAXX/C1-76',
      '30221AAXX',
@@ -31,9 +33,18 @@ DATAFILES = [
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
@@ -77,77 +88,288 @@ DATAFILES = [
      '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
  ]
  
-LIBDATA = {
-    '11154':{u'antibody_id': None,
-             u'cell_line': u'Unknown',
-             u'cell_line_id': 1,
-             u'experiment_type': u'RNA-seq',
-             u'experiment_type_id': 4,
-             u'gel_cut_size': 300,
-             u'hidden': False,
-             u'id': u'11154',
-             u'insert_size': 200,
-             u'lane_set': [{u'flowcell': u'30221AAXX',
-                            u'lane_number': 4,
-                            u'lane_id': 3400,
-                            u'paired_end': False,
-                            u'read_length': 33,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'42JUYAAXX',
-                            u'lane_number': 5,
-                            u'lane_id': 4200,
-                            u'paired_end': True,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'61MJTAAXX',
-                            u'lane_number': 6,
-                            u'lane_id': 6600,
-                            u'paired_end': False,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'30DY0AAXX',
-                            u'lane_number': 8,
-                            u'lane_id': 3800,
-                            u'paired_end': True,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'C02F9ACXX',
-                            u'lane_number': 3,
-                            u'lane_id': 12300,
-                            u'paired_end': True,
-                            u'read_length': 101,
-                            u'status': u'Unknown',
-                            u'status_code': None}],
-             u'library_id': u'11154',
-             u'library_name': u'Paired ends ASDF ',
-             u'library_species': u'Mus musculus',
-             u'library_species_id': 9,
-             u'library_type': u'Paired End (non-multiplexed)',
-             u'library_type_id': 2,
-             u'made_by': u'Gary Gygax',
-             u'made_for': u'TSR',
-             u'notes': u'300 bp gel fragment',
-             u'replicate': 1,
-             u'stopping_point': u'1Aa',
-             u'successful_pM': None,
-             u'undiluted_concentration': u'29.7'}
-    }
-
-FAKE_APIDATA = {'apiid':0, 'apikey': 'foo'}
-
-class FakeApi(object):
-    def __init__(self, *args, **kwargs):
-        self.root_url = 'http://localhost'
-
-    def get_library(self, libid):
-        lib_data = LIBDATA[libid]
-        return copy.deepcopy(lib_data)
-
-
+lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .
+
+<http://localhost/flowcell/30221AAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 33 ;
+        libns:flowcell_type "Single"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/3401> ;
+        libns:has_lane <http://localhost/lane/3402> ;
+        libns:has_lane <http://localhost/lane/3403> ;
+        libns:has_lane <http://localhost/lane/3404> ;
+        libns:has_lane <http://localhost/lane/3405> ;
+        libns:has_lane <http://localhost/lane/3406> ;
+        libns:has_lane <http://localhost/lane/3407> ;
+        libns:has_lane <http://localhost/lane/3408> ;
+        libns:flowcell_id "30221AAXX"@en .
+
+<http://localhost/lane/3401>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/3402>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/3403>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/3404>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 4 .
+        # paired_end 1;
+        # read_length 33;
+        # status "Unknown"@en .
+<http://localhost/lane/3405>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/3406>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/3407>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/3408>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/42JUYAAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/4201> ;
+        libns:has_lane <http://localhost/lane/4202> ;
+        libns:has_lane <http://localhost/lane/4203> ;
+        libns:has_lane <http://localhost/lane/4204> ;
+        libns:has_lane <http://localhost/lane/4205> ;
+        libns:has_lane <http://localhost/lane/4206> ;
+        libns:has_lane <http://localhost/lane/4207> ;
+        libns:has_lane <http://localhost/lane/4208> ;
+        libns:flowcell_id "42JUYAAXX"@en .
+
+<http://localhost/lane/4201>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/4202>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/4203>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/4204>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/4205>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 5 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+<http://localhost/lane/4206>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/4207>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/4208>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/61MJTAAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Single"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/6601> ;
+        libns:has_lane <http://localhost/lane/6602> ;
+        libns:has_lane <http://localhost/lane/6603> ;
+        libns:has_lane <http://localhost/lane/6604> ;
+        libns:has_lane <http://localhost/lane/6605> ;
+        libns:has_lane <http://localhost/lane/6606> ;
+        libns:has_lane <http://localhost/lane/6607> ;
+        libns:has_lane <http://localhost/lane/6608> ;
+        libns:flowcell_id "61MJTAAXX"@en .
+
+<http://localhost/lane/6601>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/6602>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/6603>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/6604>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/6605>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/6606>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 6 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+<http://localhost/lane/6607>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/6608>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/30DY0AAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/3801> ;
+        libns:has_lane <http://localhost/lane/3802> ;
+        libns:has_lane <http://localhost/lane/3803> ;
+        libns:has_lane <http://localhost/lane/3804> ;
+        libns:has_lane <http://localhost/lane/3805> ;
+        libns:has_lane <http://localhost/lane/3806> ;
+        libns:has_lane <http://localhost/lane/3807> ;
+        libns:has_lane <http://localhost/lane/3808> ;
+        libns:flowcell_id "30DY0AAXX"@en .
+
+<http://localhost/lane/3801>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/3802>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/3803>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/3804>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/3805>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/3806>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/3807>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/3808>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 8 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+
+<http://localhost/flowcell/C02F9ACXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 101 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/12300> ;
+        libns:has_lane <http://localhost/lane/12500> ;
+        libns:flowcell_id "C02F9ACXX"@en .
+
+<http://localhost/lane/12300>
+        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+        libns:library <http://localhost/library/12345/> ;
+        libns:lane_number 3 .
+        # paired_end 1;
+        # read_length 101;
+        # status "Unknown"@en .
+
+<http://localhost/lane/12500>
+        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 3 .
+        # paired_end 1;
+        # read_length 101;
+        # status "Unknown"@en .
+
+<http://localhost/library/11154/>
+        a libns:library ;
+        libns:affiliation "TSR"@en;
+        libns:concentration "29.7";
+        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+        libns:experiment_type "RNA-seq"@en ;
+        libns:gel_cut 300 ;
+        libns:has_lane <http://localhost/lane/3404> ;
+        libns:has_lane <http://localhost/lane/4205> ;
+        libns:has_lane <http://localhost/lane/6606> ;
+        libns:has_lane <http://localhost/lane/3808> ;
+        libns:has_lane <http://localhost/lane/12500> ;
+        libns:insert_size 2000 ;
+        libns:library_id "11154"@en ;
+        libns:library_type "Paired End (Multiplexed)"@en ;
+        libns:made_by "Gary Gygax"@en ;
+        libns:name "Paired Ends ASDF"@en ;
+        libns:replicate "1"@en;
+        libns:species "Mus musculus"@en ;
+        libns:stopping_point "Completed"@en ;
+        libns:total_unique_locations 8841201 .
+        # cell_line
+
+
+<http://localhost/library/12345/>
+        a libns:library ;
+        libns:affiliation "TSR"@en;
+        libns:concentration "12.345";
+        libns:cell_line "Unknown"@en ;
+        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+        libns:experiment_type "RNA-seq"@en ;
+        libns:gel_cut 300 ;
+        libns:has_lane <http://localhost/lane/12300> ;
+        libns:insert_size 2000 ;
+        libns:library_id "12345"@en ;
+        libns:library_type "Paired End (Multiplexed)"@en ;
+        libns:made_by "Gary Gygax"@en ;
+        libns:name "Paired Ends THING"@en ;
+        libns:replicate "1"@en;
+        libns:species "Mus musculus"@en ;
+        libns:stopping_point "Completed"@en ;
+        libns:total_unique_locations 8841201 .
+        # cell_line
+"""
+HOST = "http://localhost"
  
  class TestCondorFastq(unittest.TestCase):
      def setUp(self):
@@ -168,141 +390,146 @@ class TestCondorFastq(unittest.TestCase):
              with open(filename, 'w') as stream:
                  stream.write('testfile')
  
-        self.subname = unicode('sub-11154')
-        self.subdir = os.path.join(self.tempdir, self.subname)
-        os.mkdir(self.subdir)
-
          self.result_map = ResultMap()
-        self.result_map['11154'] = self.subname
+        for lib_id in [u'11154', u'12345']:
+            subname = 'sub-%s' % (lib_id,)
+            sub_dir = os.path.join(self.tempdir, subname)
+            os.mkdir(sub_dir)
+            self.result_map[lib_id] =  sub_dir
+
+        self.extract = CondorFastqExtract(HOST,
+                                          self.flowcelldir,
+                                          self.logdir)
+        load_string_into_model(self.extract.model, 'turtle', lib_turtle)
  
      def tearDown(self):
          shutil.rmtree(self.tempdir)
          os.chdir(self.cwd)
  
+    def test_find_relavant_flowcell_ids(self):
+        expected = set(('30221AAXX',
+                        '42JUYAAXX',
+                        '61MJTAAXX',
+                        '30DY0AAXX',
+                        'C02F9ACXX'))
+        flowcell_ids = self.extract.find_relavant_flowcell_ids()
+        self.assertEqual(flowcell_ids, expected)
+
      def test_find_archive_sequence(self):
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-
-        lib_db = extract.find_archive_sequence_files(self.result_map)
-
-        self.failUnlessEqual(len(lib_db['11154']['lanes']), 5)
-        lanes = [
-            lib_db['11154']['lanes'][(u'30221AAXX', 4)],
-            lib_db['11154']['lanes'][(u'42JUYAAXX', 5)],
-            lib_db['11154']['lanes'][(u'61MJTAAXX', 6)],
-            lib_db['11154']['lanes'][(u'30DY0AAXX', 8)],
-            lib_db['11154']['lanes'][(u'C02F9ACXX', 3)],
-        ]
-        self.failUnlessEqual(len(lanes[0]), 1)
-        self.failUnlessEqual(len(lanes[1]), 2)
-        self.failUnlessEqual(len(lanes[2]), 1)
-        self.failUnlessEqual(len(lanes[3]), 1)
-        self.failUnlessEqual(len(lanes[4]), 4)
+        seqs = self.extract.find_archive_sequence_files(self.result_map)
+
+        expected = set([
+            (u'11154', u'42JUYAAXX', 5, 1, 76, True, 'qseq'),
+            (u'11154', u'42JUYAAXX', 5, 2, 76, True, 'qseq'),
+            (u'11154', u'61MJTAAXX', 6, 1, 76, False, 'qseq'),
+            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'11154', u'30221AAXX', 4, 1, 33, False, 'srf'),
+            (u'11154', u'30DY0AAXX', 8, 1, 151, True, 'srf')
+        ])
+        found = set([(l.library_id, l.flowcell_id, l.lane_number, l.read, l.cycle, l.ispaired, l.filetype) for l in seqs])
+        self.assertEqual(expected, found)
  
      def test_find_needed_targets(self):
+        lib_db = self.extract.find_archive_sequence_files(self.result_map)
  
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        lib_db = extract.find_archive_sequence_files(self.result_map)
-
-        needed_targets = extract.find_missing_targets(self.result_map,
-                                                      lib_db)
-        self.failUnlessEqual(len(needed_targets), 7)
+        needed_targets = self.extract.find_missing_targets(self.result_map,
+                                                           lib_db)
+        self.assertEqual(len(needed_targets), 9)
          srf_30221 = needed_targets[
-            self.subname + u'/11154_30221AAXX_c33_l4.fastq']
+            self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq']
          qseq_42JUY_r1 = needed_targets[
-            self.subname + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
+            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
          qseq_42JUY_r2 = needed_targets[
-            self.subname + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
+            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
          qseq_61MJT = needed_targets[
-            self.subname + u'/11154_61MJTAAXX_c76_l6.fastq']
+            self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq']
          split_C02F9_r1 = needed_targets[
-            self.subname + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
+            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
          split_C02F9_r2 = needed_targets[
-            self.subname + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
+            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
  
-        self.failUnlessEqual(len(srf_30221['srf']), 1)
-        self.failUnlessEqual(len(qseq_42JUY_r1['qseq']), 1)
-        self.failUnlessEqual(len(qseq_42JUY_r2['qseq']), 1)
-        self.failUnlessEqual(len(qseq_61MJT['qseq']), 1)
-        self.failUnlessEqual(len(split_C02F9_r1['split_fastq']), 2)
-        self.failUnlessEqual(len(split_C02F9_r2['split_fastq']), 2)
-
-        #print '-------needed targets---------'
-        #pprint(needed_targets)
+        self.assertEqual(len(srf_30221['srf']), 1)
+        self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
+        self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
+        self.assertEqual(len(qseq_61MJT['qseq']), 1)
+        self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
+        self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)
  
      def test_generate_fastqs(self):
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        commands = extract.build_condor_arguments(self.result_map)
+        commands = self.extract.build_condor_arguments(self.result_map)
  
          srf = commands['srf']
          qseq = commands['qseq']
          split = commands['split_fastq']
  
-        self.failUnlessEqual(len(srf), 2)
-        self.failUnlessEqual(len(qseq), 3)
-        self.failUnlessEqual(len(split), 2)
+        self.assertEqual(len(srf), 2)
+        self.assertEqual(len(qseq), 3)
+        self.assertEqual(len(split), 4)
  
          srf_data = {
-            os.path.join(self.subname, '11154_30221AAXX_c33_l4.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_30221AAXX_c33_l4.fastq'): {
                  'mid': None,
                  'ispaired': False,
                  'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                  'flowcell': u'30221AAXX',
-                'target': os.path.join(self.subname,
+                'target': os.path.join(self.result_map['11154'],
                                         u'11154_30221AAXX_c33_l4.fastq'),
              },
-            os.path.join(self.subname, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                  'mid': None,
                  'ispaired': True,
                  'flowcell': u'30DY0AAXX',
                  'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                  'mid': 76,
                  'target':
-                    os.path.join(self.subname,
+                    os.path.join(self.result_map['11154'],
                                   u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                  'target_right':
-                    os.path.join(self.subname,
+                    os.path.join(self.result_map['11154'],
                                   u'11154_30DY0AAXX_c151_l8_r2.fastq'),
              }
          }
          for args in srf:
              expected = srf_data[args['target']]
-            self.failUnlessEqual(args['ispaired'], expected['ispaired'])
-            self.failUnlessEqual(len(args['sources']), 1)
+            self.assertEqual(args['ispaired'], expected['ispaired'])
+            self.assertEqual(len(args['sources']), 1)
              _, source_filename = os.path.split(args['sources'][0])
-            self.failUnlessEqual(source_filename, expected['sources'][0])
-            self.failUnlessEqual(args['target'], expected['target'])
+            self.assertEqual(source_filename, expected['sources'][0])
+            self.assertEqual(args['target'], expected['target'])
              if args['ispaired']:
-                self.failUnlessEqual(args['target_right'],
+                self.assertEqual(args['target_right'],
                                       expected['target_right'])
              if 'mid' in expected:
-                self.failUnlessEqual(args['mid'], expected['mid'])
+                self.assertEqual(args['mid'], expected['mid'])
  
          qseq_data = {
-            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                  'istar': True,
                  'ispaired': True,
                  'sources': [
                      u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
              },
-            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                  'istar': True,
                  'ispaired': True,
                  'sources': [
                      u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
              },
-            os.path.join(self.subname, '11154_61MJTAAXX_c76_l6.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_61MJTAAXX_c76_l6.fastq'): {
                  'istar': True,
                  'ispaired': False,
                  'sources': [
@@ -311,11 +538,11 @@ class TestCondorFastq(unittest.TestCase):
          }
          for args in qseq:
              expected = qseq_data[args['target']]
-            self.failUnlessEqual(args['istar'], expected['istar'])
-            self.failUnlessEqual(args['ispaired'], expected['ispaired'])
+            self.assertEqual(args['istar'], expected['istar'])
+            self.assertEqual(args['ispaired'], expected['ispaired'])
              for i in range(len(expected['sources'])):
                  _, filename = os.path.split(args['sources'][i])
-                self.failUnlessEqual(filename, expected['sources'][i])
+                self.assertEqual(filename, expected['sources'][i])
  
  
          split_test = dict((( x['target'], x) for x in
@@ -326,64 +553,82 @@ class TestCondorFastq(unittest.TestCase):
              {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                           u'11154_NoIndex_L003_R2_002.fastq.gz'],
               'pyscript': 'desplit_fastq.pyc',
-             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}]
+             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
+            {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
+                         u'12345_CGATGT_L003_R1_002.fastq.gz',
+                         u'12345_CGATGT_L003_R1_003.fastq.gz',
+                         ],
+             'pyscript': 'desplit_fastq.pyc',
+             'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
+            {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
+                         u'12345_CGATGT_L003_R2_002.fastq.gz',
+                         u'12345_CGATGT_L003_R2_003.fastq.gz',
+                         ],
+             'pyscript': 'desplit_fastq.pyc',
+             'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'}
+             ]
           ))
          for arg in split:
              _, target = os.path.split(arg['target'])
              pyscript = split_test[target]['pyscript']
-            self.failUnless(arg['pyscript'].endswith(pyscript))
+            self.assertTrue(arg['pyscript'].endswith(pyscript))
              filename = split_test[target]['target']
-            self.failUnless(arg['target'].endswith(filename))
+            self.assertTrue(arg['target'].endswith(filename))
              for s_index in range(len(arg['sources'])):
                  s1 = arg['sources'][s_index]
                  s2 = split_test[target]['sources'][s_index]
-                self.failUnless(s1.endswith(s2))
-
-        #print '-------commands---------'
-        #pprint (commands)
+                self.assertTrue(s1.endswith(s2))
  
      def test_create_scripts(self):
-        os.chdir(self.tempdir)
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        extract.create_scripts(self.result_map)
-
-        self.failUnless(os.path.exists('srf.condor'))
+        self.extract.create_scripts(self.result_map)
+
+        self.assertTrue(os.path.exists('srf.condor'))
          with open('srf.condor', 'r') as srf:
              arguments = [ l for l in srf if l.startswith('argument') ]
              arguments.sort()
-            self.failUnlessEqual(len(arguments), 2)
-            self.failUnless('--single sub-11154/11154_30221AAXX_c33_l4.fastq'
+            self.assertEqual(len(arguments), 2)
+            self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq'
                              in arguments[0])
-            self.failUnless(
-                '--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
+            self.assertTrue(
+                'sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
                  arguments[1])
  
-        self.failUnless(os.path.exists('qseq.condor'))
+        self.assertTrue(os.path.exists('qseq.condor'))
          with open('qseq.condor', 'r') as srf:
              arguments = [ l for l in srf if l.startswith('argument') ]
              arguments.sort()
-            self.failUnlessEqual(len(arguments), 3)
-            self.failUnless('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
+            self.assertEqual(len(arguments), 3)
+            self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
                              arguments[0])
-            self.failUnless(
+            self.assertTrue(
                  'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
                  arguments[1])
-            self.failUnless('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
+            self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
                              arguments[2])
  
-        self.failUnless(os.path.exists('split_fastq.condor'))
+        self.assertTrue(os.path.exists('split_fastq.condor'))
          with open('split_fastq.condor', 'r') as split:
              arguments = [ l for l in split if l.startswith('argument') ]
              arguments.sort()
-            self.failUnlessEqual(len(arguments), 2)
-            self.failUnless('11154_NoIndex_L003_R1_001.fastq.gz' in \
+            self.assertEqual(len(arguments), 4)
+            # Lane 3 Read 1
+            self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in \
                              arguments[0])
-            self.failUnless('11154_NoIndex_L003_R2_002.fastq.gz' in \
+            # Lane 3 Read 2
+            self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in \
                              arguments[1])
+            # Lane 3 Read 1
+            self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2])
+            self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2])
+            self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2])
+            self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2])
+
+            # Lane 3 Read 2
+            self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3])
+            self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3])
+            self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3])
+            self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3])
+
  
  def suite():
      suite = unittest.makeSuite(TestCondorFastq, 'test')
author	Diane Trout <diane@caltech.edu>
	Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
committer	Diane Trout <diane@caltech.edu>
	Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
htsworkflow/submission/condorfastq.py		patch \| blob \| history
htsworkflow/submission/test/test_condorfastq.py		patch \| blob \| history