Fix matching scanned sequence files to library IDs work for hiseq runs.

author Diane Trout <diane@caltech.edu>

Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)

committer Diane Trout <diane@caltech.edu>

Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
author Diane Trout <diane@caltech.edu>
Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
committer Diane Trout <diane@caltech.edu>
Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
diff --git a/htsworkflow/submission/condorfastq.py b/htsworkflow/submission/condorfastq.py

index 64eb6a1892a93ab086adb377552db4a828a2a974..5ed9243096b9061c5ec789651fca472d0268fdf9 100644 (file)
--- a/htsworkflow/submission/condorfastq.py
+++ b/htsworkflow/submission/condorfastq.py
@@ -2,26 +2,34 @@
  """
  import logging
  import os
  """
  import logging
  import os
-from pprint import pformat
+from pprint import pformat,pprint
  import sys
  import types
  import sys
  import types
+from urlparse import urljoin, urlparse
  
  
-from htsworkflow.pipelines.sequences import scan_for_sequences
+from htsworkflow.pipelines.sequences import scan_for_sequences, \
+     update_model_sequence_library
  from htsworkflow.pipelines.samplekey import SampleKey
  from htsworkflow.pipelines import qseq2fastq
  from htsworkflow.pipelines import srf2fastq
  from htsworkflow.pipelines import desplit_fastq
  from htsworkflow.pipelines.samplekey import SampleKey
  from htsworkflow.pipelines import qseq2fastq
  from htsworkflow.pipelines import srf2fastq
  from htsworkflow.pipelines import desplit_fastq
-from htsworkflow.util.api import HtswApi
+from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
+     fromTypedNode, \
+     libraryOntology, \
+     stripNamespace, \
+     rdfNS
  from htsworkflow.util.conversion import parse_flowcell_id
  
  from django.conf import settings
  from django.template import Context, loader
  
  from htsworkflow.util.conversion import parse_flowcell_id
  
  from django.conf import settings
  from django.template import Context, loader
  
+import RDF
+
  LOGGER = logging.getLogger(__name__)
  
  
  class CondorFastqExtract(object):
  LOGGER = logging.getLogger(__name__)
  
  
  class CondorFastqExtract(object):
-    def __init__(self, host, apidata, sequences_path,
+    def __init__(self, host, sequences_path,
                   log_path='log',
                   force=False):
          """Extract fastqs from results archive
                   log_path='log',
                   force=False):
          """Extract fastqs from results archive
@@ -33,7 +41,8 @@ class CondorFastqExtract(object):
            log_path (str): where to put condor log files
            force (bool): do we force overwriting current files?
          """
            log_path (str): where to put condor log files
            force (bool): do we force overwriting current files?
          """
-        self.api = HtswApi(host, apidata)
+        self.host = host
+        self.model = get_model()
          self.sequences_path = sequences_path
          self.log_path = log_path
          self.force = force
          self.sequences_path = sequences_path
          self.log_path = log_path
          self.force = force
@@ -62,7 +71,7 @@ class CondorFastqExtract(object):
                           'logdir': self.log_path,
                           'env': env,
                           'args': condor_entries[script_type],
                           'logdir': self.log_path,
                           'env': env,
                           'args': condor_entries[script_type],
-                         'root_url': self.api.root_url,
+                         'root_url': self.host,
                           }
              context = Context(variables)
  
                           }
              context = Context(variables)
  
@@ -79,8 +88,8 @@ class CondorFastqExtract(object):
                              'split_fastq': self.condor_desplit_fastq
                              }
          by_sample = {}
                              'split_fastq': self.condor_desplit_fastq
                              }
          by_sample = {}
-        lib_db = self.find_archive_sequence_files(result_map)
-        needed_targets = self.find_missing_targets(result_map, lib_db)
+        sequences = self.find_archive_sequence_files(result_map)
+        needed_targets = self.find_missing_targets(result_map, sequences)
  
          for target_pathname, available_sources in needed_targets.items():
              LOGGER.debug(' target : %s' % (target_pathname,))
  
          for target_pathname, available_sources in needed_targets.items():
              LOGGER.debug(' target : %s' % (target_pathname,))
@@ -98,7 +107,7 @@ class CondorFastqExtract(object):
                      condor_entries.setdefault(condor_type, []).append(
                          conversion(sources, target_pathname))
                      for s in sources:
                      condor_entries.setdefault(condor_type, []).append(
                          conversion(sources, target_pathname))
                      for s in sources:
-                        by_sample.setdefault(s.lane_id,[]).append(
+                        by_sample.setdefault(s.lane_number,[]).append(
                              target_pathname)
              else:
                  print " need file", target_pathname
                              target_pathname)
              else:
                  print " need file", target_pathname
@@ -110,41 +119,83 @@ class CondorFastqExtract(object):
          """
          Find archived sequence files associated with our results.
          """
          """
          Find archived sequence files associated with our results.
          """
-        LOGGER.debug("Searching for sequence files in: %s" %(self.sequences_path,))
-
-        lib_db = {}
-        seq_dirs = set()
-        candidate_lanes = {}
+        self.import_libraries(result_map)
+        flowcell_ids = self.find_relavant_flowcell_ids()
+        self.import_sequences(flowcell_ids)
+
+        query = RDF.SPARQLQuery("""
+        prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+
+        select ?filenode ?filetype ?cycle ?lane_number ?read
+               ?library  ?library_id
+               ?flowcell ?flowcell_id ?read_length
+               ?flowcell_type ?flowcell_status
+        where {
+            ?filenode libns:cycle ?cycle ;
+                      libns:lane_number ?lane_number ;
+                      libns:read ?read ;
+                      libns:flowcell ?flowcell ;
+                      libns:flowcell_id ?flowcell_id ;
+                      libns:library ?library ;
+                      libns:library_id ?library_id ;
+                      rdf:type ?filetype ;
+                      a libns:raw_file .
+            ?flowcell libns:read_length ?read_length ;
+                      libns:flowcell_type ?flowcell_type .
+            OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status }
+            FILTER(?filetype != libns:raw_file)
+        }
+        """)
+        results = []
+        for r in query.execute(self.model):
+            library_id = fromTypedNode(r['library_id'])
+            if library_id in result_map:
+                results.append(SequenceResult(r))
+        return results
+
+    def import_libraries(self, result_map):
          for lib_id in result_map.keys():
          for lib_id in result_map.keys():
-            lib_info = self.api.get_library(lib_id)
-            lib_info['lanes'] = {}
-            lib_db[lib_id] = lib_info
-
-            for lane in lib_info['lane_set']:
-                lane_key = (lane['flowcell'], lane['lane_number'])
-                candidate_lanes[lane_key] = (lib_id, lane['lane_id'])
-                seq_dirs.add(os.path.join(self.sequences_path,
-                                             'flowcells',
-                                             lane['flowcell']))
-        LOGGER.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
-        candidate_seq_list = scan_for_sequences(seq_dirs)
-
-        # at this point we have too many sequences as scan_for_sequences
-        # returns all the sequences in a flowcell directory
-        # so lets filter out the extras
-
-        for seq in candidate_seq_list:
-            lane_key = (seq.flowcell, seq.lane)
-            candidate_key = candidate_lanes.get(lane_key, None)
-            if candidate_key is not None:
-                lib_id, lane_id = candidate_key
-                seq.lane_id = lane_id
-                lib_info = lib_db[lib_id]
-                lib_info['lanes'].setdefault(lane_key, set()).add(seq)
-
-        return lib_db
-
-    def find_missing_targets(self, result_map, lib_db):
+            liburl = urljoin(self.host, 'library/%s/' % (lib_id,))
+            library = RDF.Node(RDF.Uri(liburl))
+            self.import_library(library)
+
+    def import_library(self, library):
+        """Import library data into our model if we don't have it already
+        """
+        q = RDF.Statement(library, rdfNS['type'], libraryOntology['library'])
+        if not self.model.contains_statement(q):
+            load_into_model(self.model, 'rdfa', library)
+
+    def find_relavant_flowcell_ids(self):
+        """Generate set of flowcell ids that had samples of interest on them
+        """
+        flowcell_query =RDF.SPARQLQuery("""
+prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+select distinct ?library ?flowcell ?flowcell_id
+WHERE {
+  ?library a libns:library ;
+           libns:has_lane ?lane .
+  ?lane libns:flowcell ?flowcell .
+  ?flowcell libns:flowcell_id ?flowcell_id .
+}""")
+        flowcell_ids = set()
+        for r in flowcell_query.execute(self.model):
+            flowcell_ids.add( fromTypedNode(r['flowcell_id']) )
+        LOGGER.debug("Flowcells = %s" %(unicode(flowcell_ids)))
+        return flowcell_ids
+
+    def import_sequences(self, flowcell_ids):
+        seq_dirs = ( os.path.join(self.sequences_path, f)
+                     for f in flowcell_ids )
+        sequences = scan_for_sequences(seq_dirs)
+        for seq in sequences:
+            seq.save_to_model(self.model)
+        update_model_sequence_library(self.model, self.host)
+
+    def find_missing_targets(self, result_map, raw_files):
          """
          Check if the sequence file exists.
          This requires computing what the sequence name is and checking
          """
          Check if the sequence file exists.
          This requires computing what the sequence name is and checking
@@ -156,61 +207,48 @@ class CondorFastqExtract(object):
          fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
          # find what targets we're missing
          needed_targets = {}
          fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
          # find what targets we're missing
          needed_targets = {}
-        for lib_id in result_map.keys():
-            result_dir = result_map[lib_id]
-            lib = lib_db[lib_id]
-            lane_dict = make_lane_dict(lib_db, lib_id)
-
-            for lane_key, sequences in lib['lanes'].items():
-                for seq in sequences:
-                    seq.paired = lane_dict[seq.flowcell]['paired_end']
-                    lane_status = lane_dict[seq.flowcell]['status']
-
-                    if seq.paired and seq.read is None:
-                        seq.read = 1
-                    filename_attributes = {
-                        'flowcell': seq.flowcell,
-                        'lib_id': lib_id,
-                        'lane': seq.lane,
-                        'read': seq.read,
-                        'cycle': seq.cycle
-                        }
-                    # skip bad runs
-                    if lane_status == 'Failed':
-                        continue
-                    if seq.flowcell == '30DY0AAXX':
-                        # 30DY0 only ran for 151 bases instead of 152
-                        # it is actually 76 1st read, 75 2nd read
-                        seq.mid_point = 76
-
-                    # end filters
-                    if seq.paired:
-                        target_name = fastq_paired_template % \
-                                      filename_attributes
-                    else:
-                        target_name = fastq_single_template % \
-                                      filename_attributes
-
-                    target_pathname = os.path.join(result_dir, target_name)
-                    if self.force or not os.path.exists(target_pathname):
-                        t = needed_targets.setdefault(target_pathname, {})
-                        t.setdefault(seq.filetype, []).append(seq)
+        for seq in raw_files:
+            if not seq.isgood:
+                continue
+            filename_attributes = {
+                'flowcell': seq.flowcell_id,
+                'lib_id': seq.library_id,
+                'lane': seq.lane_number,
+                'read': seq.read,
+                'cycle': seq.cycle
+            }
+
+            if seq.ispaired:
+                target_name = fastq_paired_template % \
+                              filename_attributes
+            else:
+                target_name = fastq_single_template % \
+                              filename_attributes
  
  
-        return needed_targets
+            result_dir = result_map[seq.library_id]
+            target_pathname = os.path.join(result_dir, target_name)
+            if self.force or not os.path.exists(target_pathname):
+                t = needed_targets.setdefault(target_pathname, {})
+                t.setdefault(seq.filetype, []).append(seq)
  
  
+        return needed_targets
  
      def condor_srf_to_fastq(self, sources, target_pathname):
          if len(sources) > 1:
              raise ValueError("srf to fastq can only handle one file")
  
  
      def condor_srf_to_fastq(self, sources, target_pathname):
          if len(sources) > 1:
              raise ValueError("srf to fastq can only handle one file")
  
+        mid_point = None
+        if sources[0].flowcell_id == '30DY0AAXX':
+            mid_point = 76
+
          return {
          return {
-            'sources': [os.path.abspath(sources[0].path)],
+            'sources': [sources[0].path],
              'pyscript': srf2fastq.__file__,
              'pyscript': srf2fastq.__file__,
-            'flowcell': sources[0].flowcell,
-            'ispaired': sources[0].paired,
+            'flowcell': sources[0].flowcell_id,
+            'ispaired': sources[0].ispaired,
              'target': target_pathname,
              'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'),
              'target': target_pathname,
              'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'),
-            'mid': getattr(sources[0], 'mid_point', None),
+            'mid': mid_point,
              'force': self.force,
          }
  
              'force': self.force,
          }
  
@@ -221,10 +259,10 @@ class CondorFastqExtract(object):
          paths.sort()
          return {
              'pyscript': qseq2fastq.__file__,
          paths.sort()
          return {
              'pyscript': qseq2fastq.__file__,
-            'flowcell': sources[0].flowcell,
+            'flowcell': sources[0].flowcell_id,
              'target': target_pathname,
              'sources': paths,
              'target': target_pathname,
              'sources': paths,
-            'ispaired': sources[0].paired,
+            'ispaired': sources[0].ispaired,
              'istar': len(sources) == 1,
          }
  
              'istar': len(sources) == 1,
          }
  
@@ -237,11 +275,9 @@ class CondorFastqExtract(object):
              'pyscript': desplit_fastq.__file__,
              'target': target_pathname,
              'sources': paths,
              'pyscript': desplit_fastq.__file__,
              'target': target_pathname,
              'sources': paths,
-            'ispaired': sources[0].paired,
+            'ispaired': sources[0].ispaired,
          }
  
          }
  
-    def lane_rdf(self, sources, target_pathname):
-        pass
  
  def make_lane_dict(lib_db, lib_id):
      """
  
  def make_lane_dict(lib_db, lib_id):
      """
@@ -255,3 +291,48 @@ def make_lane_dict(lib_db, lib_id):
          result.append((lane['flowcell'], lane))
      return dict(result)
  
          result.append((lane['flowcell'], lane))
      return dict(result)
  
+class SequenceResult(object):
+    """Convert the sparql query result from find_archive_sequence_files
+    """
+    def __init__(self, result):
+        self.filenode = result['filenode']
+        self._filetype = result['filetype']
+        self.cycle = fromTypedNode(result['cycle'])
+        self.lane_number = fromTypedNode(result['lane_number'])
+        self.read = fromTypedNode(result['read'])
+        if type(self.read) in types.StringTypes:
+            self.read = 1
+        self.library = result['library']
+        self.library_id = fromTypedNode(result['library_id'])
+        self.flowcell = result['flowcell']
+        self.flowcell_id = fromTypedNode(result['flowcell_id'])
+        self.flowcell_type = fromTypedNode(result['flowcell_type'])
+        self.flowcell_status = fromTypedNode(result['flowcell_status'])
+
+    def _is_good(self):
+        """is this sequence / flowcell 'good enough'"""
+        if self.flowcell_status is not None and \
+           self.flowcell_status.lower() == "failed":
+            return False
+        return True
+    isgood = property(_is_good)
+
+    def _get_ispaired(self):
+        if self.flowcell_type.lower() == "paired":
+            return True
+        else:
+            return False
+    ispaired = property(_get_ispaired)
+
+    def _get_filetype(self):
+        return stripNamespace(libraryOntology, self._filetype)
+    filetype = property(_get_filetype)
+
+    def _get_path(self):
+        url = urlparse(str(self.filenode.uri))
+        if url.scheme == 'file':
+            return url.path
+        else:
+            errmsg = u"Unsupported scheme {0} for {1}"
+            raise ValueError(errmsg.format(url.scheme, unicode(url)))
+    path = property(_get_path)
diff --git a/htsworkflow/submission/test/test_condorfastq.py b/htsworkflow/submission/test/test_condorfastq.py

index 6aaf5846de8f3eb86e4f73145e487c73bc2ae914..24d6e4982018dd23ac8e8746d7ade1856ae3d4e7 100644 (file)
--- a/htsworkflow/submission/test/test_condorfastq.py
+++ b/htsworkflow/submission/test/test_condorfastq.py
@@ -7,8 +7,9 @@ import shutil
  import tempfile
  import unittest
  
  import tempfile
  import unittest
  
-from htsworkflow.submission import condorfastq
+from htsworkflow.submission.condorfastq import CondorFastqExtract
  from htsworkflow.submission.results import ResultMap
  from htsworkflow.submission.results import ResultMap
+from htsworkflow.util.rdfhelp import load_string_into_model, dump_model
  
  FCDIRS = [
      'C02F9ACXX',
  
  FCDIRS = [
      'C02F9ACXX',
@@ -16,6 +17,7 @@ FCDIRS = [
      'C02F9ACXX/C1-202/Project_11154',
      'C02F9ACXX/C1-202/Project_12342_Index1',
      'C02F9ACXX/C1-202/Project_12342_Index2',
      'C02F9ACXX/C1-202/Project_11154',
      'C02F9ACXX/C1-202/Project_12342_Index1',
      'C02F9ACXX/C1-202/Project_12342_Index2',
+    'C02F9ACXX/C1-202/Project_12345',
      '42JUYAAXX',
      '42JUYAAXX/C1-76',
      '30221AAXX',
      '42JUYAAXX',
      '42JUYAAXX/C1-76',
      '30221AAXX',
@@ -31,9 +33,18 @@ DATAFILES = [
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
      'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
-    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
+    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
      '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
@@ -77,77 +88,288 @@ DATAFILES = [
      '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
  ]
  
      '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
  ]
  
-LIBDATA = {
-    '11154':{u'antibody_id': None,
-             u'cell_line': u'Unknown',
-             u'cell_line_id': 1,
-             u'experiment_type': u'RNA-seq',
-             u'experiment_type_id': 4,
-             u'gel_cut_size': 300,
-             u'hidden': False,
-             u'id': u'11154',
-             u'insert_size': 200,
-             u'lane_set': [{u'flowcell': u'30221AAXX',
-                            u'lane_number': 4,
-                            u'lane_id': 3400,
-                            u'paired_end': False,
-                            u'read_length': 33,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'42JUYAAXX',
-                            u'lane_number': 5,
-                            u'lane_id': 4200,
-                            u'paired_end': True,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'61MJTAAXX',
-                            u'lane_number': 6,
-                            u'lane_id': 6600,
-                            u'paired_end': False,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'30DY0AAXX',
-                            u'lane_number': 8,
-                            u'lane_id': 3800,
-                            u'paired_end': True,
-                            u'read_length': 76,
-                            u'status': u'Unknown',
-                            u'status_code': None},
-                           {u'flowcell': u'C02F9ACXX',
-                            u'lane_number': 3,
-                            u'lane_id': 12300,
-                            u'paired_end': True,
-                            u'read_length': 101,
-                            u'status': u'Unknown',
-                            u'status_code': None}],
-             u'library_id': u'11154',
-             u'library_name': u'Paired ends ASDF ',
-             u'library_species': u'Mus musculus',
-             u'library_species_id': 9,
-             u'library_type': u'Paired End (non-multiplexed)',
-             u'library_type_id': 2,
-             u'made_by': u'Gary Gygax',
-             u'made_for': u'TSR',
-             u'notes': u'300 bp gel fragment',
-             u'replicate': 1,
-             u'stopping_point': u'1Aa',
-             u'successful_pM': None,
-             u'undiluted_concentration': u'29.7'}
-    }
-
-FAKE_APIDATA = {'apiid':0, 'apikey': 'foo'}
-
-class FakeApi(object):
-    def __init__(self, *args, **kwargs):
-        self.root_url = 'http://localhost'
-
-    def get_library(self, libid):
-        lib_data = LIBDATA[libid]
-        return copy.deepcopy(lib_data)
-
-
+lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .
+
+<http://localhost/flowcell/30221AAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 33 ;
+        libns:flowcell_type "Single"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/3401> ;
+        libns:has_lane <http://localhost/lane/3402> ;
+        libns:has_lane <http://localhost/lane/3403> ;
+        libns:has_lane <http://localhost/lane/3404> ;
+        libns:has_lane <http://localhost/lane/3405> ;
+        libns:has_lane <http://localhost/lane/3406> ;
+        libns:has_lane <http://localhost/lane/3407> ;
+        libns:has_lane <http://localhost/lane/3408> ;
+        libns:flowcell_id "30221AAXX"@en .
+
+<http://localhost/lane/3401>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/3402>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/3403>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/3404>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 4 .
+        # paired_end 1;
+        # read_length 33;
+        # status "Unknown"@en .
+<http://localhost/lane/3405>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/3406>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/3407>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/3408>
+        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+        libns:library <http://localhost/library/10000/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/42JUYAAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/4201> ;
+        libns:has_lane <http://localhost/lane/4202> ;
+        libns:has_lane <http://localhost/lane/4203> ;
+        libns:has_lane <http://localhost/lane/4204> ;
+        libns:has_lane <http://localhost/lane/4205> ;
+        libns:has_lane <http://localhost/lane/4206> ;
+        libns:has_lane <http://localhost/lane/4207> ;
+        libns:has_lane <http://localhost/lane/4208> ;
+        libns:flowcell_id "42JUYAAXX"@en .
+
+<http://localhost/lane/4201>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/4202>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/4203>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/4204>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/4205>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 5 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+<http://localhost/lane/4206>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/4207>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/4208>
+        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+        libns:library <http://localhost/library/1421/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/61MJTAAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Single"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/6601> ;
+        libns:has_lane <http://localhost/lane/6602> ;
+        libns:has_lane <http://localhost/lane/6603> ;
+        libns:has_lane <http://localhost/lane/6604> ;
+        libns:has_lane <http://localhost/lane/6605> ;
+        libns:has_lane <http://localhost/lane/6606> ;
+        libns:has_lane <http://localhost/lane/6607> ;
+        libns:has_lane <http://localhost/lane/6608> ;
+        libns:flowcell_id "61MJTAAXX"@en .
+
+<http://localhost/lane/6601>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/6602>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/6603>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/6604>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/6605>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/6606>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 6 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+<http://localhost/lane/6607>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/6608>
+        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+        libns:library <http://localhost/library/1661/> ;
+        libns:lane_number 8 .
+
+<http://localhost/flowcell/30DY0AAXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 76 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/3801> ;
+        libns:has_lane <http://localhost/lane/3802> ;
+        libns:has_lane <http://localhost/lane/3803> ;
+        libns:has_lane <http://localhost/lane/3804> ;
+        libns:has_lane <http://localhost/lane/3805> ;
+        libns:has_lane <http://localhost/lane/3806> ;
+        libns:has_lane <http://localhost/lane/3807> ;
+        libns:has_lane <http://localhost/lane/3808> ;
+        libns:flowcell_id "30DY0AAXX"@en .
+
+<http://localhost/lane/3801>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 1 .
+<http://localhost/lane/3802>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 2 .
+<http://localhost/lane/3803>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 3 .
+<http://localhost/lane/3804>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 4 .
+<http://localhost/lane/3805>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 5 .
+<http://localhost/lane/3806>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 6 .
+<http://localhost/lane/3807>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/1331/> ;
+        libns:lane_number 7 .
+<http://localhost/lane/3808>
+        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 8 .
+        # paired_end 1;
+        # read_length 76;
+        # status "Unknown"@en .
+
+<http://localhost/flowcell/C02F9ACXX/>
+        a libns:illumina_flowcell ;
+        libns:read_length 101 ;
+        libns:flowcell_type "Paired"@en ;
+        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+        libns:has_lane <http://localhost/lane/12300> ;
+        libns:has_lane <http://localhost/lane/12500> ;
+        libns:flowcell_id "C02F9ACXX"@en .
+
+<http://localhost/lane/12300>
+        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+        libns:library <http://localhost/library/12345/> ;
+        libns:lane_number 3 .
+        # paired_end 1;
+        # read_length 101;
+        # status "Unknown"@en .
+
+<http://localhost/lane/12500>
+        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+        libns:library <http://localhost/library/11154/> ;
+        libns:lane_number 3 .
+        # paired_end 1;
+        # read_length 101;
+        # status "Unknown"@en .
+
+<http://localhost/library/11154/>
+        a libns:library ;
+        libns:affiliation "TSR"@en;
+        libns:concentration "29.7";
+        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+        libns:experiment_type "RNA-seq"@en ;
+        libns:gel_cut 300 ;
+        libns:has_lane <http://localhost/lane/3404> ;
+        libns:has_lane <http://localhost/lane/4205> ;
+        libns:has_lane <http://localhost/lane/6606> ;
+        libns:has_lane <http://localhost/lane/3808> ;
+        libns:has_lane <http://localhost/lane/12500> ;
+        libns:insert_size 2000 ;
+        libns:library_id "11154"@en ;
+        libns:library_type "Paired End (Multiplexed)"@en ;
+        libns:made_by "Gary Gygax"@en ;
+        libns:name "Paired Ends ASDF"@en ;
+        libns:replicate "1"@en;
+        libns:species "Mus musculus"@en ;
+        libns:stopping_point "Completed"@en ;
+        libns:total_unique_locations 8841201 .
+        # cell_line
+
+
+<http://localhost/library/12345/>
+        a libns:library ;
+        libns:affiliation "TSR"@en;
+        libns:concentration "12.345";
+        libns:cell_line "Unknown"@en ;
+        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+        libns:experiment_type "RNA-seq"@en ;
+        libns:gel_cut 300 ;
+        libns:has_lane <http://localhost/lane/12300> ;
+        libns:insert_size 2000 ;
+        libns:library_id "12345"@en ;
+        libns:library_type "Paired End (Multiplexed)"@en ;
+        libns:made_by "Gary Gygax"@en ;
+        libns:name "Paired Ends THING"@en ;
+        libns:replicate "1"@en;
+        libns:species "Mus musculus"@en ;
+        libns:stopping_point "Completed"@en ;
+        libns:total_unique_locations 8841201 .
+        # cell_line
+"""
+HOST = "http://localhost"
  
  class TestCondorFastq(unittest.TestCase):
      def setUp(self):
  
  class TestCondorFastq(unittest.TestCase):
      def setUp(self):
@@ -168,141 +390,146 @@ class TestCondorFastq(unittest.TestCase):
              with open(filename, 'w') as stream:
                  stream.write('testfile')
  
              with open(filename, 'w') as stream:
                  stream.write('testfile')
  
-        self.subname = unicode('sub-11154')
-        self.subdir = os.path.join(self.tempdir, self.subname)
-        os.mkdir(self.subdir)
-
          self.result_map = ResultMap()
          self.result_map = ResultMap()
-        self.result_map['11154'] = self.subname
+        for lib_id in [u'11154', u'12345']:
+            subname = 'sub-%s' % (lib_id,)
+            sub_dir = os.path.join(self.tempdir, subname)
+            os.mkdir(sub_dir)
+            self.result_map[lib_id] =  sub_dir
+
+        self.extract = CondorFastqExtract(HOST,
+                                          self.flowcelldir,
+                                          self.logdir)
+        load_string_into_model(self.extract.model, 'turtle', lib_turtle)
  
      def tearDown(self):
          shutil.rmtree(self.tempdir)
          os.chdir(self.cwd)
  
  
      def tearDown(self):
          shutil.rmtree(self.tempdir)
          os.chdir(self.cwd)
  
+    def test_find_relavant_flowcell_ids(self):
+        expected = set(('30221AAXX',
+                        '42JUYAAXX',
+                        '61MJTAAXX',
+                        '30DY0AAXX',
+                        'C02F9ACXX'))
+        flowcell_ids = self.extract.find_relavant_flowcell_ids()
+        self.assertEqual(flowcell_ids, expected)
+
      def test_find_archive_sequence(self):
      def test_find_archive_sequence(self):
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-
-        lib_db = extract.find_archive_sequence_files(self.result_map)
-
-        self.failUnlessEqual(len(lib_db['11154']['lanes']), 5)
-        lanes = [
-            lib_db['11154']['lanes'][(u'30221AAXX', 4)],
-            lib_db['11154']['lanes'][(u'42JUYAAXX', 5)],
-            lib_db['11154']['lanes'][(u'61MJTAAXX', 6)],
-            lib_db['11154']['lanes'][(u'30DY0AAXX', 8)],
-            lib_db['11154']['lanes'][(u'C02F9ACXX', 3)],
-        ]
-        self.failUnlessEqual(len(lanes[0]), 1)
-        self.failUnlessEqual(len(lanes[1]), 2)
-        self.failUnlessEqual(len(lanes[2]), 1)
-        self.failUnlessEqual(len(lanes[3]), 1)
-        self.failUnlessEqual(len(lanes[4]), 4)
+        seqs = self.extract.find_archive_sequence_files(self.result_map)
+
+        expected = set([
+            (u'11154', u'42JUYAAXX', 5, 1, 76, True, 'qseq'),
+            (u'11154', u'42JUYAAXX', 5, 2, 76, True, 'qseq'),
+            (u'11154', u'61MJTAAXX', 6, 1, 76, False, 'qseq'),
+            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+            (u'11154', u'30221AAXX', 4, 1, 33, False, 'srf'),
+            (u'11154', u'30DY0AAXX', 8, 1, 151, True, 'srf')
+        ])
+        found = set([(l.library_id, l.flowcell_id, l.lane_number, l.read, l.cycle, l.ispaired, l.filetype) for l in seqs])
+        self.assertEqual(expected, found)
  
      def test_find_needed_targets(self):
  
      def test_find_needed_targets(self):
+        lib_db = self.extract.find_archive_sequence_files(self.result_map)
  
  
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        lib_db = extract.find_archive_sequence_files(self.result_map)
-
-        needed_targets = extract.find_missing_targets(self.result_map,
-                                                      lib_db)
-        self.failUnlessEqual(len(needed_targets), 7)
+        needed_targets = self.extract.find_missing_targets(self.result_map,
+                                                           lib_db)
+        self.assertEqual(len(needed_targets), 9)
          srf_30221 = needed_targets[
          srf_30221 = needed_targets[
-            self.subname + u'/11154_30221AAXX_c33_l4.fastq']
+            self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq']
          qseq_42JUY_r1 = needed_targets[
          qseq_42JUY_r1 = needed_targets[
-            self.subname + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
+            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
          qseq_42JUY_r2 = needed_targets[
          qseq_42JUY_r2 = needed_targets[
-            self.subname + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
+            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
          qseq_61MJT = needed_targets[
          qseq_61MJT = needed_targets[
-            self.subname + u'/11154_61MJTAAXX_c76_l6.fastq']
+            self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq']
          split_C02F9_r1 = needed_targets[
          split_C02F9_r1 = needed_targets[
-            self.subname + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
+            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
          split_C02F9_r2 = needed_targets[
          split_C02F9_r2 = needed_targets[
-            self.subname + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
+            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
  
  
-        self.failUnlessEqual(len(srf_30221['srf']), 1)
-        self.failUnlessEqual(len(qseq_42JUY_r1['qseq']), 1)
-        self.failUnlessEqual(len(qseq_42JUY_r2['qseq']), 1)
-        self.failUnlessEqual(len(qseq_61MJT['qseq']), 1)
-        self.failUnlessEqual(len(split_C02F9_r1['split_fastq']), 2)
-        self.failUnlessEqual(len(split_C02F9_r2['split_fastq']), 2)
-
-        #print '-------needed targets---------'
-        #pprint(needed_targets)
+        self.assertEqual(len(srf_30221['srf']), 1)
+        self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
+        self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
+        self.assertEqual(len(qseq_61MJT['qseq']), 1)
+        self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
+        self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)
  
      def test_generate_fastqs(self):
  
      def test_generate_fastqs(self):
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        commands = extract.build_condor_arguments(self.result_map)
+        commands = self.extract.build_condor_arguments(self.result_map)
  
          srf = commands['srf']
          qseq = commands['qseq']
          split = commands['split_fastq']
  
  
          srf = commands['srf']
          qseq = commands['qseq']
          split = commands['split_fastq']
  
-        self.failUnlessEqual(len(srf), 2)
-        self.failUnlessEqual(len(qseq), 3)
-        self.failUnlessEqual(len(split), 2)
+        self.assertEqual(len(srf), 2)
+        self.assertEqual(len(qseq), 3)
+        self.assertEqual(len(split), 4)
  
          srf_data = {
  
          srf_data = {
-            os.path.join(self.subname, '11154_30221AAXX_c33_l4.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_30221AAXX_c33_l4.fastq'): {
                  'mid': None,
                  'ispaired': False,
                  'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                  'flowcell': u'30221AAXX',
                  'mid': None,
                  'ispaired': False,
                  'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                  'flowcell': u'30221AAXX',
-                'target': os.path.join(self.subname,
+                'target': os.path.join(self.result_map['11154'],
                                         u'11154_30221AAXX_c33_l4.fastq'),
              },
                                         u'11154_30221AAXX_c33_l4.fastq'),
              },
-            os.path.join(self.subname, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                  'mid': None,
                  'ispaired': True,
                  'flowcell': u'30DY0AAXX',
                  'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                  'mid': 76,
                  'target':
                  'mid': None,
                  'ispaired': True,
                  'flowcell': u'30DY0AAXX',
                  'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                  'mid': 76,
                  'target':
-                    os.path.join(self.subname,
+                    os.path.join(self.result_map['11154'],
                                   u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                  'target_right':
                                   u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                  'target_right':
-                    os.path.join(self.subname,
+                    os.path.join(self.result_map['11154'],
                                   u'11154_30DY0AAXX_c151_l8_r2.fastq'),
              }
          }
          for args in srf:
              expected = srf_data[args['target']]
                                   u'11154_30DY0AAXX_c151_l8_r2.fastq'),
              }
          }
          for args in srf:
              expected = srf_data[args['target']]
-            self.failUnlessEqual(args['ispaired'], expected['ispaired'])
-            self.failUnlessEqual(len(args['sources']), 1)
+            self.assertEqual(args['ispaired'], expected['ispaired'])
+            self.assertEqual(len(args['sources']), 1)
              _, source_filename = os.path.split(args['sources'][0])
              _, source_filename = os.path.split(args['sources'][0])
-            self.failUnlessEqual(source_filename, expected['sources'][0])
-            self.failUnlessEqual(args['target'], expected['target'])
+            self.assertEqual(source_filename, expected['sources'][0])
+            self.assertEqual(args['target'], expected['target'])
              if args['ispaired']:
              if args['ispaired']:
-                self.failUnlessEqual(args['target_right'],
+                self.assertEqual(args['target_right'],
                                       expected['target_right'])
              if 'mid' in expected:
                                       expected['target_right'])
              if 'mid' in expected:
-                self.failUnlessEqual(args['mid'], expected['mid'])
+                self.assertEqual(args['mid'], expected['mid'])
  
          qseq_data = {
  
          qseq_data = {
-            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                  'istar': True,
                  'ispaired': True,
                  'sources': [
                      u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
              },
                  'istar': True,
                  'ispaired': True,
                  'sources': [
                      u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
              },
-            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                  'istar': True,
                  'ispaired': True,
                  'sources': [
                      u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
              },
                  'istar': True,
                  'ispaired': True,
                  'sources': [
                      u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
              },
-            os.path.join(self.subname, '11154_61MJTAAXX_c76_l6.fastq'): {
+            os.path.join(self.result_map['11154'],
+                         '11154_61MJTAAXX_c76_l6.fastq'): {
                  'istar': True,
                  'ispaired': False,
                  'sources': [
                  'istar': True,
                  'ispaired': False,
                  'sources': [
@@ -311,11 +538,11 @@ class TestCondorFastq(unittest.TestCase):
          }
          for args in qseq:
              expected = qseq_data[args['target']]
          }
          for args in qseq:
              expected = qseq_data[args['target']]
-            self.failUnlessEqual(args['istar'], expected['istar'])
-            self.failUnlessEqual(args['ispaired'], expected['ispaired'])
+            self.assertEqual(args['istar'], expected['istar'])
+            self.assertEqual(args['ispaired'], expected['ispaired'])
              for i in range(len(expected['sources'])):
                  _, filename = os.path.split(args['sources'][i])
              for i in range(len(expected['sources'])):
                  _, filename = os.path.split(args['sources'][i])
-                self.failUnlessEqual(filename, expected['sources'][i])
+                self.assertEqual(filename, expected['sources'][i])
  
  
          split_test = dict((( x['target'], x) for x in
  
  
          split_test = dict((( x['target'], x) for x in
@@ -326,64 +553,82 @@ class TestCondorFastq(unittest.TestCase):
              {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                           u'11154_NoIndex_L003_R2_002.fastq.gz'],
               'pyscript': 'desplit_fastq.pyc',
              {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                           u'11154_NoIndex_L003_R2_002.fastq.gz'],
               'pyscript': 'desplit_fastq.pyc',
-             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}]
+             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
+            {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
+                         u'12345_CGATGT_L003_R1_002.fastq.gz',
+                         u'12345_CGATGT_L003_R1_003.fastq.gz',
+                         ],
+             'pyscript': 'desplit_fastq.pyc',
+             'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
+            {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
+                         u'12345_CGATGT_L003_R2_002.fastq.gz',
+                         u'12345_CGATGT_L003_R2_003.fastq.gz',
+                         ],
+             'pyscript': 'desplit_fastq.pyc',
+             'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'}
+             ]
           ))
          for arg in split:
              _, target = os.path.split(arg['target'])
              pyscript = split_test[target]['pyscript']
           ))
          for arg in split:
              _, target = os.path.split(arg['target'])
              pyscript = split_test[target]['pyscript']
-            self.failUnless(arg['pyscript'].endswith(pyscript))
+            self.assertTrue(arg['pyscript'].endswith(pyscript))
              filename = split_test[target]['target']
              filename = split_test[target]['target']
-            self.failUnless(arg['target'].endswith(filename))
+            self.assertTrue(arg['target'].endswith(filename))
              for s_index in range(len(arg['sources'])):
                  s1 = arg['sources'][s_index]
                  s2 = split_test[target]['sources'][s_index]
              for s_index in range(len(arg['sources'])):
                  s1 = arg['sources'][s_index]
                  s2 = split_test[target]['sources'][s_index]
-                self.failUnless(s1.endswith(s2))
-
-        #print '-------commands---------'
-        #pprint (commands)
+                self.assertTrue(s1.endswith(s2))
  
      def test_create_scripts(self):
  
      def test_create_scripts(self):
-        os.chdir(self.tempdir)
-        extract = condorfastq.CondorFastqExtract('host',
-                                                 FAKE_APIDATA,
-                                                 self.tempdir,
-                                                 self.logdir)
-        extract.api = FakeApi()
-        extract.create_scripts(self.result_map)
-
-        self.failUnless(os.path.exists('srf.condor'))
+        self.extract.create_scripts(self.result_map)
+
+        self.assertTrue(os.path.exists('srf.condor'))
          with open('srf.condor', 'r') as srf:
              arguments = [ l for l in srf if l.startswith('argument') ]
              arguments.sort()
          with open('srf.condor', 'r') as srf:
              arguments = [ l for l in srf if l.startswith('argument') ]
              arguments.sort()
-            self.failUnlessEqual(len(arguments), 2)
-            self.failUnless('--single sub-11154/11154_30221AAXX_c33_l4.fastq'
+            self.assertEqual(len(arguments), 2)
+            self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq'
                              in arguments[0])
                              in arguments[0])
-            self.failUnless(
-                '--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
+            self.assertTrue(
+                'sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
                  arguments[1])
  
                  arguments[1])
  
-        self.failUnless(os.path.exists('qseq.condor'))
+        self.assertTrue(os.path.exists('qseq.condor'))
          with open('qseq.condor', 'r') as srf:
              arguments = [ l for l in srf if l.startswith('argument') ]
              arguments.sort()
          with open('qseq.condor', 'r') as srf:
              arguments = [ l for l in srf if l.startswith('argument') ]
              arguments.sort()
-            self.failUnlessEqual(len(arguments), 3)
-            self.failUnless('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
+            self.assertEqual(len(arguments), 3)
+            self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
                              arguments[0])
                              arguments[0])
-            self.failUnless(
+            self.assertTrue(
                  'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
                  arguments[1])
                  'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
                  arguments[1])
-            self.failUnless('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
+            self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
                              arguments[2])
  
                              arguments[2])
  
-        self.failUnless(os.path.exists('split_fastq.condor'))
+        self.assertTrue(os.path.exists('split_fastq.condor'))
          with open('split_fastq.condor', 'r') as split:
              arguments = [ l for l in split if l.startswith('argument') ]
              arguments.sort()
          with open('split_fastq.condor', 'r') as split:
              arguments = [ l for l in split if l.startswith('argument') ]
              arguments.sort()
-            self.failUnlessEqual(len(arguments), 2)
-            self.failUnless('11154_NoIndex_L003_R1_001.fastq.gz' in \
+            self.assertEqual(len(arguments), 4)
+            # Lane 3 Read 1
+            self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in \
                              arguments[0])
                              arguments[0])
-            self.failUnless('11154_NoIndex_L003_R2_002.fastq.gz' in \
+            # Lane 3 Read 2
+            self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in \
                              arguments[1])
                              arguments[1])
+            # Lane 3 Read 1
+            self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2])
+            self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2])
+            self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2])
+            self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2])
+
+            # Lane 3 Read 2
+            self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3])
+            self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3])
+            self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3])
+            self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3])
+
  
  def suite():
      suite = unittest.makeSuite(TestCondorFastq, 'test')
  
  def suite():
      suite = unittest.makeSuite(TestCondorFastq, 'test')
author	Diane Trout <diane@caltech.edu>
	Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
committer	Diane Trout <diane@caltech.edu>
	Fri, 31 Aug 2012 19:29:59 +0000 (12:29 -0700)
htsworkflow/submission/condorfastq.py		patch \| blob \| history
htsworkflow/submission/test/test_condorfastq.py		patch \| blob \| history