"""
import logging
import os
-from pprint import pformat
+from pprint import pformat,pprint
import sys
import types
+from urlparse import urljoin, urlparse
-from htsworkflow.pipelines.sequences import scan_for_sequences
+from htsworkflow.pipelines.sequences import scan_for_sequences, \
+ update_model_sequence_library
from htsworkflow.pipelines.samplekey import SampleKey
from htsworkflow.pipelines import qseq2fastq
from htsworkflow.pipelines import srf2fastq
from htsworkflow.pipelines import desplit_fastq
-from htsworkflow.util.api import HtswApi
+from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
+ fromTypedNode, \
+ libraryOntology, \
+ stripNamespace, \
+ rdfNS
from htsworkflow.util.conversion import parse_flowcell_id
from django.conf import settings
from django.template import Context, loader
+import RDF
+
LOGGER = logging.getLogger(__name__)
class CondorFastqExtract(object):
- def __init__(self, host, apidata, sequences_path,
+ def __init__(self, host, sequences_path,
log_path='log',
force=False):
"""Extract fastqs from results archive
log_path (str): where to put condor log files
force (bool): do we force overwriting current files?
"""
- self.api = HtswApi(host, apidata)
+ self.host = host
+ self.model = get_model()
self.sequences_path = sequences_path
self.log_path = log_path
self.force = force
'logdir': self.log_path,
'env': env,
'args': condor_entries[script_type],
- 'root_url': self.api.root_url,
+ 'root_url': self.host,
}
context = Context(variables)
'split_fastq': self.condor_desplit_fastq
}
by_sample = {}
- lib_db = self.find_archive_sequence_files(result_map)
- needed_targets = self.find_missing_targets(result_map, lib_db)
+ sequences = self.find_archive_sequence_files(result_map)
+ needed_targets = self.find_missing_targets(result_map, sequences)
for target_pathname, available_sources in needed_targets.items():
LOGGER.debug(' target : %s' % (target_pathname,))
condor_entries.setdefault(condor_type, []).append(
conversion(sources, target_pathname))
for s in sources:
- by_sample.setdefault(s.lane_id,[]).append(
+ by_sample.setdefault(s.lane_number,[]).append(
target_pathname)
else:
print " need file", target_pathname
"""
Find archived sequence files associated with our results.
"""
- LOGGER.debug("Searching for sequence files in: %s" %(self.sequences_path,))
-
- lib_db = {}
- seq_dirs = set()
- candidate_lanes = {}
+ self.import_libraries(result_map)
+ flowcell_ids = self.find_relavant_flowcell_ids()
+ self.import_sequences(flowcell_ids)
+
+ query = RDF.SPARQLQuery("""
+ prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+
+ select ?filenode ?filetype ?cycle ?lane_number ?read
+ ?library ?library_id
+ ?flowcell ?flowcell_id ?read_length
+ ?flowcell_type ?flowcell_status
+ where {
+ ?filenode libns:cycle ?cycle ;
+ libns:lane_number ?lane_number ;
+ libns:read ?read ;
+ libns:flowcell ?flowcell ;
+ libns:flowcell_id ?flowcell_id ;
+ libns:library ?library ;
+ libns:library_id ?library_id ;
+ rdf:type ?filetype ;
+ a libns:raw_file .
+ ?flowcell libns:read_length ?read_length ;
+ libns:flowcell_type ?flowcell_type .
+ OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status }
+ FILTER(?filetype != libns:raw_file)
+ }
+ """)
+ results = []
+ for r in query.execute(self.model):
+ library_id = fromTypedNode(r['library_id'])
+ if library_id in result_map:
+ results.append(SequenceResult(r))
+ return results
+
+ def import_libraries(self, result_map):
for lib_id in result_map.keys():
- lib_info = self.api.get_library(lib_id)
- lib_info['lanes'] = {}
- lib_db[lib_id] = lib_info
-
- for lane in lib_info['lane_set']:
- lane_key = (lane['flowcell'], lane['lane_number'])
- candidate_lanes[lane_key] = (lib_id, lane['lane_id'])
- seq_dirs.add(os.path.join(self.sequences_path,
- 'flowcells',
- lane['flowcell']))
- LOGGER.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
- candidate_seq_list = scan_for_sequences(seq_dirs)
-
- # at this point we have too many sequences as scan_for_sequences
- # returns all the sequences in a flowcell directory
- # so lets filter out the extras
-
- for seq in candidate_seq_list:
- lane_key = (seq.flowcell, seq.lane)
- candidate_key = candidate_lanes.get(lane_key, None)
- if candidate_key is not None:
- lib_id, lane_id = candidate_key
- seq.lane_id = lane_id
- lib_info = lib_db[lib_id]
- lib_info['lanes'].setdefault(lane_key, set()).add(seq)
-
- return lib_db
-
- def find_missing_targets(self, result_map, lib_db):
+ liburl = urljoin(self.host, 'library/%s/' % (lib_id,))
+ library = RDF.Node(RDF.Uri(liburl))
+ self.import_library(library)
+
+ def import_library(self, library):
+ """Import library data into our model if we don't have it already
+ """
+ q = RDF.Statement(library, rdfNS['type'], libraryOntology['library'])
+ if not self.model.contains_statement(q):
+ load_into_model(self.model, 'rdfa', library)
+
+ def find_relavant_flowcell_ids(self):
+ """Generate set of flowcell ids that had samples of interest on them
+ """
+ flowcell_query =RDF.SPARQLQuery("""
+prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+select distinct ?library ?flowcell ?flowcell_id
+WHERE {
+ ?library a libns:library ;
+ libns:has_lane ?lane .
+ ?lane libns:flowcell ?flowcell .
+ ?flowcell libns:flowcell_id ?flowcell_id .
+}""")
+ flowcell_ids = set()
+ for r in flowcell_query.execute(self.model):
+ flowcell_ids.add( fromTypedNode(r['flowcell_id']) )
+ LOGGER.debug("Flowcells = %s" %(unicode(flowcell_ids)))
+ return flowcell_ids
+
+ def import_sequences(self, flowcell_ids):
+ seq_dirs = ( os.path.join(self.sequences_path, f)
+ for f in flowcell_ids )
+ sequences = scan_for_sequences(seq_dirs)
+ for seq in sequences:
+ seq.save_to_model(self.model)
+ update_model_sequence_library(self.model, self.host)
+
+ def find_missing_targets(self, result_map, raw_files):
"""
Check if the sequence file exists.
This requires computing what the sequence name is and checking
fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
# find what targets we're missing
needed_targets = {}
- for lib_id in result_map.keys():
- result_dir = result_map[lib_id]
- lib = lib_db[lib_id]
- lane_dict = make_lane_dict(lib_db, lib_id)
-
- for lane_key, sequences in lib['lanes'].items():
- for seq in sequences:
- seq.paired = lane_dict[seq.flowcell]['paired_end']
- lane_status = lane_dict[seq.flowcell]['status']
-
- if seq.paired and seq.read is None:
- seq.read = 1
- filename_attributes = {
- 'flowcell': seq.flowcell,
- 'lib_id': lib_id,
- 'lane': seq.lane,
- 'read': seq.read,
- 'cycle': seq.cycle
- }
- # skip bad runs
- if lane_status == 'Failed':
- continue
- if seq.flowcell == '30DY0AAXX':
- # 30DY0 only ran for 151 bases instead of 152
- # it is actually 76 1st read, 75 2nd read
- seq.mid_point = 76
-
- # end filters
- if seq.paired:
- target_name = fastq_paired_template % \
- filename_attributes
- else:
- target_name = fastq_single_template % \
- filename_attributes
-
- target_pathname = os.path.join(result_dir, target_name)
- if self.force or not os.path.exists(target_pathname):
- t = needed_targets.setdefault(target_pathname, {})
- t.setdefault(seq.filetype, []).append(seq)
+ for seq in raw_files:
+ if not seq.isgood:
+ continue
+ filename_attributes = {
+ 'flowcell': seq.flowcell_id,
+ 'lib_id': seq.library_id,
+ 'lane': seq.lane_number,
+ 'read': seq.read,
+ 'cycle': seq.cycle
+ }
+
+ if seq.ispaired:
+ target_name = fastq_paired_template % \
+ filename_attributes
+ else:
+ target_name = fastq_single_template % \
+ filename_attributes
- return needed_targets
+ result_dir = result_map[seq.library_id]
+ target_pathname = os.path.join(result_dir, target_name)
+ if self.force or not os.path.exists(target_pathname):
+ t = needed_targets.setdefault(target_pathname, {})
+ t.setdefault(seq.filetype, []).append(seq)
+ return needed_targets
def condor_srf_to_fastq(self, sources, target_pathname):
if len(sources) > 1:
raise ValueError("srf to fastq can only handle one file")
+ mid_point = None
+ if sources[0].flowcell_id == '30DY0AAXX':
+ mid_point = 76
+
return {
- 'sources': [os.path.abspath(sources[0].path)],
+ 'sources': [sources[0].path],
'pyscript': srf2fastq.__file__,
- 'flowcell': sources[0].flowcell,
- 'ispaired': sources[0].paired,
+ 'flowcell': sources[0].flowcell_id,
+ 'ispaired': sources[0].ispaired,
'target': target_pathname,
'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'),
- 'mid': getattr(sources[0], 'mid_point', None),
+ 'mid': mid_point,
'force': self.force,
}
paths.sort()
return {
'pyscript': qseq2fastq.__file__,
- 'flowcell': sources[0].flowcell,
+ 'flowcell': sources[0].flowcell_id,
'target': target_pathname,
'sources': paths,
- 'ispaired': sources[0].paired,
+ 'ispaired': sources[0].ispaired,
'istar': len(sources) == 1,
}
'pyscript': desplit_fastq.__file__,
'target': target_pathname,
'sources': paths,
- 'ispaired': sources[0].paired,
+ 'ispaired': sources[0].ispaired,
}
- def lane_rdf(self, sources, target_pathname):
- pass
def make_lane_dict(lib_db, lib_id):
"""
result.append((lane['flowcell'], lane))
return dict(result)
+class SequenceResult(object):
+ """Convert the sparql query result from find_archive_sequence_files
+ """
+ def __init__(self, result):
+ self.filenode = result['filenode']
+ self._filetype = result['filetype']
+ self.cycle = fromTypedNode(result['cycle'])
+ self.lane_number = fromTypedNode(result['lane_number'])
+ self.read = fromTypedNode(result['read'])
+ if type(self.read) in types.StringTypes:
+ self.read = 1
+ self.library = result['library']
+ self.library_id = fromTypedNode(result['library_id'])
+ self.flowcell = result['flowcell']
+ self.flowcell_id = fromTypedNode(result['flowcell_id'])
+ self.flowcell_type = fromTypedNode(result['flowcell_type'])
+ self.flowcell_status = fromTypedNode(result['flowcell_status'])
+
+ def _is_good(self):
+ """is this sequence / flowcell 'good enough'"""
+ if self.flowcell_status is not None and \
+ self.flowcell_status.lower() == "failed":
+ return False
+ return True
+ isgood = property(_is_good)
+
+ def _get_ispaired(self):
+ if self.flowcell_type.lower() == "paired":
+ return True
+ else:
+ return False
+ ispaired = property(_get_ispaired)
+
+ def _get_filetype(self):
+ return stripNamespace(libraryOntology, self._filetype)
+ filetype = property(_get_filetype)
+
+ def _get_path(self):
+ url = urlparse(str(self.filenode.uri))
+ if url.scheme == 'file':
+ return url.path
+ else:
+ errmsg = u"Unsupported scheme {0} for {1}"
+ raise ValueError(errmsg.format(url.scheme, unicode(url)))
+ path = property(_get_path)
import tempfile
import unittest
-from htsworkflow.submission import condorfastq
+from htsworkflow.submission.condorfastq import CondorFastqExtract
from htsworkflow.submission.results import ResultMap
+from htsworkflow.util.rdfhelp import load_string_into_model, dump_model
FCDIRS = [
'C02F9ACXX',
'C02F9ACXX/C1-202/Project_11154',
'C02F9ACXX/C1-202/Project_12342_Index1',
'C02F9ACXX/C1-202/Project_12342_Index2',
+ 'C02F9ACXX/C1-202/Project_12345',
'42JUYAAXX',
'42JUYAAXX/C1-76',
'30221AAXX',
'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
- 'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
- 'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
- 'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
+ 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
'42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
'42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
'42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
'61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
-LIBDATA = {
- '11154':{u'antibody_id': None,
- u'cell_line': u'Unknown',
- u'cell_line_id': 1,
- u'experiment_type': u'RNA-seq',
- u'experiment_type_id': 4,
- u'gel_cut_size': 300,
- u'hidden': False,
- u'id': u'11154',
- u'insert_size': 200,
- u'lane_set': [{u'flowcell': u'30221AAXX',
- u'lane_number': 4,
- u'lane_id': 3400,
- u'paired_end': False,
- u'read_length': 33,
- u'status': u'Unknown',
- u'status_code': None},
- {u'flowcell': u'42JUYAAXX',
- u'lane_number': 5,
- u'lane_id': 4200,
- u'paired_end': True,
- u'read_length': 76,
- u'status': u'Unknown',
- u'status_code': None},
- {u'flowcell': u'61MJTAAXX',
- u'lane_number': 6,
- u'lane_id': 6600,
- u'paired_end': False,
- u'read_length': 76,
- u'status': u'Unknown',
- u'status_code': None},
- {u'flowcell': u'30DY0AAXX',
- u'lane_number': 8,
- u'lane_id': 3800,
- u'paired_end': True,
- u'read_length': 76,
- u'status': u'Unknown',
- u'status_code': None},
- {u'flowcell': u'C02F9ACXX',
- u'lane_number': 3,
- u'lane_id': 12300,
- u'paired_end': True,
- u'read_length': 101,
- u'status': u'Unknown',
- u'status_code': None}],
- u'library_id': u'11154',
- u'library_name': u'Paired ends ASDF ',
- u'library_species': u'Mus musculus',
- u'library_species_id': 9,
- u'library_type': u'Paired End (non-multiplexed)',
- u'library_type_id': 2,
- u'made_by': u'Gary Gygax',
- u'made_for': u'TSR',
- u'notes': u'300 bp gel fragment',
- u'replicate': 1,
- u'stopping_point': u'1Aa',
- u'successful_pM': None,
- u'undiluted_concentration': u'29.7'}
- }
-
-FAKE_APIDATA = {'apiid':0, 'apikey': 'foo'}
-
-class FakeApi(object):
- def __init__(self, *args, **kwargs):
- self.root_url = 'http://localhost'
-
- def get_library(self, libid):
- lib_data = LIBDATA[libid]
- return copy.deepcopy(lib_data)
-
-
+lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix dc: <http://purl.org/dc/elements/1.1/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
+@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .
+
+<http://localhost/flowcell/30221AAXX/>
+ a libns:illumina_flowcell ;
+ libns:read_length 33 ;
+ libns:flowcell_type "Single"@en ;
+ libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+ libns:has_lane <http://localhost/lane/3401> ;
+ libns:has_lane <http://localhost/lane/3402> ;
+ libns:has_lane <http://localhost/lane/3403> ;
+ libns:has_lane <http://localhost/lane/3404> ;
+ libns:has_lane <http://localhost/lane/3405> ;
+ libns:has_lane <http://localhost/lane/3406> ;
+ libns:has_lane <http://localhost/lane/3407> ;
+ libns:has_lane <http://localhost/lane/3408> ;
+ libns:flowcell_id "30221AAXX"@en .
+
+<http://localhost/lane/3401>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/10000/> ;
+ libns:lane_number 1 .
+<http://localhost/lane/3402>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/10000/> ;
+ libns:lane_number 2 .
+<http://localhost/lane/3403>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/10000/> ;
+ libns:lane_number 3 .
+<http://localhost/lane/3404>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/11154/> ;
+ libns:lane_number 4 .
+ # paired_end 1;
+ # read_length 33;
+ # status "Unknown"@en .
+<http://localhost/lane/3405>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/10000/> ;
+ libns:lane_number 5 .
+<http://localhost/lane/3406>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/10000/> ;
+ libns:lane_number 6 .
+<http://localhost/lane/3407>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/10000/> ;
+ libns:lane_number 7 .
+<http://localhost/lane/3408>
+ libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
+ libns:library <http://localhost/library/10000/> ;
+ libns:lane_number 8 .
+
+<http://localhost/flowcell/42JUYAAXX/>
+ a libns:illumina_flowcell ;
+ libns:read_length 76 ;
+ libns:flowcell_type "Paired"@en ;
+ libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+ libns:has_lane <http://localhost/lane/4201> ;
+ libns:has_lane <http://localhost/lane/4202> ;
+ libns:has_lane <http://localhost/lane/4203> ;
+ libns:has_lane <http://localhost/lane/4204> ;
+ libns:has_lane <http://localhost/lane/4205> ;
+ libns:has_lane <http://localhost/lane/4206> ;
+ libns:has_lane <http://localhost/lane/4207> ;
+ libns:has_lane <http://localhost/lane/4208> ;
+ libns:flowcell_id "42JUYAAXX"@en .
+
+<http://localhost/lane/4201>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/1421/> ;
+ libns:lane_number 1 .
+<http://localhost/lane/4202>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/1421/> ;
+ libns:lane_number 2 .
+<http://localhost/lane/4203>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/1421/> ;
+ libns:lane_number 3 .
+<http://localhost/lane/4204>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/1421/> ;
+ libns:lane_number 4 .
+<http://localhost/lane/4205>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/11154/> ;
+ libns:lane_number 5 .
+ # paired_end 1;
+ # read_length 76;
+ # status "Unknown"@en .
+<http://localhost/lane/4206>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/1421/> ;
+ libns:lane_number 6 .
+<http://localhost/lane/4207>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/1421/> ;
+ libns:lane_number 7 .
+<http://localhost/lane/4208>
+ libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
+ libns:library <http://localhost/library/1421/> ;
+ libns:lane_number 8 .
+
+<http://localhost/flowcell/61MJTAAXX/>
+ a libns:illumina_flowcell ;
+ libns:read_length 76 ;
+ libns:flowcell_type "Single"@en ;
+ libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+ libns:has_lane <http://localhost/lane/6601> ;
+ libns:has_lane <http://localhost/lane/6602> ;
+ libns:has_lane <http://localhost/lane/6603> ;
+ libns:has_lane <http://localhost/lane/6604> ;
+ libns:has_lane <http://localhost/lane/6605> ;
+ libns:has_lane <http://localhost/lane/6606> ;
+ libns:has_lane <http://localhost/lane/6607> ;
+ libns:has_lane <http://localhost/lane/6608> ;
+ libns:flowcell_id "61MJTAAXX"@en .
+
+<http://localhost/lane/6601>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/1661/> ;
+ libns:lane_number 1 .
+<http://localhost/lane/6602>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/1661/> ;
+ libns:lane_number 2 .
+<http://localhost/lane/6603>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/1661/> ;
+ libns:lane_number 3 .
+<http://localhost/lane/6604>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/1661/> ;
+ libns:lane_number 4 .
+<http://localhost/lane/6605>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/1661/> ;
+ libns:lane_number 5 .
+<http://localhost/lane/6606>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/11154/> ;
+ libns:lane_number 6 .
+ # paired_end 1;
+ # read_length 76;
+ # status "Unknown"@en .
+<http://localhost/lane/6607>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/1661/> ;
+ libns:lane_number 7 .
+<http://localhost/lane/6608>
+ libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
+ libns:library <http://localhost/library/1661/> ;
+ libns:lane_number 8 .
+
+<http://localhost/flowcell/30DY0AAXX/>
+ a libns:illumina_flowcell ;
+ libns:read_length 76 ;
+ libns:flowcell_type "Paired"@en ;
+ libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+ libns:has_lane <http://localhost/lane/3801> ;
+ libns:has_lane <http://localhost/lane/3802> ;
+ libns:has_lane <http://localhost/lane/3803> ;
+ libns:has_lane <http://localhost/lane/3804> ;
+ libns:has_lane <http://localhost/lane/3805> ;
+ libns:has_lane <http://localhost/lane/3806> ;
+ libns:has_lane <http://localhost/lane/3807> ;
+ libns:has_lane <http://localhost/lane/3808> ;
+ libns:flowcell_id "30DY0AAXX"@en .
+
+<http://localhost/lane/3801>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/1331/> ;
+ libns:lane_number 1 .
+<http://localhost/lane/3802>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/1331/> ;
+ libns:lane_number 2 .
+<http://localhost/lane/3803>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/1331/> ;
+ libns:lane_number 3 .
+<http://localhost/lane/3804>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/1331/> ;
+ libns:lane_number 4 .
+<http://localhost/lane/3805>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/1331/> ;
+ libns:lane_number 5 .
+<http://localhost/lane/3806>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/1331/> ;
+ libns:lane_number 6 .
+<http://localhost/lane/3807>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/1331/> ;
+ libns:lane_number 7 .
+<http://localhost/lane/3808>
+ libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
+ libns:library <http://localhost/library/11154/> ;
+ libns:lane_number 8 .
+ # paired_end 1;
+ # read_length 76;
+ # status "Unknown"@en .
+
+<http://localhost/flowcell/C02F9ACXX/>
+ a libns:illumina_flowcell ;
+ libns:read_length 101 ;
+ libns:flowcell_type "Paired"@en ;
+ libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
+ libns:has_lane <http://localhost/lane/12300> ;
+ libns:has_lane <http://localhost/lane/12500> ;
+ libns:flowcell_id "C02F9ACXX"@en .
+
+<http://localhost/lane/12300>
+ libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+ libns:library <http://localhost/library/12345/> ;
+ libns:lane_number 3 .
+ # paired_end 1;
+ # read_length 101;
+ # status "Unknown"@en .
+
+<http://localhost/lane/12500>
+ libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
+ libns:library <http://localhost/library/11154/> ;
+ libns:lane_number 3 .
+ # paired_end 1;
+ # read_length 101;
+ # status "Unknown"@en .
+
+<http://localhost/library/11154/>
+ a libns:library ;
+ libns:affiliation "TSR"@en;
+ libns:concentration "29.7";
+ libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+ libns:experiment_type "RNA-seq"@en ;
+ libns:gel_cut 300 ;
+ libns:has_lane <http://localhost/lane/3404> ;
+ libns:has_lane <http://localhost/lane/4205> ;
+ libns:has_lane <http://localhost/lane/6606> ;
+ libns:has_lane <http://localhost/lane/3808> ;
+ libns:has_lane <http://localhost/lane/12500> ;
+ libns:insert_size 2000 ;
+ libns:library_id "11154"@en ;
+ libns:library_type "Paired End (Multiplexed)"@en ;
+ libns:made_by "Gary Gygax"@en ;
+ libns:name "Paired Ends ASDF"@en ;
+ libns:replicate "1"@en;
+ libns:species "Mus musculus"@en ;
+ libns:stopping_point "Completed"@en ;
+ libns:total_unique_locations 8841201 .
+ # cell_line
+
+
+<http://localhost/library/12345/>
+ a libns:library ;
+ libns:affiliation "TSR"@en;
+ libns:concentration "12.345";
+ libns:cell_line "Unknown"@en ;
+ libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
+ libns:experiment_type "RNA-seq"@en ;
+ libns:gel_cut 300 ;
+ libns:has_lane <http://localhost/lane/12300> ;
+ libns:insert_size 2000 ;
+ libns:library_id "12345"@en ;
+ libns:library_type "Paired End (Multiplexed)"@en ;
+ libns:made_by "Gary Gygax"@en ;
+ libns:name "Paired Ends THING"@en ;
+ libns:replicate "1"@en;
+ libns:species "Mus musculus"@en ;
+ libns:stopping_point "Completed"@en ;
+ libns:total_unique_locations 8841201 .
+ # cell_line
+"""
+HOST = "http://localhost"
class TestCondorFastq(unittest.TestCase):
def setUp(self):
with open(filename, 'w') as stream:
stream.write('testfile')
- self.subname = unicode('sub-11154')
- self.subdir = os.path.join(self.tempdir, self.subname)
- os.mkdir(self.subdir)
-
self.result_map = ResultMap()
- self.result_map['11154'] = self.subname
+ for lib_id in [u'11154', u'12345']:
+ subname = 'sub-%s' % (lib_id,)
+ sub_dir = os.path.join(self.tempdir, subname)
+ os.mkdir(sub_dir)
+ self.result_map[lib_id] = sub_dir
+
+ self.extract = CondorFastqExtract(HOST,
+ self.flowcelldir,
+ self.logdir)
+ load_string_into_model(self.extract.model, 'turtle', lib_turtle)
def tearDown(self):
shutil.rmtree(self.tempdir)
os.chdir(self.cwd)
+ def test_find_relavant_flowcell_ids(self):
+ expected = set(('30221AAXX',
+ '42JUYAAXX',
+ '61MJTAAXX',
+ '30DY0AAXX',
+ 'C02F9ACXX'))
+ flowcell_ids = self.extract.find_relavant_flowcell_ids()
+ self.assertEqual(flowcell_ids, expected)
+
def test_find_archive_sequence(self):
- extract = condorfastq.CondorFastqExtract('host',
- FAKE_APIDATA,
- self.tempdir,
- self.logdir)
- extract.api = FakeApi()
-
- lib_db = extract.find_archive_sequence_files(self.result_map)
-
- self.failUnlessEqual(len(lib_db['11154']['lanes']), 5)
- lanes = [
- lib_db['11154']['lanes'][(u'30221AAXX', 4)],
- lib_db['11154']['lanes'][(u'42JUYAAXX', 5)],
- lib_db['11154']['lanes'][(u'61MJTAAXX', 6)],
- lib_db['11154']['lanes'][(u'30DY0AAXX', 8)],
- lib_db['11154']['lanes'][(u'C02F9ACXX', 3)],
- ]
- self.failUnlessEqual(len(lanes[0]), 1)
- self.failUnlessEqual(len(lanes[1]), 2)
- self.failUnlessEqual(len(lanes[2]), 1)
- self.failUnlessEqual(len(lanes[3]), 1)
- self.failUnlessEqual(len(lanes[4]), 4)
+ seqs = self.extract.find_archive_sequence_files(self.result_map)
+
+ expected = set([
+ (u'11154', u'42JUYAAXX', 5, 1, 76, True, 'qseq'),
+ (u'11154', u'42JUYAAXX', 5, 2, 76, True, 'qseq'),
+ (u'11154', u'61MJTAAXX', 6, 1, 76, False, 'qseq'),
+ (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+ (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+ (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+ (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+ (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+ (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+ (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+ (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+ (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
+ (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
+ (u'11154', u'30221AAXX', 4, 1, 33, False, 'srf'),
+ (u'11154', u'30DY0AAXX', 8, 1, 151, True, 'srf')
+ ])
+ found = set([(l.library_id, l.flowcell_id, l.lane_number, l.read, l.cycle, l.ispaired, l.filetype) for l in seqs])
+ self.assertEqual(expected, found)
def test_find_needed_targets(self):
+ lib_db = self.extract.find_archive_sequence_files(self.result_map)
- extract = condorfastq.CondorFastqExtract('host',
- FAKE_APIDATA,
- self.tempdir,
- self.logdir)
- extract.api = FakeApi()
- lib_db = extract.find_archive_sequence_files(self.result_map)
-
- needed_targets = extract.find_missing_targets(self.result_map,
- lib_db)
- self.failUnlessEqual(len(needed_targets), 7)
+ needed_targets = self.extract.find_missing_targets(self.result_map,
+ lib_db)
+ self.assertEqual(len(needed_targets), 9)
srf_30221 = needed_targets[
- self.subname + u'/11154_30221AAXX_c33_l4.fastq']
+ self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq']
qseq_42JUY_r1 = needed_targets[
- self.subname + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
+ self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
qseq_42JUY_r2 = needed_targets[
- self.subname + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
+ self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
qseq_61MJT = needed_targets[
- self.subname + u'/11154_61MJTAAXX_c76_l6.fastq']
+ self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq']
split_C02F9_r1 = needed_targets[
- self.subname + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
+ self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
split_C02F9_r2 = needed_targets[
- self.subname + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
+ self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
- self.failUnlessEqual(len(srf_30221['srf']), 1)
- self.failUnlessEqual(len(qseq_42JUY_r1['qseq']), 1)
- self.failUnlessEqual(len(qseq_42JUY_r2['qseq']), 1)
- self.failUnlessEqual(len(qseq_61MJT['qseq']), 1)
- self.failUnlessEqual(len(split_C02F9_r1['split_fastq']), 2)
- self.failUnlessEqual(len(split_C02F9_r2['split_fastq']), 2)
-
- #print '-------needed targets---------'
- #pprint(needed_targets)
+ self.assertEqual(len(srf_30221['srf']), 1)
+ self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
+ self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
+ self.assertEqual(len(qseq_61MJT['qseq']), 1)
+ self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
+ self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)
def test_generate_fastqs(self):
- extract = condorfastq.CondorFastqExtract('host',
- FAKE_APIDATA,
- self.tempdir,
- self.logdir)
- extract.api = FakeApi()
- commands = extract.build_condor_arguments(self.result_map)
+ commands = self.extract.build_condor_arguments(self.result_map)
srf = commands['srf']
qseq = commands['qseq']
split = commands['split_fastq']
- self.failUnlessEqual(len(srf), 2)
- self.failUnlessEqual(len(qseq), 3)
- self.failUnlessEqual(len(split), 2)
+ self.assertEqual(len(srf), 2)
+ self.assertEqual(len(qseq), 3)
+ self.assertEqual(len(split), 4)
srf_data = {
- os.path.join(self.subname, '11154_30221AAXX_c33_l4.fastq'): {
+ os.path.join(self.result_map['11154'],
+ '11154_30221AAXX_c33_l4.fastq'): {
'mid': None,
'ispaired': False,
'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
'flowcell': u'30221AAXX',
- 'target': os.path.join(self.subname,
+ 'target': os.path.join(self.result_map['11154'],
u'11154_30221AAXX_c33_l4.fastq'),
},
- os.path.join(self.subname, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
+ os.path.join(self.result_map['11154'],
+ '11154_30DY0AAXX_c151_l8_r1.fastq'): {
'mid': None,
'ispaired': True,
'flowcell': u'30DY0AAXX',
'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
'mid': 76,
'target':
- os.path.join(self.subname,
+ os.path.join(self.result_map['11154'],
u'11154_30DY0AAXX_c151_l8_r1.fastq'),
'target_right':
- os.path.join(self.subname,
+ os.path.join(self.result_map['11154'],
u'11154_30DY0AAXX_c151_l8_r2.fastq'),
}
}
for args in srf:
expected = srf_data[args['target']]
- self.failUnlessEqual(args['ispaired'], expected['ispaired'])
- self.failUnlessEqual(len(args['sources']), 1)
+ self.assertEqual(args['ispaired'], expected['ispaired'])
+ self.assertEqual(len(args['sources']), 1)
_, source_filename = os.path.split(args['sources'][0])
- self.failUnlessEqual(source_filename, expected['sources'][0])
- self.failUnlessEqual(args['target'], expected['target'])
+ self.assertEqual(source_filename, expected['sources'][0])
+ self.assertEqual(args['target'], expected['target'])
if args['ispaired']:
- self.failUnlessEqual(args['target_right'],
+ self.assertEqual(args['target_right'],
expected['target_right'])
if 'mid' in expected:
- self.failUnlessEqual(args['mid'], expected['mid'])
+ self.assertEqual(args['mid'], expected['mid'])
qseq_data = {
- os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
+ os.path.join(self.result_map['11154'],
+ '11154_42JUYAAXX_c76_l5_r1.fastq'): {
'istar': True,
'ispaired': True,
'sources': [
u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
},
- os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
+ os.path.join(self.result_map['11154'],
+ '11154_42JUYAAXX_c76_l5_r2.fastq'): {
'istar': True,
'ispaired': True,
'sources': [
u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
},
- os.path.join(self.subname, '11154_61MJTAAXX_c76_l6.fastq'): {
+ os.path.join(self.result_map['11154'],
+ '11154_61MJTAAXX_c76_l6.fastq'): {
'istar': True,
'ispaired': False,
'sources': [
}
for args in qseq:
expected = qseq_data[args['target']]
- self.failUnlessEqual(args['istar'], expected['istar'])
- self.failUnlessEqual(args['ispaired'], expected['ispaired'])
+ self.assertEqual(args['istar'], expected['istar'])
+ self.assertEqual(args['ispaired'], expected['ispaired'])
for i in range(len(expected['sources'])):
_, filename = os.path.split(args['sources'][i])
- self.failUnlessEqual(filename, expected['sources'][i])
+ self.assertEqual(filename, expected['sources'][i])
split_test = dict((( x['target'], x) for x in
{'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
u'11154_NoIndex_L003_R2_002.fastq.gz'],
'pyscript': 'desplit_fastq.pyc',
- 'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}]
+ 'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
+ {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
+ u'12345_CGATGT_L003_R1_002.fastq.gz',
+ u'12345_CGATGT_L003_R1_003.fastq.gz',
+ ],
+ 'pyscript': 'desplit_fastq.pyc',
+ 'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
+ {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
+ u'12345_CGATGT_L003_R2_002.fastq.gz',
+ u'12345_CGATGT_L003_R2_003.fastq.gz',
+ ],
+ 'pyscript': 'desplit_fastq.pyc',
+ 'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'}
+ ]
))
for arg in split:
_, target = os.path.split(arg['target'])
pyscript = split_test[target]['pyscript']
- self.failUnless(arg['pyscript'].endswith(pyscript))
+ self.assertTrue(arg['pyscript'].endswith(pyscript))
filename = split_test[target]['target']
- self.failUnless(arg['target'].endswith(filename))
+ self.assertTrue(arg['target'].endswith(filename))
for s_index in range(len(arg['sources'])):
s1 = arg['sources'][s_index]
s2 = split_test[target]['sources'][s_index]
- self.failUnless(s1.endswith(s2))
-
- #print '-------commands---------'
- #pprint (commands)
+ self.assertTrue(s1.endswith(s2))
def test_create_scripts(self):
- os.chdir(self.tempdir)
- extract = condorfastq.CondorFastqExtract('host',
- FAKE_APIDATA,
- self.tempdir,
- self.logdir)
- extract.api = FakeApi()
- extract.create_scripts(self.result_map)
-
- self.failUnless(os.path.exists('srf.condor'))
+ self.extract.create_scripts(self.result_map)
+
+ self.assertTrue(os.path.exists('srf.condor'))
with open('srf.condor', 'r') as srf:
arguments = [ l for l in srf if l.startswith('argument') ]
arguments.sort()
- self.failUnlessEqual(len(arguments), 2)
- self.failUnless('--single sub-11154/11154_30221AAXX_c33_l4.fastq'
+ self.assertEqual(len(arguments), 2)
+ self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq'
in arguments[0])
- self.failUnless(
- '--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
+ self.assertTrue(
+ 'sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
arguments[1])
- self.failUnless(os.path.exists('qseq.condor'))
+ self.assertTrue(os.path.exists('qseq.condor'))
with open('qseq.condor', 'r') as srf:
arguments = [ l for l in srf if l.startswith('argument') ]
arguments.sort()
- self.failUnlessEqual(len(arguments), 3)
- self.failUnless('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
+ self.assertEqual(len(arguments), 3)
+ self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
arguments[0])
- self.failUnless(
+ self.assertTrue(
'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
arguments[1])
- self.failUnless('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
+ self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
arguments[2])
- self.failUnless(os.path.exists('split_fastq.condor'))
+ self.assertTrue(os.path.exists('split_fastq.condor'))
with open('split_fastq.condor', 'r') as split:
arguments = [ l for l in split if l.startswith('argument') ]
arguments.sort()
- self.failUnlessEqual(len(arguments), 2)
- self.failUnless('11154_NoIndex_L003_R1_001.fastq.gz' in \
+ self.assertEqual(len(arguments), 4)
+ # Lane 3 Read 1
+ self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in \
arguments[0])
- self.failUnless('11154_NoIndex_L003_R2_002.fastq.gz' in \
+ # Lane 3 Read 2
+ self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in \
arguments[1])
+ # Lane 3 Read 1
+ self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2])
+ self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2])
+ self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2])
+ self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2])
+
+ # Lane 3 Read 2
+ self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3])
+ self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3])
+ self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3])
+ self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3])
+
def suite():
suite = unittest.makeSuite(TestCondorFastq, 'test')