From 20a7487b3db5104392a70ffb62ad92a00e0057c1 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 31 Aug 2012 12:29:59 -0700 Subject: [PATCH] Fix matching scanned sequence files to library IDs work for hiseq runs. The previous version was keying off of flowcell/lane so if you had multiple libraries from the same flowcell/lane all the sequences would end up in one of the libraries. Hopefully this fixes that. Though to do this I ended up changing the whole structure of condorfastq to be based on updating an RDF model. This depends on the sequence.py module changes of saving things to rdf models -- and the new code to infer library ids at that layer. --- htsworkflow/submission/condorfastq.py | 265 +++++--- .../submission/test/test_condorfastq.py | 607 ++++++++++++------ 2 files changed, 599 insertions(+), 273 deletions(-) diff --git a/htsworkflow/submission/condorfastq.py b/htsworkflow/submission/condorfastq.py index 64eb6a1..5ed9243 100644 --- a/htsworkflow/submission/condorfastq.py +++ b/htsworkflow/submission/condorfastq.py @@ -2,26 +2,34 @@ """ import logging import os -from pprint import pformat +from pprint import pformat,pprint import sys import types +from urlparse import urljoin, urlparse -from htsworkflow.pipelines.sequences import scan_for_sequences +from htsworkflow.pipelines.sequences import scan_for_sequences, \ + update_model_sequence_library from htsworkflow.pipelines.samplekey import SampleKey from htsworkflow.pipelines import qseq2fastq from htsworkflow.pipelines import srf2fastq from htsworkflow.pipelines import desplit_fastq -from htsworkflow.util.api import HtswApi +from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \ + fromTypedNode, \ + libraryOntology, \ + stripNamespace, \ + rdfNS from htsworkflow.util.conversion import parse_flowcell_id from django.conf import settings from django.template import Context, loader +import RDF + LOGGER = logging.getLogger(__name__) class CondorFastqExtract(object): - def __init__(self, host, apidata, sequences_path, + def __init__(self, host, sequences_path, log_path='log', force=False): """Extract fastqs from results archive @@ -33,7 +41,8 @@ class CondorFastqExtract(object): log_path (str): where to put condor log files force (bool): do we force overwriting current files? """ - self.api = HtswApi(host, apidata) + self.host = host + self.model = get_model() self.sequences_path = sequences_path self.log_path = log_path self.force = force @@ -62,7 +71,7 @@ class CondorFastqExtract(object): 'logdir': self.log_path, 'env': env, 'args': condor_entries[script_type], - 'root_url': self.api.root_url, + 'root_url': self.host, } context = Context(variables) @@ -79,8 +88,8 @@ class CondorFastqExtract(object): 'split_fastq': self.condor_desplit_fastq } by_sample = {} - lib_db = self.find_archive_sequence_files(result_map) - needed_targets = self.find_missing_targets(result_map, lib_db) + sequences = self.find_archive_sequence_files(result_map) + needed_targets = self.find_missing_targets(result_map, sequences) for target_pathname, available_sources in needed_targets.items(): LOGGER.debug(' target : %s' % (target_pathname,)) @@ -98,7 +107,7 @@ class CondorFastqExtract(object): condor_entries.setdefault(condor_type, []).append( conversion(sources, target_pathname)) for s in sources: - by_sample.setdefault(s.lane_id,[]).append( + by_sample.setdefault(s.lane_number,[]).append( target_pathname) else: print " need file", target_pathname @@ -110,41 +119,83 @@ class CondorFastqExtract(object): """ Find archived sequence files associated with our results. """ - LOGGER.debug("Searching for sequence files in: %s" %(self.sequences_path,)) - - lib_db = {} - seq_dirs = set() - candidate_lanes = {} + self.import_libraries(result_map) + flowcell_ids = self.find_relavant_flowcell_ids() + self.import_sequences(flowcell_ids) + + query = RDF.SPARQLQuery(""" + prefix libns: + prefix rdf: + prefix xsd: + + select ?filenode ?filetype ?cycle ?lane_number ?read + ?library ?library_id + ?flowcell ?flowcell_id ?read_length + ?flowcell_type ?flowcell_status + where { + ?filenode libns:cycle ?cycle ; + libns:lane_number ?lane_number ; + libns:read ?read ; + libns:flowcell ?flowcell ; + libns:flowcell_id ?flowcell_id ; + libns:library ?library ; + libns:library_id ?library_id ; + rdf:type ?filetype ; + a libns:raw_file . + ?flowcell libns:read_length ?read_length ; + libns:flowcell_type ?flowcell_type . + OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status } + FILTER(?filetype != libns:raw_file) + } + """) + results = [] + for r in query.execute(self.model): + library_id = fromTypedNode(r['library_id']) + if library_id in result_map: + results.append(SequenceResult(r)) + return results + + def import_libraries(self, result_map): for lib_id in result_map.keys(): - lib_info = self.api.get_library(lib_id) - lib_info['lanes'] = {} - lib_db[lib_id] = lib_info - - for lane in lib_info['lane_set']: - lane_key = (lane['flowcell'], lane['lane_number']) - candidate_lanes[lane_key] = (lib_id, lane['lane_id']) - seq_dirs.add(os.path.join(self.sequences_path, - 'flowcells', - lane['flowcell'])) - LOGGER.debug("Seq_dirs = %s" %(unicode(seq_dirs))) - candidate_seq_list = scan_for_sequences(seq_dirs) - - # at this point we have too many sequences as scan_for_sequences - # returns all the sequences in a flowcell directory - # so lets filter out the extras - - for seq in candidate_seq_list: - lane_key = (seq.flowcell, seq.lane) - candidate_key = candidate_lanes.get(lane_key, None) - if candidate_key is not None: - lib_id, lane_id = candidate_key - seq.lane_id = lane_id - lib_info = lib_db[lib_id] - lib_info['lanes'].setdefault(lane_key, set()).add(seq) - - return lib_db - - def find_missing_targets(self, result_map, lib_db): + liburl = urljoin(self.host, 'library/%s/' % (lib_id,)) + library = RDF.Node(RDF.Uri(liburl)) + self.import_library(library) + + def import_library(self, library): + """Import library data into our model if we don't have it already + """ + q = RDF.Statement(library, rdfNS['type'], libraryOntology['library']) + if not self.model.contains_statement(q): + load_into_model(self.model, 'rdfa', library) + + def find_relavant_flowcell_ids(self): + """Generate set of flowcell ids that had samples of interest on them + """ + flowcell_query =RDF.SPARQLQuery(""" +prefix libns: + +select distinct ?library ?flowcell ?flowcell_id +WHERE { + ?library a libns:library ; + libns:has_lane ?lane . + ?lane libns:flowcell ?flowcell . + ?flowcell libns:flowcell_id ?flowcell_id . +}""") + flowcell_ids = set() + for r in flowcell_query.execute(self.model): + flowcell_ids.add( fromTypedNode(r['flowcell_id']) ) + LOGGER.debug("Flowcells = %s" %(unicode(flowcell_ids))) + return flowcell_ids + + def import_sequences(self, flowcell_ids): + seq_dirs = ( os.path.join(self.sequences_path, f) + for f in flowcell_ids ) + sequences = scan_for_sequences(seq_dirs) + for seq in sequences: + seq.save_to_model(self.model) + update_model_sequence_library(self.model, self.host) + + def find_missing_targets(self, result_map, raw_files): """ Check if the sequence file exists. This requires computing what the sequence name is and checking @@ -156,61 +207,48 @@ class CondorFastqExtract(object): fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq' # find what targets we're missing needed_targets = {} - for lib_id in result_map.keys(): - result_dir = result_map[lib_id] - lib = lib_db[lib_id] - lane_dict = make_lane_dict(lib_db, lib_id) - - for lane_key, sequences in lib['lanes'].items(): - for seq in sequences: - seq.paired = lane_dict[seq.flowcell]['paired_end'] - lane_status = lane_dict[seq.flowcell]['status'] - - if seq.paired and seq.read is None: - seq.read = 1 - filename_attributes = { - 'flowcell': seq.flowcell, - 'lib_id': lib_id, - 'lane': seq.lane, - 'read': seq.read, - 'cycle': seq.cycle - } - # skip bad runs - if lane_status == 'Failed': - continue - if seq.flowcell == '30DY0AAXX': - # 30DY0 only ran for 151 bases instead of 152 - # it is actually 76 1st read, 75 2nd read - seq.mid_point = 76 - - # end filters - if seq.paired: - target_name = fastq_paired_template % \ - filename_attributes - else: - target_name = fastq_single_template % \ - filename_attributes - - target_pathname = os.path.join(result_dir, target_name) - if self.force or not os.path.exists(target_pathname): - t = needed_targets.setdefault(target_pathname, {}) - t.setdefault(seq.filetype, []).append(seq) + for seq in raw_files: + if not seq.isgood: + continue + filename_attributes = { + 'flowcell': seq.flowcell_id, + 'lib_id': seq.library_id, + 'lane': seq.lane_number, + 'read': seq.read, + 'cycle': seq.cycle + } + + if seq.ispaired: + target_name = fastq_paired_template % \ + filename_attributes + else: + target_name = fastq_single_template % \ + filename_attributes - return needed_targets + result_dir = result_map[seq.library_id] + target_pathname = os.path.join(result_dir, target_name) + if self.force or not os.path.exists(target_pathname): + t = needed_targets.setdefault(target_pathname, {}) + t.setdefault(seq.filetype, []).append(seq) + return needed_targets def condor_srf_to_fastq(self, sources, target_pathname): if len(sources) > 1: raise ValueError("srf to fastq can only handle one file") + mid_point = None + if sources[0].flowcell_id == '30DY0AAXX': + mid_point = 76 + return { - 'sources': [os.path.abspath(sources[0].path)], + 'sources': [sources[0].path], 'pyscript': srf2fastq.__file__, - 'flowcell': sources[0].flowcell, - 'ispaired': sources[0].paired, + 'flowcell': sources[0].flowcell_id, + 'ispaired': sources[0].ispaired, 'target': target_pathname, 'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'), - 'mid': getattr(sources[0], 'mid_point', None), + 'mid': mid_point, 'force': self.force, } @@ -221,10 +259,10 @@ class CondorFastqExtract(object): paths.sort() return { 'pyscript': qseq2fastq.__file__, - 'flowcell': sources[0].flowcell, + 'flowcell': sources[0].flowcell_id, 'target': target_pathname, 'sources': paths, - 'ispaired': sources[0].paired, + 'ispaired': sources[0].ispaired, 'istar': len(sources) == 1, } @@ -237,11 +275,9 @@ class CondorFastqExtract(object): 'pyscript': desplit_fastq.__file__, 'target': target_pathname, 'sources': paths, - 'ispaired': sources[0].paired, + 'ispaired': sources[0].ispaired, } - def lane_rdf(self, sources, target_pathname): - pass def make_lane_dict(lib_db, lib_id): """ @@ -255,3 +291,48 @@ def make_lane_dict(lib_db, lib_id): result.append((lane['flowcell'], lane)) return dict(result) +class SequenceResult(object): + """Convert the sparql query result from find_archive_sequence_files + """ + def __init__(self, result): + self.filenode = result['filenode'] + self._filetype = result['filetype'] + self.cycle = fromTypedNode(result['cycle']) + self.lane_number = fromTypedNode(result['lane_number']) + self.read = fromTypedNode(result['read']) + if type(self.read) in types.StringTypes: + self.read = 1 + self.library = result['library'] + self.library_id = fromTypedNode(result['library_id']) + self.flowcell = result['flowcell'] + self.flowcell_id = fromTypedNode(result['flowcell_id']) + self.flowcell_type = fromTypedNode(result['flowcell_type']) + self.flowcell_status = fromTypedNode(result['flowcell_status']) + + def _is_good(self): + """is this sequence / flowcell 'good enough'""" + if self.flowcell_status is not None and \ + self.flowcell_status.lower() == "failed": + return False + return True + isgood = property(_is_good) + + def _get_ispaired(self): + if self.flowcell_type.lower() == "paired": + return True + else: + return False + ispaired = property(_get_ispaired) + + def _get_filetype(self): + return stripNamespace(libraryOntology, self._filetype) + filetype = property(_get_filetype) + + def _get_path(self): + url = urlparse(str(self.filenode.uri)) + if url.scheme == 'file': + return url.path + else: + errmsg = u"Unsupported scheme {0} for {1}" + raise ValueError(errmsg.format(url.scheme, unicode(url))) + path = property(_get_path) diff --git a/htsworkflow/submission/test/test_condorfastq.py b/htsworkflow/submission/test/test_condorfastq.py index 6aaf584..24d6e49 100644 --- a/htsworkflow/submission/test/test_condorfastq.py +++ b/htsworkflow/submission/test/test_condorfastq.py @@ -7,8 +7,9 @@ import shutil import tempfile import unittest -from htsworkflow.submission import condorfastq +from htsworkflow.submission.condorfastq import CondorFastqExtract from htsworkflow.submission.results import ResultMap +from htsworkflow.util.rdfhelp import load_string_into_model, dump_model FCDIRS = [ 'C02F9ACXX', @@ -16,6 +17,7 @@ FCDIRS = [ 'C02F9ACXX/C1-202/Project_11154', 'C02F9ACXX/C1-202/Project_12342_Index1', 'C02F9ACXX/C1-202/Project_12342_Index2', + 'C02F9ACXX/C1-202/Project_12345', '42JUYAAXX', '42JUYAAXX/C1-76', '30221AAXX', @@ -31,9 +33,18 @@ DATAFILES = [ 'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz', 'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz', 'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz', - 'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz', - 'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz', - 'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz', + 'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz', '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2', '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2', '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2', @@ -77,77 +88,288 @@ DATAFILES = [ '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2', ] -LIBDATA = { - '11154':{u'antibody_id': None, - u'cell_line': u'Unknown', - u'cell_line_id': 1, - u'experiment_type': u'RNA-seq', - u'experiment_type_id': 4, - u'gel_cut_size': 300, - u'hidden': False, - u'id': u'11154', - u'insert_size': 200, - u'lane_set': [{u'flowcell': u'30221AAXX', - u'lane_number': 4, - u'lane_id': 3400, - u'paired_end': False, - u'read_length': 33, - u'status': u'Unknown', - u'status_code': None}, - {u'flowcell': u'42JUYAAXX', - u'lane_number': 5, - u'lane_id': 4200, - u'paired_end': True, - u'read_length': 76, - u'status': u'Unknown', - u'status_code': None}, - {u'flowcell': u'61MJTAAXX', - u'lane_number': 6, - u'lane_id': 6600, - u'paired_end': False, - u'read_length': 76, - u'status': u'Unknown', - u'status_code': None}, - {u'flowcell': u'30DY0AAXX', - u'lane_number': 8, - u'lane_id': 3800, - u'paired_end': True, - u'read_length': 76, - u'status': u'Unknown', - u'status_code': None}, - {u'flowcell': u'C02F9ACXX', - u'lane_number': 3, - u'lane_id': 12300, - u'paired_end': True, - u'read_length': 101, - u'status': u'Unknown', - u'status_code': None}], - u'library_id': u'11154', - u'library_name': u'Paired ends ASDF ', - u'library_species': u'Mus musculus', - u'library_species_id': 9, - u'library_type': u'Paired End (non-multiplexed)', - u'library_type_id': 2, - u'made_by': u'Gary Gygax', - u'made_for': u'TSR', - u'notes': u'300 bp gel fragment', - u'replicate': 1, - u'stopping_point': u'1Aa', - u'successful_pM': None, - u'undiluted_concentration': u'29.7'} - } - -FAKE_APIDATA = {'apiid':0, 'apikey': 'foo'} - -class FakeApi(object): - def __init__(self, *args, **kwargs): - self.root_url = 'http://localhost' - - def get_library(self, libid): - lib_data = LIBDATA[libid] - return copy.deepcopy(lib_data) - - +lib_turtle = """@prefix : . +@prefix rdfs: . +@prefix dc: . +@prefix xsd: . +@prefix libns: . +@prefix seqns: . +@prefix invns: . + + + a libns:illumina_flowcell ; + libns:read_length 33 ; + libns:flowcell_type "Single"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "30221AAXX"@en . + + + libns:flowcell ; + libns:library ; + libns:lane_number 1 . + + libns:flowcell ; + libns:library ; + libns:lane_number 2 . + + libns:flowcell ; + libns:library ; + libns:lane_number 3 . + + libns:flowcell ; + libns:library ; + libns:lane_number 4 . + # paired_end 1; + # read_length 33; + # status "Unknown"@en . + + libns:flowcell ; + libns:library ; + libns:lane_number 5 . + + libns:flowcell ; + libns:library ; + libns:lane_number 6 . + + libns:flowcell ; + libns:library ; + libns:lane_number 7 . + + libns:flowcell ; + libns:library ; + libns:lane_number 8 . + + + a libns:illumina_flowcell ; + libns:read_length 76 ; + libns:flowcell_type "Paired"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "42JUYAAXX"@en . + + + libns:flowcell ; + libns:library ; + libns:lane_number 1 . + + libns:flowcell ; + libns:library ; + libns:lane_number 2 . + + libns:flowcell ; + libns:library ; + libns:lane_number 3 . + + libns:flowcell ; + libns:library ; + libns:lane_number 4 . + + libns:flowcell ; + libns:library ; + libns:lane_number 5 . + # paired_end 1; + # read_length 76; + # status "Unknown"@en . + + libns:flowcell ; + libns:library ; + libns:lane_number 6 . + + libns:flowcell ; + libns:library ; + libns:lane_number 7 . + + libns:flowcell ; + libns:library ; + libns:lane_number 8 . + + + a libns:illumina_flowcell ; + libns:read_length 76 ; + libns:flowcell_type "Single"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "61MJTAAXX"@en . + + + libns:flowcell ; + libns:library ; + libns:lane_number 1 . + + libns:flowcell ; + libns:library ; + libns:lane_number 2 . + + libns:flowcell ; + libns:library ; + libns:lane_number 3 . + + libns:flowcell ; + libns:library ; + libns:lane_number 4 . + + libns:flowcell ; + libns:library ; + libns:lane_number 5 . + + libns:flowcell ; + libns:library ; + libns:lane_number 6 . + # paired_end 1; + # read_length 76; + # status "Unknown"@en . + + libns:flowcell ; + libns:library ; + libns:lane_number 7 . + + libns:flowcell ; + libns:library ; + libns:lane_number 8 . + + + a libns:illumina_flowcell ; + libns:read_length 76 ; + libns:flowcell_type "Paired"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "30DY0AAXX"@en . + + + libns:flowcell ; + libns:library ; + libns:lane_number 1 . + + libns:flowcell ; + libns:library ; + libns:lane_number 2 . + + libns:flowcell ; + libns:library ; + libns:lane_number 3 . + + libns:flowcell ; + libns:library ; + libns:lane_number 4 . + + libns:flowcell ; + libns:library ; + libns:lane_number 5 . + + libns:flowcell ; + libns:library ; + libns:lane_number 6 . + + libns:flowcell ; + libns:library ; + libns:lane_number 7 . + + libns:flowcell ; + libns:library ; + libns:lane_number 8 . + # paired_end 1; + # read_length 76; + # status "Unknown"@en . + + + a libns:illumina_flowcell ; + libns:read_length 101 ; + libns:flowcell_type "Paired"@en ; + libns:date "2012-01-19T20:23:26"^^xsd:dateTime; + libns:has_lane ; + libns:has_lane ; + libns:flowcell_id "C02F9ACXX"@en . + + + libns:flowcell ; + libns:library ; + libns:lane_number 3 . + # paired_end 1; + # read_length 101; + # status "Unknown"@en . + + + libns:flowcell ; + libns:library ; + libns:lane_number 3 . + # paired_end 1; + # read_length 101; + # status "Unknown"@en . + + + a libns:library ; + libns:affiliation "TSR"@en; + libns:concentration "29.7"; + libns:date "2012-12-28T00:00:00"^^xsd:dateTime ; + libns:experiment_type "RNA-seq"@en ; + libns:gel_cut 300 ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:has_lane ; + libns:insert_size 2000 ; + libns:library_id "11154"@en ; + libns:library_type "Paired End (Multiplexed)"@en ; + libns:made_by "Gary Gygax"@en ; + libns:name "Paired Ends ASDF"@en ; + libns:replicate "1"@en; + libns:species "Mus musculus"@en ; + libns:stopping_point "Completed"@en ; + libns:total_unique_locations 8841201 . + # cell_line + + + + a libns:library ; + libns:affiliation "TSR"@en; + libns:concentration "12.345"; + libns:cell_line "Unknown"@en ; + libns:date "2012-12-28T00:00:00"^^xsd:dateTime ; + libns:experiment_type "RNA-seq"@en ; + libns:gel_cut 300 ; + libns:has_lane ; + libns:insert_size 2000 ; + libns:library_id "12345"@en ; + libns:library_type "Paired End (Multiplexed)"@en ; + libns:made_by "Gary Gygax"@en ; + libns:name "Paired Ends THING"@en ; + libns:replicate "1"@en; + libns:species "Mus musculus"@en ; + libns:stopping_point "Completed"@en ; + libns:total_unique_locations 8841201 . + # cell_line +""" +HOST = "http://localhost" class TestCondorFastq(unittest.TestCase): def setUp(self): @@ -168,141 +390,146 @@ class TestCondorFastq(unittest.TestCase): with open(filename, 'w') as stream: stream.write('testfile') - self.subname = unicode('sub-11154') - self.subdir = os.path.join(self.tempdir, self.subname) - os.mkdir(self.subdir) - self.result_map = ResultMap() - self.result_map['11154'] = self.subname + for lib_id in [u'11154', u'12345']: + subname = 'sub-%s' % (lib_id,) + sub_dir = os.path.join(self.tempdir, subname) + os.mkdir(sub_dir) + self.result_map[lib_id] = sub_dir + + self.extract = CondorFastqExtract(HOST, + self.flowcelldir, + self.logdir) + load_string_into_model(self.extract.model, 'turtle', lib_turtle) def tearDown(self): shutil.rmtree(self.tempdir) os.chdir(self.cwd) + def test_find_relavant_flowcell_ids(self): + expected = set(('30221AAXX', + '42JUYAAXX', + '61MJTAAXX', + '30DY0AAXX', + 'C02F9ACXX')) + flowcell_ids = self.extract.find_relavant_flowcell_ids() + self.assertEqual(flowcell_ids, expected) + def test_find_archive_sequence(self): - extract = condorfastq.CondorFastqExtract('host', - FAKE_APIDATA, - self.tempdir, - self.logdir) - extract.api = FakeApi() - - lib_db = extract.find_archive_sequence_files(self.result_map) - - self.failUnlessEqual(len(lib_db['11154']['lanes']), 5) - lanes = [ - lib_db['11154']['lanes'][(u'30221AAXX', 4)], - lib_db['11154']['lanes'][(u'42JUYAAXX', 5)], - lib_db['11154']['lanes'][(u'61MJTAAXX', 6)], - lib_db['11154']['lanes'][(u'30DY0AAXX', 8)], - lib_db['11154']['lanes'][(u'C02F9ACXX', 3)], - ] - self.failUnlessEqual(len(lanes[0]), 1) - self.failUnlessEqual(len(lanes[1]), 2) - self.failUnlessEqual(len(lanes[2]), 1) - self.failUnlessEqual(len(lanes[3]), 1) - self.failUnlessEqual(len(lanes[4]), 4) + seqs = self.extract.find_archive_sequence_files(self.result_map) + + expected = set([ + (u'11154', u'42JUYAAXX', 5, 1, 76, True, 'qseq'), + (u'11154', u'42JUYAAXX', 5, 2, 76, True, 'qseq'), + (u'11154', u'61MJTAAXX', 6, 1, 76, False, 'qseq'), + (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'), + (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'), + (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'), + (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'), + (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'), + (u'11154', u'30221AAXX', 4, 1, 33, False, 'srf'), + (u'11154', u'30DY0AAXX', 8, 1, 151, True, 'srf') + ]) + found = set([(l.library_id, l.flowcell_id, l.lane_number, l.read, l.cycle, l.ispaired, l.filetype) for l in seqs]) + self.assertEqual(expected, found) def test_find_needed_targets(self): + lib_db = self.extract.find_archive_sequence_files(self.result_map) - extract = condorfastq.CondorFastqExtract('host', - FAKE_APIDATA, - self.tempdir, - self.logdir) - extract.api = FakeApi() - lib_db = extract.find_archive_sequence_files(self.result_map) - - needed_targets = extract.find_missing_targets(self.result_map, - lib_db) - self.failUnlessEqual(len(needed_targets), 7) + needed_targets = self.extract.find_missing_targets(self.result_map, + lib_db) + self.assertEqual(len(needed_targets), 9) srf_30221 = needed_targets[ - self.subname + u'/11154_30221AAXX_c33_l4.fastq'] + self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq'] qseq_42JUY_r1 = needed_targets[ - self.subname + u'/11154_42JUYAAXX_c76_l5_r1.fastq'] + self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq'] qseq_42JUY_r2 = needed_targets[ - self.subname + u'/11154_42JUYAAXX_c76_l5_r2.fastq'] + self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq'] qseq_61MJT = needed_targets[ - self.subname + u'/11154_61MJTAAXX_c76_l6.fastq'] + self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq'] split_C02F9_r1 = needed_targets[ - self.subname + u'/11154_C02F9ACXX_c202_l3_r1.fastq'] + self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq'] split_C02F9_r2 = needed_targets[ - self.subname + u'/11154_C02F9ACXX_c202_l3_r2.fastq'] + self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq'] - self.failUnlessEqual(len(srf_30221['srf']), 1) - self.failUnlessEqual(len(qseq_42JUY_r1['qseq']), 1) - self.failUnlessEqual(len(qseq_42JUY_r2['qseq']), 1) - self.failUnlessEqual(len(qseq_61MJT['qseq']), 1) - self.failUnlessEqual(len(split_C02F9_r1['split_fastq']), 2) - self.failUnlessEqual(len(split_C02F9_r2['split_fastq']), 2) - - #print '-------needed targets---------' - #pprint(needed_targets) + self.assertEqual(len(srf_30221['srf']), 1) + self.assertEqual(len(qseq_42JUY_r1['qseq']), 1) + self.assertEqual(len(qseq_42JUY_r2['qseq']), 1) + self.assertEqual(len(qseq_61MJT['qseq']), 1) + self.assertEqual(len(split_C02F9_r1['split_fastq']), 2) + self.assertEqual(len(split_C02F9_r2['split_fastq']), 2) def test_generate_fastqs(self): - extract = condorfastq.CondorFastqExtract('host', - FAKE_APIDATA, - self.tempdir, - self.logdir) - extract.api = FakeApi() - commands = extract.build_condor_arguments(self.result_map) + commands = self.extract.build_condor_arguments(self.result_map) srf = commands['srf'] qseq = commands['qseq'] split = commands['split_fastq'] - self.failUnlessEqual(len(srf), 2) - self.failUnlessEqual(len(qseq), 3) - self.failUnlessEqual(len(split), 2) + self.assertEqual(len(srf), 2) + self.assertEqual(len(qseq), 3) + self.assertEqual(len(split), 4) srf_data = { - os.path.join(self.subname, '11154_30221AAXX_c33_l4.fastq'): { + os.path.join(self.result_map['11154'], + '11154_30221AAXX_c33_l4.fastq'): { 'mid': None, 'ispaired': False, 'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'], 'flowcell': u'30221AAXX', - 'target': os.path.join(self.subname, + 'target': os.path.join(self.result_map['11154'], u'11154_30221AAXX_c33_l4.fastq'), }, - os.path.join(self.subname, '11154_30DY0AAXX_c151_l8_r1.fastq'): { + os.path.join(self.result_map['11154'], + '11154_30DY0AAXX_c151_l8_r1.fastq'): { 'mid': None, 'ispaired': True, 'flowcell': u'30DY0AAXX', 'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'], 'mid': 76, 'target': - os.path.join(self.subname, + os.path.join(self.result_map['11154'], u'11154_30DY0AAXX_c151_l8_r1.fastq'), 'target_right': - os.path.join(self.subname, + os.path.join(self.result_map['11154'], u'11154_30DY0AAXX_c151_l8_r2.fastq'), } } for args in srf: expected = srf_data[args['target']] - self.failUnlessEqual(args['ispaired'], expected['ispaired']) - self.failUnlessEqual(len(args['sources']), 1) + self.assertEqual(args['ispaired'], expected['ispaired']) + self.assertEqual(len(args['sources']), 1) _, source_filename = os.path.split(args['sources'][0]) - self.failUnlessEqual(source_filename, expected['sources'][0]) - self.failUnlessEqual(args['target'], expected['target']) + self.assertEqual(source_filename, expected['sources'][0]) + self.assertEqual(args['target'], expected['target']) if args['ispaired']: - self.failUnlessEqual(args['target_right'], + self.assertEqual(args['target_right'], expected['target_right']) if 'mid' in expected: - self.failUnlessEqual(args['mid'], expected['mid']) + self.assertEqual(args['mid'], expected['mid']) qseq_data = { - os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r1.fastq'): { + os.path.join(self.result_map['11154'], + '11154_42JUYAAXX_c76_l5_r1.fastq'): { 'istar': True, 'ispaired': True, 'sources': [ u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'] }, - os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r2.fastq'): { + os.path.join(self.result_map['11154'], + '11154_42JUYAAXX_c76_l5_r2.fastq'): { 'istar': True, 'ispaired': True, 'sources': [ u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'] }, - os.path.join(self.subname, '11154_61MJTAAXX_c76_l6.fastq'): { + os.path.join(self.result_map['11154'], + '11154_61MJTAAXX_c76_l6.fastq'): { 'istar': True, 'ispaired': False, 'sources': [ @@ -311,11 +538,11 @@ class TestCondorFastq(unittest.TestCase): } for args in qseq: expected = qseq_data[args['target']] - self.failUnlessEqual(args['istar'], expected['istar']) - self.failUnlessEqual(args['ispaired'], expected['ispaired']) + self.assertEqual(args['istar'], expected['istar']) + self.assertEqual(args['ispaired'], expected['ispaired']) for i in range(len(expected['sources'])): _, filename = os.path.split(args['sources'][i]) - self.failUnlessEqual(filename, expected['sources'][i]) + self.assertEqual(filename, expected['sources'][i]) split_test = dict((( x['target'], x) for x in @@ -326,64 +553,82 @@ class TestCondorFastq(unittest.TestCase): {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz', u'11154_NoIndex_L003_R2_002.fastq.gz'], 'pyscript': 'desplit_fastq.pyc', - 'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}] + 'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}, + {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz', + u'12345_CGATGT_L003_R1_002.fastq.gz', + u'12345_CGATGT_L003_R1_003.fastq.gz', + ], + 'pyscript': 'desplit_fastq.pyc', + 'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'}, + {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz', + u'12345_CGATGT_L003_R2_002.fastq.gz', + u'12345_CGATGT_L003_R2_003.fastq.gz', + ], + 'pyscript': 'desplit_fastq.pyc', + 'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'} + ] )) for arg in split: _, target = os.path.split(arg['target']) pyscript = split_test[target]['pyscript'] - self.failUnless(arg['pyscript'].endswith(pyscript)) + self.assertTrue(arg['pyscript'].endswith(pyscript)) filename = split_test[target]['target'] - self.failUnless(arg['target'].endswith(filename)) + self.assertTrue(arg['target'].endswith(filename)) for s_index in range(len(arg['sources'])): s1 = arg['sources'][s_index] s2 = split_test[target]['sources'][s_index] - self.failUnless(s1.endswith(s2)) - - #print '-------commands---------' - #pprint (commands) + self.assertTrue(s1.endswith(s2)) def test_create_scripts(self): - os.chdir(self.tempdir) - extract = condorfastq.CondorFastqExtract('host', - FAKE_APIDATA, - self.tempdir, - self.logdir) - extract.api = FakeApi() - extract.create_scripts(self.result_map) - - self.failUnless(os.path.exists('srf.condor')) + self.extract.create_scripts(self.result_map) + + self.assertTrue(os.path.exists('srf.condor')) with open('srf.condor', 'r') as srf: arguments = [ l for l in srf if l.startswith('argument') ] arguments.sort() - self.failUnlessEqual(len(arguments), 2) - self.failUnless('--single sub-11154/11154_30221AAXX_c33_l4.fastq' + self.assertEqual(len(arguments), 2) + self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq' in arguments[0]) - self.failUnless( - '--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in + self.assertTrue( + 'sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in arguments[1]) - self.failUnless(os.path.exists('qseq.condor')) + self.assertTrue(os.path.exists('qseq.condor')) with open('qseq.condor', 'r') as srf: arguments = [ l for l in srf if l.startswith('argument') ] arguments.sort() - self.failUnlessEqual(len(arguments), 3) - self.failUnless('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in + self.assertEqual(len(arguments), 3) + self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in arguments[0]) - self.failUnless( + self.assertTrue( 'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in arguments[1]) - self.failUnless('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in + self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in arguments[2]) - self.failUnless(os.path.exists('split_fastq.condor')) + self.assertTrue(os.path.exists('split_fastq.condor')) with open('split_fastq.condor', 'r') as split: arguments = [ l for l in split if l.startswith('argument') ] arguments.sort() - self.failUnlessEqual(len(arguments), 2) - self.failUnless('11154_NoIndex_L003_R1_001.fastq.gz' in \ + self.assertEqual(len(arguments), 4) + # Lane 3 Read 1 + self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in \ arguments[0]) - self.failUnless('11154_NoIndex_L003_R2_002.fastq.gz' in \ + # Lane 3 Read 2 + self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in \ arguments[1]) + # Lane 3 Read 1 + self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2]) + self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2]) + self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2]) + self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2]) + + # Lane 3 Read 2 + self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3]) + self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3]) + self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3]) + self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3]) + def suite(): suite = unittest.makeSuite(TestCondorFastq, 'test') -- 2.30.2