From: Diane Trout
Date: Fri, 13 Feb 2009 01:51:58 +0000 (+0000)
Subject: Merge in the library list, detail, and results downloading feature from
X-Git-Tag: 0.2.0.1~24
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=442a9a820e75598efa4d8a638ad4bf3c34fe212c

Merge in the library list, detail, and results downloading feature from
the Caltech live site.

There are several components in the frontend tree to render the pages; in
addition, this adds some helper functions in pipelines.eland to simplify
computing summary statistics for an eland lane. I also needed to merge in
generator-based makebed code so the bed files built from the Django
database can be returned to the browser.

To use this, the settings file in this branch will need the variable
RESULT_HOME_DIR to be set.
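The RESULT_HOME_DIR requirement above is the only deployment step. A minimal
sketch of what that setting might look like (the path shown is purely
illustrative; the real value is site-specific):

    # in the frontend settings for this branch (hypothetical value)
    # RESULT_HOME_DIR must name the directory that contains one
    # subdirectory per flowcell id, each holding the C*-* result dirs.
    import os
    RESULT_HOME_DIR = os.path.expanduser('~/flowcell_results')

get_flowcell_result_dict() below globs os.path.join(RESULT_HOME_DIR,
flowcell_id), so the directory names must match the flowcell ids stored in
the database.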
---

diff --git a/htsworkflow/frontend/samples/results.py b/htsworkflow/frontend/samples/results.py
new file mode 100644
index 0000000..d7cb77e
--- /dev/null
+++ b/htsworkflow/frontend/samples/results.py
@@ -0,0 +1,134 @@
+from htsworkflow.frontend import settings
+
+import glob
+import os
+import re
+
+s_paren = re.compile("^\w+")
+
+def get_flowcell_result_dict(flowcell_id):
+    """
+    Returns a dictionary following this pattern for a given flowcell_id:
+
+    d['C1-33']['summary']           # Summary.htm file path
+    d['C1-33']['eland_results'][5]  # C1-33 lane 5 eland results file path
+    d['C1-33']['run_xml']           # run_*.xml file path
+    d['C1-33']['scores']            # scores.tar.gz file path
+    """
+    flowcell_id = flowcell_id.strip()
+
+    d = {}
+
+    ################################
+    # Flowcell directory
+    fc_dir = glob.glob(os.path.join(settings.RESULT_HOME_DIR, flowcell_id))
+
+    # Not found
+    if len(fc_dir) == 0:
+        return None
+
+    # No duplicates!
+    assert len(fc_dir) <= 1
+
+    # Found fc dir
+    fc_dir = fc_dir[0]
+
+    ################################
+    # C#-## dirs
+    c_dir_list = glob.glob(os.path.join(fc_dir, 'C*'))
+
+    # Not found
+    if len(c_dir_list) == 0:
+        return d
+
+    for c_dir_path in c_dir_list:
+        summary_file = glob.glob(os.path.join(c_dir_path, 'Summary.htm'))
+        pathdir, c_dir = os.path.split(c_dir_path)
+
+        # Create sub-dictionary
+        d[c_dir] = {}
+
+        ###############################
+        # Summary.htm file
+
+        # Not found
+        if len(summary_file) == 0:
+            d[c_dir]['summary'] = None
+
+        # Found
+        else:
+            # No duplicates!
+            assert len(summary_file) == 1
+
+            summary_file = summary_file[0]
+            d[c_dir]['summary'] = summary_file
+
+        ###############################
+        # Result files
+
+        d[c_dir]['eland_results'] = {}
+
+        result_filepaths = glob.glob(os.path.join(c_dir_path, 's_*_eland_result.txt*'))
+
+        for filepath in result_filepaths:
+            junk, result_name = os.path.split(filepath)
+
+            # lanes 1-8, single digit, therefore s_#; # == index 2
+            lane = int(result_name[2])
+            d[c_dir]['eland_results'][lane] = filepath
+
+        ###############################
+        # run*.xml file
+        run_xml_filepath = glob.glob(os.path.join(c_dir_path, 'run_*.xml'))
+
+        if len(run_xml_filepath) == 0:
+            d[c_dir]['run_xml'] = None
+        else:
+            # No duplicates
+            assert len(run_xml_filepath) == 1
+
+            d[c_dir]['run_xml'] = run_xml_filepath[0]
+
+        ###############################
+        # scores.tar.gz
+        scores_filepath = glob.glob(os.path.join(c_dir_path, 'scores*'))
+
+        if len(scores_filepath) == 0:
+            d[c_dir]['scores'] = None
+        else:
+            # No duplicates
+            assert len(scores_filepath) == 1
+
+            d[c_dir]['scores'] = scores_filepath[0]
+
+    return d
+
+
+def cn_mTobp(cn_m):
+    """
+    Converts CN-M (e.g. C1-33, C1-26, C4-28) cycle information into
+    number of base pairs.
+    """
+    pass
+
+
+def parse_flowcell_id(flowcell_id):
+    """
+    Return flowcell id and any status encoded in the id.
+
+    We stored the status information in the flowcell id name;
+    this was dumb, but database schemas are hard to update.
+    """
+    fields = flowcell_id.split()
+    fcid = None
+    status = None
+    if len(fields) > 0:
+        fcid = fields[0]
+    if len(fields) > 1:
+        status = fields[1]
+    return fcid, status
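The layout documented in get_flowcell_result_dict()'s docstring is easiest
to see in use; a minimal sketch (the flowcell id and status are invented for
illustration):

    from htsworkflow.frontend.samples.results import \
         get_flowcell_result_dict, parse_flowcell_id

    fcid, status = parse_flowcell_id('FC12150 failed')  # -> ('FC12150', 'failed')
    d = get_flowcell_result_dict(fcid)
    if d is not None:
        for cycle_width, files in d.items():
            print cycle_width, files['summary'], files['run_xml']
            print files['eland_results'].get(5)  # lane 5 path, or None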
diff --git a/htsworkflow/frontend/samples/views.py b/htsworkflow/frontend/samples/views.py
index 2299e4f..4763f39 100644
--- a/htsworkflow/frontend/samples/views.py
+++ b/htsworkflow/frontend/samples/views.py
@@ -1 +1,333 @@
-# Create your views here.
\ No newline at end of file
+# Create your views here.
+from htsworkflow.frontend.samples.models import Library
+from htsworkflow.frontend.samples.results import get_flowcell_result_dict, parse_flowcell_id
+from htsworkflow.pipelines.runfolder import load_pipeline_run_xml
+from htsworkflow.pipelines import runfolder
+from htsworkflow.frontend import settings
+from htsworkflow.util import makebed
+from htsworkflow.util import opener
+
+from django.http import HttpResponse
+from django.template.loader import get_template
+from django.template import Context
+
+import StringIO
+import logging
+import os
+
+LANE_LIST = [1,2,3,4,5,6,7,8]
+
+def create_library_list():
+    """
+    Create a list of libraries that includes how many lanes were run.
+    """
+    library_list = []
+    for lib in Library.objects.all():
+        summary = {}
+        summary['library_id'] = lib.library_id
+        summary['library_name'] = lib.library_name
+        summary['species_name'] = lib.library_species.scientific_name
+        lanes_run = 0
+        for lane_id in LANE_LIST:
+            lane = getattr(lib, 'lane_%d_library' % (lane_id,))
+            lanes_run += len( lane.all() )
+        summary['lanes_run'] = lanes_run
+        library_list.append(summary)
+    return library_list
+
+def library(request):
+    library_list = create_library_list()
+    t = get_template('samples/library_index.html')
+    c = Context({'library_list': library_list })
+    return HttpResponse( t.render(c) )
+
+def library_to_flowcells(request, lib_id):
+    """
+    Display information about all the flowcells a library has been run on.
+    """
+    t = get_template("samples/library_detail.html")
+
+    try:
+        lib = Library.objects.get(library_id=lib_id)
+    except:
+        return HttpResponse("Library %s does not exist" % (lib_id))
+
+    output = []
+
+    output.append('Library ID: %s' % (lib.library_id))
+    output.append('Name: %s' % (lib.library_name))
+    output.append('Species: %s' % (lib.library_species.scientific_name))
+    output.append('')
+
+    output.append('FLOWCELL - LANE:')
+
+    output.extend([ '%s - Lane 1 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 1)) for fc in lib.lane_1_library.all() ])
+    output.extend([ '%s - Lane 2 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 2)) for fc in lib.lane_2_library.all() ])
+    output.extend([ '%s - Lane 3 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 3)) for fc in lib.lane_3_library.all() ])
+    output.extend([ '%s - Lane 4 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 4)) for fc in lib.lane_4_library.all() ])
+    output.extend([ '%s - Lane 5 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 5)) for fc in lib.lane_5_library.all() ])
+    output.extend([ '%s - Lane 6 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 6)) for fc in lib.lane_6_library.all() ])
+    output.extend([ '%s - Lane 7 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 7)) for fc in lib.lane_7_library.all() ])
+    output.extend([ '%s - Lane 8 %s' % (fc.flowcell_id, _files(fc.flowcell_id, 8)) for fc in lib.lane_8_library.all() ])
+
+    record_count = lib.lane_1_library.count() + \
+                   lib.lane_2_library.count() + \
+                   lib.lane_3_library.count() + \
+                   lib.lane_4_library.count() + \
+                   lib.lane_5_library.count() + \
+                   lib.lane_6_library.count() + \
+                   lib.lane_7_library.count() + \
+                   lib.lane_8_library.count()
+
+    flowcell_list = []
+    flowcell_list.extend([ (fc.flowcell_id, 1) for fc in lib.lane_1_library.all() ])
+    flowcell_list.extend([ (fc.flowcell_id, 2) for fc in lib.lane_2_library.all() ])
+    flowcell_list.extend([ (fc.flowcell_id, 3) for fc in lib.lane_3_library.all() ])
+    flowcell_list.extend([ (fc.flowcell_id, 4) for fc in lib.lane_4_library.all() ])
+    flowcell_list.extend([ (fc.flowcell_id, 5) for fc in lib.lane_5_library.all() ])
+    flowcell_list.extend([ (fc.flowcell_id, 6) for fc in lib.lane_6_library.all() ])
+    flowcell_list.extend([ (fc.flowcell_id, 7) for fc in lib.lane_7_library.all() ])
+    flowcell_list.extend([ (fc.flowcell_id, 8) for fc in lib.lane_8_library.all() ])
+    flowcell_list.sort()
+
+    lane_summary_list = []
+    for fc, lane in flowcell_list:
+        lane_summary, err_list = _summary_stats(fc, lane)
+
+        lane_summary_list.extend(lane_summary)
+
+        for err in err_list:
+            output.append(err)
+
+    output.append('<br/>')
+    output.append(t.render(Context({'lane_summary_list': lane_summary_list})))
+    output.append('<br/>')
+
+    if record_count == 0:
+        output.append("None Found")
+
+    return HttpResponse('<br/>\n'.join(output))
+
+
+def summaryhtm_fc_cnm(request, fc_id, cnm):
+    """
+    Returns a Summary.htm file if it exists.
+    """
+    fc_id, status = parse_flowcell_id(fc_id)
+    d = get_flowcell_result_dict(fc_id)
+
+    if d is None:
+        return HttpResponse('Results for Flowcell %s not found.' % (fc_id))
+
+    if cnm not in d:
+        return HttpResponse('Results for Flowcell %s; %s not found.' % (fc_id, cnm))
+
+    summary_filepath = d[cnm]['summary']
+
+    if summary_filepath is None:
+        return HttpResponse('Summary.htm for Flowcell %s; %s not found.' % (fc_id, cnm))
+
+    f = open(summary_filepath, 'r')
+
+    return HttpResponse(f)
+
+
+def result_fc_cnm_eland_lane(request, fc_id, cnm, lane):
+    """
+    Returns an eland results file for the given flowcell, CN-M, and lane.
+    """
+    fc_id, status = parse_flowcell_id(fc_id)
+    d = get_flowcell_result_dict(fc_id)
+
+    if d is None:
+        return HttpResponse('Results for Flowcell %s not found.' % (fc_id))
+
+    if cnm not in d:
+        return HttpResponse('Results for Flowcell %s; %s not found.' % (fc_id, cnm))
+
+    erd = d[cnm]['eland_results']
+    lane = int(lane)
+
+    if lane not in erd:
+        return HttpResponse('Results for Flowcell %s; %s; lane %s not found.' % (fc_id, cnm, lane))
+
+    filepath = erd[lane]
+
+    f = opener.autoopen(filepath, 'r')
+
+    return HttpResponse(f, mimetype="application/x-elandresult")
+
+
+def bedfile_fc_cnm_eland_lane_ucsc(request, fc_id, cnm, lane):
+    """
+    Returns a bed file for a given flowcell, CN-M (e.g. C1-33), and lane (UCSC compatible).
+    """
+    return bedfile_fc_cnm_eland_lane(request, fc_id, cnm, lane, ucsc_compatible=True)
+
+
+def bedfile_fc_cnm_eland_lane(request, fc_id, cnm, lane, ucsc_compatible=False):
+    """
+    Returns a bed file for a given flowcell, CN-M (e.g. C1-33), and lane.
+    """
+    fc_id, status = parse_flowcell_id(fc_id)
+    d = get_flowcell_result_dict(fc_id)
+
+    if d is None:
+        return HttpResponse('Results for Flowcell %s not found.' % (fc_id))
+
+    if cnm not in d:
+        return HttpResponse('Results for Flowcell %s; %s not found.' % (fc_id, cnm))
+
+    erd = d[cnm]['eland_results']
+    lane = int(lane)
+
+    if lane not in erd:
+        return HttpResponse('Results for Flowcell %s; %s; lane %s not found.' % (fc_id, cnm, lane))
+
+    filepath = erd[lane]
+
+    # Eland result file
+    fi = opener.autoopen(filepath, 'r')
+    # output memory file
+
+    name, description = makebed.make_description( fc_id, lane )
+
+    bedgen = makebed.make_bed_from_eland_generator(fi, name, description)
+
+    if ucsc_compatible:
+        return HttpResponse(bedgen)
+    else:
+        return HttpResponse(bedgen, mimetype="application/x-bedfile")
+
+
+def _summary_stats(flowcell_id, lane_id):
+    """
+    Return the summary statistics for a given flowcell, lane, and end.
+    """
+    fc_id, status = parse_flowcell_id(flowcell_id)
+    fc_result_dict = get_flowcell_result_dict(fc_id)
+
+    summary_list = []
+    err_list = []
+
+    if fc_result_dict is None:
+        err_list.append('Results for Flowcell %s not found.' % (fc_id))
+        return (summary_list, err_list)
+
+    for cycle_width in fc_result_dict:
+        xmlpath = fc_result_dict[cycle_width]['run_xml']
+
+        if xmlpath is None:
+            err_list.append('Run xml for Flowcell %s(%s) not found.'
+                            % (fc_id, cycle_width))
+            continue
+
+        try:
+            run = load_pipeline_run_xml(xmlpath)
+            gerald_summary = run.gerald.summary.lane_results
+            for end in range(len(gerald_summary)):
+                eland_summary = run.gerald.eland_results.results[end][lane_id]
+                # add information to lane_summary
+                eland_summary.flowcell_id = flowcell_id
+                eland_summary.clusters = gerald_summary[end][lane_id].cluster
+                eland_summary.cycle_width = cycle_width
+                eland_summary.summarized_reads = runfolder.summarize_mapped_reads(eland_summary.mapped_reads)
+                summary_list.append(eland_summary)
+
+        except Exception, e:
+            summary_list.append("Summary report needs to be updated.")
+            logging.error("Exception: " + str(e))
+
+    return (summary_list, err_list)
+
+def _summary_stats_old(flowcell_id, lane):
+    """
+    Return a dictionary of summary stats for a given flowcell_id & lane.
+    """
+    fc_id, status = parse_flowcell_id(flowcell_id)
+    fc_result_dict = get_flowcell_result_dict(fc_id)
+
+    dict_list = []
+    err_list = []
+    summary_list = []
+
+    if fc_result_dict is None:
+        err_list.append('Results for Flowcell %s not found.' % (fc_id))
+        return (dict_list, err_list, summary_list)
+
+    for cnm in fc_result_dict:
+
+        xmlpath = fc_result_dict[cnm]['run_xml']
+
+        if xmlpath is None:
+            err_list.append('Run xml for Flowcell %s(%s) not found.' % (fc_id, cnm))
+            continue
+
+        tree = ElementTree.parse(xmlpath).getroot()
+        results = runfolder.PipelineRun(pathname='', xml=tree)
+        try:
+            lane_report = runfolder.summarize_lane(results.gerald, lane)
+            summary_list.append(os.linesep.join(lane_report))
+        except Exception, e:
+            summary_list.append("Summary report needs to be updated.")
+            logging.error("Exception: " + str(e))
+
+        print "----------------------------------"
+        print "-- DOES NOT SUPPORT PAIRED END ---"
+        print "----------------------------------"
+        lane_results = results.gerald.summary[0][lane]
+        lrs = lane_results
+
+        d = {}
+
+        d['average_alignment_score'] = lrs.average_alignment_score
+        d['average_first_cycle_intensity'] = lrs.average_first_cycle_intensity
+        d['cluster'] = lrs.cluster
+        d['lane'] = lrs.lane
+        d['flowcell'] = flowcell_id
+        d['cnm'] = cnm
+        d['percent_error_rate'] = lrs.percent_error_rate
+        d['percent_intensity_after_20_cycles'] = lrs.percent_intensity_after_20_cycles
+        d['percent_pass_filter_align'] = lrs.percent_pass_filter_align
+        d['percent_pass_filter_clusters'] = lrs.percent_pass_filter_clusters
+
+        #FIXME: function finished, but to take advantage of it we
+        #  may need to take in a list of lanes so we only have to
+        #  load the xml file once per flowcell rather than once
+        #  per lane.
+        dict_list.append(d)
+
+    return (dict_list, err_list, summary_list)
+
+
+
+
+def _files(flowcell_id, lane):
+    """
+    Sets up available files for download.
+    """
+    flowcell_id, id = parse_flowcell_id(flowcell_id)
+    d = get_flowcell_result_dict(flowcell_id)
+
+    if d is None:
+        return ''
+
+    output = []
+
+    # c_name == 'CN-M' (e.g. C1-33)
+    for c_name in d:
+
+        if d[c_name]['summary'] is not None:
+            output.append('<a href="/results/%s/%s/summary/">summary(%s)</a>' \
+                          % (flowcell_id, c_name, c_name))
+
+        erd = d[c_name]['eland_results']
+
+        if int(lane) in erd:
+            output.append('<a href="/results/%s/%s/eland_result/%s">eland_result(%s)</a>' % (flowcell_id, c_name, lane, c_name))
+            output.append('<a href="/results/%s/%s/bedfile/%s">bedfile(%s)</a>' % (flowcell_id, c_name, lane, c_name))
+
+    if len(output) == 0:
+        return ''
+
+    return '(' + '|'.join(output) + ')'
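Most of the heavy lifting in the library detail page happens in
_summary_stats(), which loads each run_*.xml, decorates the per-lane eland
summary objects, and hands them to the template. A rough sketch of calling
it directly (the flowcell id is hypothetical, and _summary_stats is
module-private, so this is for illustration only):

    from htsworkflow.frontend.samples.views import _summary_stats

    lane_summary, errors = _summary_stats('FC12150', 5)
    for entry in lane_summary:
        # entries are eland summary objects unless the report needs
        # updating, in which case a warning string is appended instead
        print entry.cycle_width, entry.flowcell_id
        print entry.unique_reads, entry.repeat_reads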
diff --git a/htsworkflow/frontend/settings.py b/htsworkflow/frontend/settings.py
index 76aa1f3..a988694 100644
--- a/htsworkflow/frontend/settings.py
+++ b/htsworkflow/frontend/settings.py
@@ -27,6 +27,9 @@ The options understood by this module are (with their defaults):
 import ConfigParser
 import os
 
+# make epydoc happy
+__docformat__ = "restructuredtext en"
+
 def options_to_list(dest, section_name):
     """
     Load a options from section_name and store in a dictionary
@@ -140,6 +143,7 @@ INSTALLED_APPS = (
     'django.contrib.admin',
     'django.contrib.auth',
     'django.contrib.contenttypes',
+    'django.contrib.humanize',
     'django.contrib.sessions',
     'django.contrib.sites',
     'htsworkflow.frontend.eland_config',
diff --git a/htsworkflow/frontend/templates/samples/library_detail.html b/htsworkflow/frontend/templates/samples/library_detail.html
new file mode 100644
index 0000000..3533477
--- /dev/null
+++ b/htsworkflow/frontend/templates/samples/library_detail.html
@@ -0,0 +1,99 @@
+{% load humanize %}
+
+<html>
+<head>
+</head>
+<body>
+
+{% block summary_stats %}
+<table border="1">
+ <thead>
+  <tr>
+   <td colspan="6"></td>
+   <td colspan="2">No Match</td>
+   <td colspan="2">QC Failed</td>
+   <td colspan="4">Unique</td>
+   <td colspan="4">Repeat</td>
+  </tr>
+  <tr>
+   <td>Cycles</td>
+   <td>Flowcell</td>
+   <td>Lane</td>
+   <td>End</td>
+   <td>Cluster / Tile</td>
+   <td>Raw Reads</td>
+   <td>total</td>
+   <td>%</td>
+   <td>total</td>
+   <td>%</td>
+   <td>0 mismatch</td>
+   <td>1 mismatch</td>
+   <td>2 mismatch</td>
+   <td>Total</td>
+   <td>0 mismatch</td>
+   <td>1 mismatch</td>
+   <td>2 mismatch</td>
+   <td>Total</td>
+  </tr>
+ </thead>
+ <tbody>
+  {% for lane in lane_summary_list %}
+  <tr>
+   <td>{{ lane.cycle_width }}</td>
+   <td>{{ lane.flowcell_id }}</td>
+   <td>{{ lane.lane_id }}</td>
+   <td>{% if lane.end %}{{ lane.end }}{% endif %}</td>
+   <td>{{ lane.clusters.0|intcomma }}</td>
+   <td>{{ lane.reads|intcomma }}</td>
+   <td>{{ lane.no_match|intcomma }}</td>
+   <td>{{ lane.no_match_percent|stringformat:".2f" }}</td>
+   <td>{{ lane.qc_failed|intcomma }}</td>
+   <td>{{ lane.qc_failed_percent|stringformat:".2f" }}</td>
+   <td>{{ lane.match_codes.U0|intcomma }}</td>
+   <td>{{ lane.match_codes.U1|intcomma }}</td>
+   <td>{{ lane.match_codes.U2|intcomma }}</td>
+   <td>{{ lane.unique_reads|intcomma }}</td>
+   <td>{{ lane.match_codes.R0|intcomma }}</td>
+   <td>{{ lane.match_codes.R1|intcomma }}</td>
+   <td>{{ lane.match_codes.R2|intcomma }}</td>
+   <td>{{ lane.repeat_reads|intcomma }}</td>
+  </tr>
+  {% endfor %}
+ </tbody>
+</table>
+
+{% for lane in lane_summary_list %}
+<h2>
+ {{lane.cycle_width}} {{ lane.flowcell_id }} lane {{ lane.lane_id }}
+ {% if lane.end %} end {{ lane.end }}{% endif %}
+</h2>
+
+{% endfor %}
+{% endblock %}
diff --git a/htsworkflow/frontend/templates/samples/library_index.html b/htsworkflow/frontend/templates/samples/library_index.html
new file mode 100644
index 0000000..19e30c7
--- /dev/null
+++ b/htsworkflow/frontend/templates/samples/library_index.html
@@ -0,0 +1,46 @@
+<html>
+<head>
+</head>
+<body>
+
+{% block summary_stats %}
+<table border="1">
+ <thead>
+  <tr>
+   <td>Library ID</td>
+   <td>Species</td>
+   <td>Library Name</td>
+   <td>Total Lanes</td>
+  </tr>
+ </thead>
+ <tbody>
+  {% for lib in library_list %}
+  <tr>
+   <td>{{ lib.library_id }}</td>
+   <td>{{ lib.species_name }}</td>
+   <td>{{ lib.library_name }}</td>
+   <td>{{ lib.lanes_run }}</td>
+  </tr>
+  {% endfor %}
+ </tbody>
+</table>
+{% endblock %}
diff --git a/htsworkflow/frontend/urls.py b/htsworkflow/frontend/urls.py
index b8b9e5e..ccc31b3 100644
--- a/htsworkflow/frontend/urls.py
+++ b/htsworkflow/frontend/urls.py
@@ -19,6 +19,19 @@ urlpatterns = patterns('',
     #(r'^analysis/', include('htsworkflow.frontend.analysis.urls')),
     # Report Views:
     (r'^reports/', include('htsworkflow.frontend.reports.urls')),
+    # Library browser
+    (r'^library/$', 'htsworkflow.frontend.samples.views.library'),
+    (r'^library/(?P<lib_id>\w+)/$',
+        'htsworkflow.frontend.samples.views.library_to_flowcells'),
+    # Raw result files
+    (r'^results/(?P<fc_id>\w+)/(?P<cnm>C[1-9]-[0-9]+)/summary/',
+        'htsworkflow.frontend.samples.views.summaryhtm_fc_cnm'),
+    (r'^results/(?P<fc_id>\w+)/(?P<cnm>C[1-9]-[0-9]+)/eland_result/(?P<lane>[1-8])',
+        'htsworkflow.frontend.samples.views.result_fc_cnm_eland_lane'),
+    (r'^results/(?P<fc_id>\w+)/(?P<cnm>C[1-9]-[0-9]+)/bedfile/(?P<lane>[1-8])/ucsc',
+        'htsworkflow.frontend.samples.views.bedfile_fc_cnm_eland_lane_ucsc'),
+    (r'^results/(?P<fc_id>\w+)/(?P<cnm>C[1-9]-[0-9]+)/bedfile/(?P<lane>[1-8])',
+        'htsworkflow.frontend.samples.views.bedfile_fc_cnm_eland_lane'),
 
     # databrowser
     #(r'^databrowse/(.*)', databrowse.site.root)
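Concretely, the routes above produce URLs of the following shapes (the
flowcell id, library id, and lane here are invented for illustration):

    /library/                              library index
    /library/SL1234/                       per-library detail page
    /results/FC12150/C1-33/summary/        Summary.htm for the C1-33 analysis
    /results/FC12150/C1-33/eland_result/5  eland result file for lane 5
    /results/FC12150/C1-33/bedfile/5       bed file for lane 5
    /results/FC12150/C1-33/bedfile/5/ucsc  same, without the bed mimetype, for UCSC

Note that the /ucsc pattern is listed before the plain bedfile pattern;
since the bedfile pattern is not anchored with $, the more specific route
has to be tried first.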
diff --git a/htsworkflow/pipelines/eland.py b/htsworkflow/pipelines/eland.py
index 05563a1..40507e6 100644
--- a/htsworkflow/pipelines/eland.py
+++ b/htsworkflow/pipelines/eland.py
@@ -194,6 +194,50 @@ class ElandLane(object):
         return self._match_codes
     match_codes = property(_get_match_codes)
 
+    def _get_no_match(self):
+        if self._mapped_reads is None:
+            self._update()
+        return self._match_codes['NM']
+    no_match = property(_get_no_match,
+                        doc="total reads that didn't match the target genome.")
+
+    def _get_no_match_percent(self):
+        return float(self.no_match)/self.reads * 100
+    no_match_percent = property(_get_no_match_percent,
+                                doc="no match reads as percent of total")
+
+    def _get_qc_failed(self):
+        if self._mapped_reads is None:
+            self._update()
+        return self._match_codes['QC']
+    qc_failed = property(_get_qc_failed,
+                         doc="total reads that failed quality control.")
+
+    def _get_qc_failed_percent(self):
+        return float(self.qc_failed)/self.reads * 100
+    qc_failed_percent = property(_get_qc_failed_percent,
+                                 doc="QC failed reads as percent of total")
+
+    def _get_unique_reads(self):
+        if self._mapped_reads is None:
+            self._update()
+        sum = 0
+        for code in ['U0','U1','U2']:
+            sum += self._match_codes[code]
+        return sum
+    unique_reads = property(_get_unique_reads,
+                            doc="total unique reads")
+
+    def _get_repeat_reads(self):
+        if self._mapped_reads is None:
+            self._update()
+        sum = 0
+        for code in ['R0','R1','R2']:
+            sum += self._match_codes[code]
+        return sum
+    repeat_reads = property(_get_repeat_reads,
+                            doc="total repeat reads")
+
     def get_elements(self):
         lane = ElementTree.Element(ElandLane.LANE,
                                    {'version':
diff --git a/htsworkflow/util/makebed.py b/htsworkflow/util/makebed.py
index eeb4e88..cb93163 100755
--- a/htsworkflow/util/makebed.py
+++ b/htsworkflow/util/makebed.py
@@ -4,11 +4,13 @@ Utility functions to make bedfiles.
 import os
 import re
 
+__docformat__ = "restructuredtext en"
+
 # map eland_result.txt sense
 sense_map = { 'F': '+', 'R': '-'}
 sense_color = { 'F': '0,0,255', 'R': '255,255,0' }
 
-def write_bed_header(outstream, name, description):
+def create_bed_header(name, description):
     """
     Produce the headerline for a bedfile
     """
@@ -17,11 +19,33 @@ def write_bed_header(outstream, name, description):
     if description is None:
         description = "eland result file"
     bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"'
     bed_header += os.linesep
-    outstream.write(bed_header % (name, description))
+    return bed_header % (name, description)
 
 def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'):
     """
     read an eland result file from instream and write a bedfile to outstream
+
+    :Parameters:
+      - `instream`: stream containing the output from eland
+      - `outstream`: stream to write the bed file to
+      - `name`: name of bed-file (must be unique)
+      - `description`: longer description of the bed file
+      - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
     """
+    for line in make_bed_from_eland_generator(instream, name, description, chromosome_prefix):
+        outstream.write(line)
+
+def make_bed_from_eland_generator(instream, name, description, chromosome_prefix='chr'):
+    """
+    read an eland result file from instream and yield the lines of a bedfile
+
+    :Parameters:
+      - `instream`: stream containing the output from eland
+      - `name`: name of bed-file (must be unique)
+      - `description`: longer description of the bed file
+      - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
+
+    :Return: generator which yields lines of bedfile
+    """
     # indexes into fields in eland_result.txt file
     SEQ = 1
@@ -29,7 +53,7 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom
     START = 7
     SENSE = 8
 
-    write_bed_header(outstream, name, description)
+    yield create_bed_header(name, description)
     prefix_len = len(chromosome_prefix)
 
     for line in instream:
@@ -42,15 +66,14 @@ def make_bed_from_eland_stream(instream, outstream, name, description, chromosom
             # strip off filename extension
             chromosome = fields[CHR].split('.')[0]
 
-            outstream.write('%s %s %d read 0 %s - - %s%s' % (
+            yield '%s %s %d read 0 %s - - %s%s' % (
                 chromosome,
                 start,
                 stop,
                 sense_map[fields[SENSE]],
                 sense_color[fields[SENSE]],
                 os.linesep
-            ))
-
+            )
 
 def make_bed_from_multi_eland_stream(
     instream,
@@ -61,17 +84,25 @@ def make_bed_from_multi_eland_stream(
     max_reads=255
 ):
     """
-    read a multi eland stream and write a bedfile
+    read a multi eland result file from instream and write the bedfile to outstream
+
+    :Parameters:
+      - `instream`: stream containing the output from eland
+      - `outstream`: stream to write the bed file to
+      - `name`: name of bed-file (must be unique)
+      - `description`: longer description of the bed file
+      - `chromosome_prefix`: restrict output lines to fasta records that start with this pattern
+      - `max_reads`: maximum number of reads to write to bed stream
     """
-    write_bed_header(outstream, name, description)
-    parse_multi_eland(instream, outstream, chr_prefix, max_reads)
-
-def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
+    for lane in make_bed_from_multi_eland_generator(instream, name, description, chr_prefix, max_reads):
+        outstream.write(lane)
+
+def make_bed_from_multi_eland_generator(instream, name, description, chr_prefix, max_reads=255):
     loc_pattern = '(?P<loc>(?P<start>[0-9]+)(?P<dir>[FR])(?P<count>[0-9]+))'
     other_pattern = '(?P<other>[^:,]+)'
     split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern))
+    yield create_bed_header(name, description)
     for line in instream:
         rec = line.split()
         if len(rec) > 3:
@@ -110,35 +141,30 @@ def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255):
             if reported_reads <= max_reads:
                 for cur_chr, start, stop, strand, color in read_list:
                     reported_reads += 1
-                    outstream.write('%s %d %d read 0 %s - - %s%s' % (
+                    yield '%s %d %d read 0 %s - - %s%s' % (
                         cur_chr,
                         start,
                         stop,
                         sense_map[orientation],
                         sense_color[orientation],
                         os.linesep
-                    ))
+                    )
 
-def make_description(database, flowcell_id, lane):
+def make_description(flowcell_id, lane):
     """
-    compute a bedfile name and description from the fctracker database
+    compute a bedfile name and description from the django database
     """
-    from htsworkflow.util.fctracker import fctracker
+    from htsworkflow.frontend.experiments import models as experiments
 
-    fc = fctracker(database)
-    cells = fc._get_flowcells("where flowcell_id='%s'" % (flowcell_id))
-    if len(cells) != 1:
-        raise RuntimeError("couldn't find flowcell id %s" % (flowcell_id))
     lane = int(lane)
     if lane < 1 or lane > 8:
         raise RuntimeError("flowcells only have lanes 1-8")
 
-    name = "%s-%s" % (flowcell_id, lane)
+    cell = experiments.FlowCell.objects.get(flowcell_id=flowcell_id)
 
-    cell_id, cell = cells.items()[0]
-    assert cell_id == flowcell_id
+    name = "%s-%s" % (flowcell_id, lane)
 
-    cell_library_id = cell['lane_%d_library_id' %(lane,)]
-    cell_library = cell['lane_%d_library' %(lane,)]
-    description = "%s-%s" % (cell_library['library_name'], cell_library_id)
+    cell_library = getattr(cell, 'lane_%d_library' %(lane,))
+    cell_library_id = cell_library.library_id
+    description = "%s-%s" % (cell_library.library_name, cell_library_id)
     return name, description
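With the generator variants in place, a bed file can be streamed straight
into an HttpResponse (as bedfile_fc_cnm_eland_lane does in views.py above)
instead of being buffered. A minimal stand-alone sketch, reusing a
multi-eland record from the tests below:

    from StringIO import StringIO
    from htsworkflow.util import makebed

    eland = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 '
                     'TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 '
                     'mm9_chr13_random.fa:1240R0')
    for line in makebed.make_bed_from_multi_eland_generator(
            eland, 'name', 'description', 'mm9_chr', 1):
        print line,
    # first yields the track header, then:
    # mm9_chr13_random 1240 1272 read 0 - - - 255,255,0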
diff --git a/htsworkflow/util/test/test_makebed.py b/htsworkflow/util/test/test_makebed.py
index 03c7919..b5d3026 100644
--- a/htsworkflow/util/test/test_makebed.py
+++ b/htsworkflow/util/test/test_makebed.py
@@ -9,38 +9,42 @@ class testMakeBed(unittest.TestCase):
         instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0')
         out = StringIO()
 
-        makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
-        self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+        out = list(makebed.make_bed_from_multi_eland_generator(instream, 'name', 'description', 'mm9_chr', 1))
+        self.failUnlessEqual(out[1], 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
 
     def test_multi_1_0_0_limit_255(self):
         instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0')
         out = StringIO()
 
-        makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
-        self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+        out = list(makebed.make_bed_from_multi_eland_generator(instream, 'name', 'desc', 'mm9_chr', 255))
+        self.failUnlessEqual(out[1], 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
 
     def test_multi_2_0_0_limit_1(self):
         instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
         out = StringIO()
 
-        makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
-        self.failUnlessEqual(out.len, 0)
+        out = list(makebed.make_bed_from_multi_eland_generator(instream, 'name', 'desc', 'mm9_chr', 1))
+        self.failUnlessEqual(len(out), 1)
 
     def test_multi_2_0_0_limit_255(self):
         instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
         out = StringIO()
 
-        makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
-        self.failUnlessEqual(out.len, 98)
+        out = list(makebed.make_bed_from_multi_eland_generator(instream, 'name', 'desc', 'mm9_chr', 255))
+        self.failUnlessEqual(len(out), 3)
+        self.failUnlessEqual(out[1],
+            'mm9_chr4 42995432 42995464 read 0 + - - 0,0,255\n')
+        self.failUnlessEqual(out[2],
+            'mm9_chrX 101541458 101541490 read 0 + - - 0,0,255\n')
 
     def test_multi_0_2_0_limit_1(self):
         instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:115:495 TCTCCCTGAAAAATANAAGTGNTGTTGGTGAG 0:2:1 mm9_chr14.fa:104434729F2,mm9_chr16.fa:63263818R1,mm9_chr2.fa:52265438R1')
         out = StringIO()
 
-        makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
-        print out.getvalue()
-        self.failUnlessEqual(out.len, 0)
+        out = list(makebed.make_bed_from_multi_eland_generator(instream, 'name', 'desc', 'mm9_chr', 1))
+        print out
+        self.failUnlessEqual(len(out), 1)
 
 def suite():
     return unittest.makeSuite(testMakeBed, 'test')