From: Diane Trout
Date: Fri, 1 Feb 2013 01:04:58 +0000 (-0800)
Subject: Preliminary implementation of trackDb generation.
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=293258a1743da0bd6ad8a39b0eaddd5575617fbc

Preliminary implementation of trackDb generation.

This is super preliminary; important parts of the report are hard-coded
instead of being detected properly.
---

diff --git a/encode_submission/trackhub.py b/encode_submission/trackhub.py
new file mode 100644
index 0000000..4ee2061
--- /dev/null
+++ b/encode_submission/trackhub.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+"""Create a track hub
+"""
+
+from ConfigParser import SafeConfigParser
+import fnmatch
+from glob import glob
+import json
+import logging
+import netrc
+from optparse import OptionParser, OptionGroup
+import os
+from pprint import pprint, pformat
+import shlex
+from StringIO import StringIO
+import stat
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+from zipfile import ZipFile
+
+import RDF
+
+if 'DJANGO_SETTINGS_MODULE' not in os.environ:
+    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
+
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+    dafTermOntology, \
+    fromTypedNode, \
+    get_model, \
+    get_serializer, \
+    load_into_model, \
+    sparql_query, \
+    submissionOntology
+from htsworkflow.submission.daf import get_submission_uri
+from htsworkflow.submission.results import ResultMap
+from htsworkflow.submission.trackhub import TrackHubSubmission
+from htsworkflow.submission.condorfastq import CondorFastqExtract
+
+logger = logging.getLogger(__name__)
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+    submission_uri = None
+
+    if opts.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.WARNING)
+
+    apidata = api.make_auth_from_opts(opts, parser)
+
+    model = get_model(opts.model, opts.db_path)
+    mapper = None
+    if opts.name:
+        mapper = TrackHubSubmission(opts.name, model, host=opts.host)
+        submission_uri = get_submission_uri(opts.name)
+
+    if opts.load_rdf is not None:
+        if submission_uri is None:
+            parser.error("Please specify the submission name")
+        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
+
+    results = ResultMap()
+    for a in args:
+        if os.path.exists(a):
+            results.add_results_from_file(a)
+        else:
+            logger.warn("File %s doesn't exist.", a)
+
+    if opts.make_tree_from is not None:
+        results.make_tree_from(opts.make_tree_from)
+
+    if opts.fastq:
+        logger.info("Building fastq extraction scripts")
+        flowcells = os.path.join(opts.sequence, 'flowcells')
+        extractor = CondorFastqExtract(opts.host, flowcells,
+                                       model=opts.model,
+                                       force=opts.force)
+        extractor.create_scripts(results)
+
+    if opts.scan_submission:
+        if opts.name is None:
+            parser.error("Please define a submission name")
+        mapper.scan_submission_dirs(results)
+
+    if opts.make_hub:
+        mapper.make_hub(results)
+
+    if opts.sparql:
+        sparql_query(model, opts.sparql)
+
+    if opts.print_rdf:
+        writer = get_serializer()
+        print writer.serialize_model_to_string(model)
+
+
+def make_parser():
+    parser = OptionParser()
+
+    model = OptionGroup(parser, 'model')
+    model.add_option('--name', help="Set submission name")
+    model.add_option('--db-path', default=None,
+                     help="set rdf database path")
+    model.add_option('--model', default=None,
+                     help="Load model database")
+    model.add_option('--load-rdf', default=None,
+                     help="load rdf statements into model")
+    model.add_option('--sparql', default=None, help="execute sparql query")
+    model.add_option('--print-rdf', action="store_true", default=False,
+                     help="print ending model state")
+    parser.add_option_group(model)
+    # commands
+    commands = OptionGroup(parser, 'commands')
+    commands.add_option('--make-tree-from',
+                        help="create directories & link data files",
+                        default=None)
+    commands.add_option('--fastq', default=False, action="store_true",
+                        help="generate scripts for making fastq files")
+    commands.add_option('--scan-submission', default=False, action="store_true",
+                        help="Import metadata for submission into our model")
+    commands.add_option('--make-hub', help='make the hub file', default=False,
+                        action="store_true")
+
+    parser.add_option_group(commands)
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="Force regenerating fastqs")
+    parser.add_option('--daf', default=None, help='specify daf name')
+    parser.add_option('--library-url', default=None,
+                      help="specify an alternate source for library information")
+    # debugging
+    parser.add_option('--verbose', default=False, action="store_true",
+                      help='verbose logging')
+    parser.add_option('--debug', default=False, action="store_true",
+                      help='debug logging')
+
+    api.add_auth_options(parser)
+
+    return parser
+
+if __name__ == "__main__":
+    main()
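
As a usage note (not part of the patch): with the options defined above, one plausible invocation would be along the lines of "python encode_submission/trackhub.py --name my_submission --scan-submission --make-hub results.txt", where my_submission and results.txt are hypothetical. Each positional argument is read into the ResultMap, --scan-submission imports the submission metadata into the RDF model, and --make-hub prints the rendered trackDb text.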
diff --git a/htsworkflow/submission/trackhub.py b/htsworkflow/submission/trackhub.py
new file mode 100644
index 0000000..e3087fe
--- /dev/null
+++ b/htsworkflow/submission/trackhub.py
@@ -0,0 +1,55 @@
+import logging
+import os
+
+import RDF
+
+from htsworkflow.submission.submission import Submission
+
+from htsworkflow.util.rdfhelp import \
+    fromTypedNode, \
+    geoSoftNS, \
+    stripNamespace, \
+    submissionOntology
+
+from django.conf import settings
+from django.template import Context, loader
+
+LOGGER = logging.getLogger(__name__)
+
+class TrackHubSubmission(Submission):
+    def __init__(self, name, model, host):
+        super(TrackHubSubmission, self).__init__(name, model, host)
+
+    def make_hub(self, result_map):
+        samples = []
+        for lib_id, result_dir in result_map.items():
+            an_analysis = self.get_submission_node(result_dir)
+            metadata = self.get_sample_metadata(an_analysis)
+            if len(metadata) == 0:
+                errmsg = 'No metadata found for {0}'
+                LOGGER.error(errmsg.format(str(an_analysis)))
+                continue
+            elif len(metadata) > 1:
+                errmsg = 'Confused: found more than one sample for %s'
+                LOGGER.debug(errmsg % (str(an_analysis),))
+            metadata = metadata[0]
+            samples.append(metadata)
+
+        trackdb_template = loader.get_template('trackDb.txt')
+        context = Context({
+            'samples': samples,
+        })
+        # for now just dump the rendered trackDb text to stdout
+        print str(trackdb_template.render(context))
+
+    def get_sample_metadata(self, analysis_node):
+        """Gather the per-library attributes needed for the trackDb entries.
+        """
+        query_template = loader.get_template('trackhub_samples.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
+        })
+
+        results = self.execute_query(query_template, context)
+        return results
\ No newline at end of file
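
For context, a minimal sketch of how TrackHubSubmission might be driven outside the front-end script; this is not part of the patch. The submission name, results-map file, and host URL below are hypothetical, and the RDF model is assumed to be empty until scan_submission_dirs fills it in. Only calls that appear in the patch above are used.

# Hypothetical usage sketch -- not part of this patch.
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'htsworkflow.settings')

from htsworkflow.util.rdfhelp import get_model
from htsworkflow.submission.results import ResultMap
from htsworkflow.submission.trackhub import TrackHubSubmission

model = get_model(None, None)                 # in-memory model, as when no --model/--db-path is given
results = ResultMap()
results.add_results_from_file('results.txt')  # hypothetical results-map file

hub = TrackHubSubmission('my_submission', model, host='http://localhost')
hub.scan_submission_dirs(results)             # import submission metadata into the RDF model
hub.make_hub(results)                         # prints the rendered trackDb.txt to stdout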
diff --git a/htsworkflow/templates/trackDb.txt b/htsworkflow/templates/trackDb.txt
new file mode 100644
index 0000000..8b839a6
--- /dev/null
+++ b/htsworkflow/templates/trackDb.txt
@@ -0,0 +1,30 @@
+track singleCell
+compositeTrack on
+visibility dense
+shortLabel Single RNA-Seq
+longLabel ENCODE Single cell and small pool RNA-Seq
+subGroup1 tier Tier t1=1 t2=2 t3=3
+subGroup2 poolSize \
+    Single=single \
+    Ten=10_cells \
+    Eleven=11_cells \
+    Hundred=100_cells \
+    Pool=Pool
+subGroup3 cellType Cell_Line GM12878=GM12878 H1hESC=H1-hESC K562=K562 HeLaS3=HeLa-S3 HepG2=HepG2 HUVEC=HUVEC T8988T=8988T A549=A549 AG04449=AG04449 AG04450=AG04450 AG09309=AG09309 AG09319=AG09319 AG10803=AG10803 AoAF=AoAF AoSMC=AoSMC BE2C=BE2_C BJ=BJ Caco2=Caco-2 CD20=CD20+ CD34Mobilized=CD34+_Mobilized Chorion=Chorion CLL=CLL CMK=CMK Fibrobl=Fibrobl FibroP=FibroP Gliobla=Gliobla GM06990=GM06990 GM12864=GM12864 GM12865=GM12865 GM12891=GM12891 GM12892=GM12892 GM18507=GM18507 GM19238=GM19238 GM19239=GM19239 GM19240=GM19240 H7hESC=H7-hESC H9ES=H9ES HAh=HA-h HAsp=HA-sp HAc=HAc HAEpiC=HAEpiC HBMEC=HBMEC HCF=HCF HCFaa=HCFaa HCM=HCM HConF=HConF HCPEpiC=HCPEpiC HCT116=HCT-116 HEEpiC=HEEpiC Hepatocytes=Hepatocytes HFF=HFF HFFMyc=HFF-Myc HGF=HGF HIPEpiC=HIPEpiC HL60=HL-60 HMEC=HMEC HMF=HMF HMVECdAd=HMVEC-dAd HMVECdBlAd=HMVEC-dBl-Ad HMVECdBlNeo=HMVEC-dBl-Neo HMVECdLyAd=HMVEC-dLy-Ad HMVECdLyNeo=HMVEC-dLy-Neo HMVECdNeo=HMVEC-dNeo HMVECLBl=HMVEC-LBl HMVECLLy=HMVEC-LLy HNPCEpiC=HNPCEpiC HPAEC=HPAEC HPAF=HPAF HPDE6E6E7=HPDE6-E6E7 HPdLF=HPdLF HPF=HPF HRCEpiC=HRCEpiC HRE=HRE HRGEC=HRGEC HRPEpiC=HRPEpiC HSMM=HSMM HSMMemb=HSMM_emb HSMMtube=HSMMtube HTR8svn=HTR8svn Huh7=Huh-7 Huh75=Huh-7.5 HVMF=HVMF iPS=iPS Ishikawa=Ishikawa Jurkat=Jurkat K562=K562 LNCaP=LNCaP MCF7=MCF-7 Medullo=Medullo Melano=Melano MonocytesCD14RO01746=Monocytes-CD14+_RO01746 Myometr=Myometr NB4=NB4 NHA=NH-A NHDFAd=NHDF-Ad NHDFneo=NHDF-neo NHEK=NHEK NHLF=NHLF NT2D1=NT2-D1 Osteobl=Osteobl PANC1=PANC-1 PanIsletD=PanIsletD PanIslets=PanIslets pHTE=pHTE PrEC=PrEC ProgFib=ProgFib RPTEC=RPTEC RWPE1=RWPE1 SAEC=SAEC SKNMC=SK-N-MC SKNSHRA=SK-N-SH_RA SkMC=SkMC Stellate=Stellate T47D=T-47D Th0=Th0 Th1=Th1 Th2=Th2 Urothelia=Urothelia WERIRb1=WERI-Rb-1 WI38=WI-38
+subGroup4 readType Read_type R1x100=1x100
+dimensions dimX=poolSize dimY=cellType dimA=readType
+dragAndDrop subTracks
+type bam
+
+{% for sample in samples %}
+    track sample_{{ sample.library_id }}
+    parent singleCell on
+    bigDataUrl {{ sample.bam }}
+    shortLabel {{ sample.library_id }}
+    longLabel {{ sample.name }}
+    type bam
+    subGroups tier=t1 \
+        cellType={{ sample.cell }} \
+        poolSize={{ sample.input_quantity }} \
+        readType=R1x100
+{% endfor %}
\ No newline at end of file
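
For illustration, one stanza rendered from this template might look like the following; the library id, BAM URL, long label, cell line, and pool size are made-up sample values.

    track sample_12345
    parent singleCell on
    bigDataUrl http://example.com/hub/12345.bam
    shortLabel 12345
    longLabel Example single cell RNA-Seq library
    type bam
    subGroups tier=t1 \
        cellType=GM12878 \
        poolSize=Single \
        readType=R1x100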
diff --git a/htsworkflow/templates/trackhub_samples.sparql b/htsworkflow/templates/trackhub_samples.sparql
new file mode 100644
index 0000000..5930d9f
--- /dev/null
+++ b/htsworkflow/templates/trackhub_samples.sparql
@@ -0,0 +1,42 @@
+PREFIX libraryOntology:
+PREFIX submissionOntology:
+PREFIX ucscDaf:
+PREFIX ncbiTaxon:
+PREFIX geoSoft:
+PREFIX cells:
+
+select distinct ?name ?bam ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?tier ?experiment_type ?library_selection ?library_source ?input_quantity
+WHERE {
+  <{{submission}}> a submissionOntology:submission ;
+                   submissionOntology:library ?library ;
+                   submissionOntology:name ?name ;
+                   ucscDaf:has_file ?file .
+  ?file ucscDaf:filename ?bam .
+  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+  OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell .
+             OPTIONAL { ?cell_line cells:cell ?cell .
+                        OPTIONAL { ?cell_line cells:documents ?growthProtocol . }
+                        OPTIONAL { ?cell_line cells:tier ?tier . } } }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?library_id }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:species ?species_name .
+             ?species libraryOntology:species ?species_name ;
+                      libraryOntology:taxon_id ?taxon_id . }
+  OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+  OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
+  OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
+  OPTIONAL { ?library libraryOntology:librarySource ?library_source }
+  OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol }
+  OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule }
+  OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library libraryOntology:inputQuantity ?input_quantity }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+}
\ No newline at end of file