Preliminary implementation of trackDb generation.
author Diane Trout <diane@caltech.edu>
Fri, 1 Feb 2013 01:04:58 +0000 (17:04 -0800)
committer Diane Trout <diane@caltech.edu>
Fri, 1 Feb 2013 01:04:58 +0000 (17:04 -0800)
This is super preliminary; important report parts are
hard coded instead of being detected properly.

encode_submission/trackhub.py [new file with mode: 0644]
htsworkflow/submission/trackhub.py [new file with mode: 0644]
htsworkflow/templates/trackDb.txt [new file with mode: 0644]
htsworkflow/templates/trackhub_samples.sparql [new file with mode: 0644]

diff --git a/encode_submission/trackhub.py b/encode_submission/trackhub.py
new file mode 100644 (file)
index 0000000..4ee2061
--- /dev/null
@@ -0,0 +1,152 @@
+"""Create a track hub 
+"""
+
+#!/usr/bin/env python
+from ConfigParser import SafeConfigParser
+import fnmatch
+from glob import glob
+import json
+import logging
+import netrc
+from optparse import OptionParser, OptionGroup
+import os
+from pprint import pprint, pformat
+import shlex
+from StringIO import StringIO
+import stat
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+from zipfile import ZipFile
+
+import RDF
+
+if not 'DJANGO_SETTINGS_MODULE' in os.environ:
+    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
+
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
+     fromTypedNode, \
+     get_model, \
+     get_serializer, \
+     load_into_model, \
+     sparql_query, \
+     submissionOntology
+from htsworkflow.submission.daf import get_submission_uri
+from htsworkflow.submission.results import ResultMap
+from htsworkflow.submission.trackhub import TrackHubSubmission
+from htsworkflow.submission.condorfastq import CondorFastqExtract
+
+logger = logging.getLogger(__name__)
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+    submission_uri = None
+
+    if opts.debug:
+        logging.basicConfig(level = logging.DEBUG )
+    elif opts.verbose:
+        logging.basicConfig(level = logging.INFO )
+    else:
+        logging.basicConfig(level = logging.WARNING )
+
+    apidata = api.make_auth_from_opts(opts, parser)
+
+    model = get_model(opts.model, opts.db_path)
+    mapper = None
+    if opts.name:
+        mapper = TrackHubSubmission(opts.name,  model, host=opts.host)
+        submission_uri = get_submission_uri(opts.name)
+
+
+    if opts.load_rdf is not None:
+        if submission_uri is None:
+            parser.error("Please specify the submission name")
+        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
+
+    results = ResultMap()
+    for a in args:
+        if os.path.exists(a):
+            results.add_results_from_file(a)
+        else:
+            logger.warn("File %s doesn't exist.", a)
+
+    if opts.make_tree_from is not None:
+        results.make_tree_from(opts.make_tree_from)
+
+    if opts.fastq:
+        logger.info("Building fastq extraction scripts")
+        flowcells = os.path.join(opts.sequence, 'flowcells')
+        extractor = CondorFastqExtract(opts.host, flowcells,
+                                       model=opts.model,
+                                       force=opts.force)
+        extractor.create_scripts(results)
+
+    if opts.scan_submission:
+        if opts.name is None:
+            parser.error("Please define a submission name")
+        mapper.scan_submission_dirs(results)
+
+    if opts.make_hub:
+        mapper.make_hub(results)
+
+    if opts.sparql:
+        sparql_query(model, opts.sparql)
+
+    if opts.print_rdf:
+        writer = get_serializer()
+        print writer.serialize_model_to_string(model)
+
+
+def make_parser():
+    parser = OptionParser()
+
+    model = OptionGroup(parser, 'model')
+    model.add_option('--name', help="Set submission name")
+    model.add_option('--db-path', default=None,
+                     help="set rdf database path")
+    model.add_option('--model', default=None,
+      help="Load model database")
+    model.add_option('--load-rdf', default=None,
+      help="load rdf statements into model")
+    model.add_option('--sparql', default=None, help="execute sparql query")
+    model.add_option('--print-rdf', action="store_true", default=False,
+      help="print ending model state")
+    parser.add_option_group(model)
+    # commands
+    commands = OptionGroup(parser, 'commands')
+    commands.add_option('--make-tree-from',
+                      help="create directories & link data files",
+                      default=None)
+    commands.add_option('--fastq', default=False, action="store_true",
+                        help="generate scripts for making fastq files")
+    commands.add_option('--scan-submission', default=False, action="store_true",
+                      help="Import metadata for submission into our model")
+    commands.add_option('--make-hub', help='make the hub file', default=False,
+                      action="store_true")
+
+    parser.add_option_group(commands)
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="Force regenerating fastqs")
+    parser.add_option('--daf', default=None, help='specify daf name')
+    parser.add_option('--library-url', default=None,
+                      help="specify an alternate source for library information")
+    # debugging
+    parser.add_option('--verbose', default=False, action="store_true",
+                      help='verbose logging')
+    parser.add_option('--debug', default=False, action="store_true",
+                      help='debug logging')
+
+    api.add_auth_options(parser)
+
+    return parser
+
+if __name__ == "__main__":
+    main()
diff --git a/htsworkflow/submission/trackhub.py b/htsworkflow/submission/trackhub.py
new file mode 100644 (file)
index 0000000..e3087fe
--- /dev/null
@@ -0,0 +1,55 @@
+import logging
+import os
+
+import RDF
+
+from htsworkflow.submission.submission import Submission
+
+from htsworkflow.util.rdfhelp import \
+     fromTypedNode, \
+     geoSoftNS, \
+     stripNamespace, \
+     submissionOntology
+
+from django.conf import settings
+from django.template import Context, loader
+
+LOGGER = logging.getLogger(__name__)
+
+class TrackHubSubmission(Submission):
+    def __init__(self, name, model, host):
+        super(TrackHubSubmission, self).__init__(name, model, host)
+
+    def make_hub(self, result_map):
+        samples = []
+        for lib_id, result_dir in result_map.items():
+            an_analysis = self.get_submission_node(result_dir)
+            metadata = self.get_sample_metadata(an_analysis)
+            if len(metadata) == 0:
+                errmsg = 'No metadata found for {0}'
+                LOGGER.error(errmsg.format(str(an_analysis),))
+                continue
+            elif len(metadata) > 1:
+                errmsg = 'Confused there are more than one sample for %s'
+                LOGGER.debug(errmsg % (str(an_analysis),))
+            metadata = metadata[0]
+            samples.append(metadata)
+
+        soft_template = loader.get_template('trackDb.txt')
+        context = Context({
+            'samples': samples,
+        })
+        print str(soft_template.render(context))
+        
+    def get_sample_metadata(self, analysis_node):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('trackhub_samples.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
+            })
+
+        results = self.execute_query(query_template, context)
+        return results
\ No newline at end of file
diff --git a/htsworkflow/templates/trackDb.txt b/htsworkflow/templates/trackDb.txt
new file mode 100644 (file)
index 0000000..8b839a6
--- /dev/null
@@ -0,0 +1,30 @@
+track singleCell
+compositeTrack on
+visibility dense
+shortLabel Single RNA-Seq
+longLabel ENCODE Single cell and small pool RNA-Seq
+subGroup1 tier Tier t1=1 t2=2 t3=3
+subGroup2 poolSize Pool_Size \
+          Single=single \
+          Ten=10_cells \
+          Eleven=11_cells \
+          Hundred=100_Cells \
+          Pool=Pool{# every member line except the last must end with a continuation backslash #}
+subGroup3 cellType Cell_Line GM12878=GM12878 H1hESC=H1-hESC K562=K562 HeLaS3=HeLa-S3 HepG2=HepG2 HUVEC=HUVEC T8988T=8988T A549=A549 AG04449=AG04449 AG04450=AG04450 AG09309=AG09309 AG09319=AG09319 AG10803=AG10803 AoAF=AoAF AoSMC=AoSMC BE2C=BE2_C BJ=BJ Caco2=Caco-2 CD20=CD20+ CD34Mobilized=CD34+_Mobilized Chorion=Chorion CLL=CLL CMK=CMK Fibrobl=Fibrobl FibroP=FibroP Gliobla=Gliobla GM06990=GM06990 GM12864=GM12864 GM12865=GM12865 GM12891=GM12891 GM12892=GM12892 GM18507=GM18507 GM19238=GM19238 GM19239=GM19239 GM19240=GM19240 H7hESC=H7-hESC H9ES=H9ES HAh=HA-h HAsp=HA-sp HAc=HAc HAEpiC=HAEpiC HBMEC=HBMEC HCF=HCF HCFaa=HCFaa HCM=HCM HConF=HConF HCPEpiC=HCPEpiC HCT116=HCT-116 HEEpiC=HEEpiC Hepatocytes=Hepatocytes HFF=HFF HFFMyc=HFF-Myc HGF=HGF HIPEpiC=HIPEpiC HL60=HL-60 HMEC=HMEC HMF=HMF HMVECdAd=HMVEC-dAd HMVECdBlAd=HMVEC-dBl-Ad HMVECdBlNeo=HMVEC-dBl-Neo HMVECdLyAd=HMVEC-dLy-Ad HMVECdLyNeo=HMVEC-dLy-Neo HMVECdNeo=HMVEC-dNeo HMVECLBl=HMVEC-LBl HMVECLLy=HMVEC-LLy HNPCEpiC=HNPCEpiC HPAEC=HPAEC HPAF=HPAF HPDE6E6E7=HPDE6-E6E7 HPdLF=HPdLF HPF=HPF HRCEpiC=HRCEpiC HRE=HRE HRGEC=HRGEC HRPEpiC=HRPEpiC HSMM=HSMM HSMMemb=HSMM_emb HSMMtube=HSMMtube HTR8svn=HTR8svn Huh7=Huh-7 Huh75=Huh-7.5 HVMF=HVMF iPS=iPS Ishikawa=Ishikawa Jurkat=Jurkat K562=K562 LNCaP=LNCaP MCF7=MCF-7 Medullo=Medullo Melano=Melano MonocytesCD14RO01746=Monocytes-CD14+_RO01746 Myometr=Myometr NB4=NB4 NHA=NH-A NHDFAd=NHDF-Ad NHDFneo=NHDF-neo NHEK=NHEK NHLF=NHLF NT2D1=NT2-D1 Osteobl=Osteobl PANC1=PANC-1 PanIsletD=PanIsletD PanIslets=PanIslets pHTE=pHTE PrEC=PrEC ProgFib=ProgFib RPTEC=RPTEC RWPE1=RWPE1 SAEC=SAEC SKNMC=SK-N-MC SKNSHRA=SK-N-SH_RA SkMC=SkMC Stellate=Stellate T47D=T-47D Th0=Th0 Th1=Th1 Th2=Th2 Urothelia=Urothelia WERIRb1=WERI-Rb-1 WI38=WI-38 
+subGroup4 readType Read_type R1x100=1x100
+dimensions dimX=poolSize dimY=cellType dimA=readType
+dragAndDrop subTracks
+type bam
+
+{% for sample in samples %}{# one child track stanza per sample row; subGroups keys must match the parent's subGroupN names #}
+    track sample_{{ sample.library_id }}
+    parent singleCell on
+    bigDataUrl {{ sample.bam }}
+    shortLabel {{ sample.library_id }}
+    longLabel {{ sample.name }}
+    type bam
+    subGroups tier=t1 \
+              cellType={{ sample.cell }} \
+              poolSize={{ sample.input_quantity }} \
+              readType=R1x100
+{% endfor %}
\ No newline at end of file
diff --git a/htsworkflow/templates/trackhub_samples.sparql b/htsworkflow/templates/trackhub_samples.sparql
new file mode 100644 (file)
index 0000000..5930d9f
--- /dev/null
@@ -0,0 +1,42 @@
+# Sample metadata query for a single submission.
+# The submission and submissionSet placeholders below are Django template
+# variables; the caller supplies both as URI strings in the template context.
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+select distinct ?name ?bam ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?tier ?experiment_type ?library_selection ?library_source ?input_quantity
+WHERE {
+  # Required: the submission must have a library, a name and at least one
+  # file with a filename.  Everything after this is OPTIONAL and is left
+  # unbound when the model has no matching statement.
+  <{{submission}}> a submissionOntology:submission ;
+                   submissionOntology:library ?library ;
+                   submissionOntology:name ?name ;
+                   ucscDaf:has_file ?file .
+  ?file ucscDaf:filename ?bam .
+  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+  OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  # The cell value found on the library is also used to look up a
+  # cells:cell entry, from which the documents and tier values are read.
+  OPTIONAL { ?library libraryOntology:cell_line ?cell .
+             OPTIONAL { ?cell_line cells:cell ?cell .
+                        OPTIONAL { ?cell_line cells:documents ?growthProtocol . }
+                        OPTIONAL { ?cell_line cells:tier ?tier . } } }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?library_id }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  # Species name and taxon id are only bound together: the group needs a
+  # ?species node carrying the same species name and a taxon_id.
+  OPTIONAL { ?library libraryOntology:species ?species_name .
+             ?species libraryOntology:species ?species_name ;
+                      libraryOntology:taxon_id ?taxon_id . }
+  OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+  OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
+  OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
+  OPTIONAL { ?library libraryOntology:librarySource ?library_source }
+  # The data processing description hangs off the submission set, not the
+  # individual submission or library.
+  OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol }
+  OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule }
+  OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library libraryOntology:inputQuantity ?input_quantity }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+}
\ No newline at end of file