Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
author Diane Trout <diane@caltech.edu>
Sat, 5 Nov 2011 00:01:41 +0000 (17:01 -0700)
committer Diane Trout <diane@caltech.edu>
Sat, 5 Nov 2011 00:01:41 +0000 (17:01 -0700)
Conflicts:
htsworkflow/pipelines/eland.py
htsworkflow/pipelines/retrieve_config.py
htsworkflow/pipelines/runfolder.py
scripts/htsw-runfolder

extra/ucsc_encode_submission/encode_find.py
htsworkflow/frontend/samples/views.py
htsworkflow/pipelines/retrieve_config.py
htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/srf.py
scripts/htsw-runfolder

diff --cc extra/ucsc_encode_submission/encode_find.py
index fdf754241de090b7ca71eea2abc859eae0b8d194,a01167047b3528f5bcc5257a898c7d1bffe04663..508291244ffea73c8cda46b7a74b875bc3ca1af3
@@@ -268,10 -277,10 +277,10 @@@ def update_submission_detail(model, sub
  
      if len(status_nodes) == 0:
          # has no status node, add one
 -        logging.info("Adding status node to {0}".format(subUrn))
 +        LOGGER.info("Adding status node to {0}".format(subUrn))
          status_node = create_status_node(subUrn, recent_update)
          add_stmt(model, subUrn, HasStatusN, status_node)
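+         # rdf:type (rdfNS) is the correct typing predicate, not rdfs:type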
-         add_stmt(model, status_node, rdfsNS['type'], StatusN)
+         add_stmt(model, status_node, rdfNS['type'], StatusN)
          add_stmt(model, status_node, StatusN, status)
          add_stmt(model, status_node, LastModifyN, recent_update)
          update_ddf(model, subUrn, status_node, cookie=cookie)
@@@ -298,9 -307,14 +307,14 @@@ def update_daf(model, submission_url, s
  
      status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
      if not model.contains_statement(status_is_daf):
 -        logging.info('Adding daf to {0}, {1}'.format(submission_url,
 +        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                       status_node))
          daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
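+         # record an md5 checksum of the fetched DAF text on the status node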
+         daf_hash = hashlib.md5(daf_text).hexdigest()
+         daf_hash_stmt = RDF.Statement(status_node,
+                                       dafTermOntology['md5sum'],
+                                       daf_hash)
+         model.add_statement(daf_hash_stmt)
          daf.fromstring_into_model(model, status_node, daf_text)
  
  
diff --cc htsworkflow/frontend/samples/views.py
Simple merge
diff --cc htsworkflow/pipelines/retrieve_config.py
index 42ff263d97d99e6f54a659b55b5d6dba0bf13fe2,48b581e72be8aecc329b90fe204f818c5295d174..94d8f5036089a47daeaad4a4b3f44a58540d0555
@@@ -317,20 -315,20 +317,20 @@@ def saveConfigFile(options)
  
    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
  
 -  logging.debug('genome_dir: %s' % ( options.genome_dir, ))
 +  LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
 -  logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
 +  LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
  
-   #config = format_gerald_config(options, flowcell_info, genome_map)
-   #
-   #if options.output_filepath is not None:
-   #    outstream = open(options.output_filepath, 'w')
-   #    LOGGER.info('Writing config file to %s' % (options.output_filepath,))
-   #else:
-   #    outstream = sys.stdout
-   #
-   #outstream.write(config)
+   config = format_gerald_config(options, flowcell_info, genome_map)
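+   # write the generated config to the requested file, or to stdout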
+   if options.output_filepath is not None:
+       outstream = open(options.output_filepath, 'w')
+       LOGGER.info('Writing config file to %s' % (options.output_filepath,))
+   else:
+       outstream = sys.stdout
+   outstream.write(config)
  
    if options.sample_sheet is None:
        pass
diff --cc htsworkflow/pipelines/runfolder.py
index 568e683ff2757bb65f9315f5059317cc900697c0,fcee65ce8d32806efbffb5abf610a0bf094d0b5c..4a2b4cdc3704fccb1b7fbdd5f48211c86851b6c4
@@@ -13,12 -13,10 +13,12 @@@ import tarfil
  import time
  
  try:
-   from xml.etree import ElementTree
+     from xml.etree import ElementTree
  except ImportError, e:
-   from elementtree import ElementTree
+     from elementtree import ElementTree
  
 +LOGGER = logging.getLogger(__name__)
 +
  EUROPEAN_STRPTIME = "%d-%m-%Y"
  EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
  VERSION_RE = "([0-9\.]+)"
@@@ -57,23 -55,23 +57,23 @@@ class PipelineRun(object)
      def _get_flowcell_id(self):
          # extract flowcell ID
          if self._flowcell_id is None:
-           config_dir = os.path.join(self.pathname, 'Config')
-           flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
-         if os.path.exists(flowcell_id_path):
-             flowcell_id_tree = ElementTree.parse(flowcell_id_path)
-             self._flowcell_id = flowcell_id_tree.findtext('Text')
-         else:
-             path_fields = self.pathname.split('_')
-             if len(path_fields) > 0:
-               # guessing last element of filename
-               flowcell_id = path_fields[-1]
+             config_dir = os.path.join(self.pathname, 'Config')
+             flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
+             if os.path.exists(flowcell_id_path):
+                 flowcell_id_tree = ElementTree.parse(flowcell_id_path)
+                 self._flowcell_id = flowcell_id_tree.findtext('Text')
              else:
-               flowcell_id = 'unknown'
+                 path_fields = self.pathname.split('_')
+                 if len(path_fields) > 0:
+                     # guessing last element of filename
 -                   flowcell_id = path_fields[-1]
++                   self._flowcell_id = path_fields[-1]
+                 else:
 -                   flowcell_id = 'unknown'
++                   self._flowcell_id = 'unknown'
++
++                 LOGGER.warning(
++                     "Flowcell id was not found, guessing %s" % (
++                     self._flowcell_id))
  
-           LOGGER.warning(
-             "Flowcell id was not found, guessing %s" % (
-                flowcell_id))
-           self._flowcell_id = flowcell_id
 -                logging.warning(
 -                  "Flowcell id was not found, guessing %s" % (
 -                     flowcell_id))
 -                self._flowcell_id = flowcell_id
          return self._flowcell_id
      flowcell_id = property(_get_flowcell_id)
  
@@@ -183,15 -181,15 +183,15 @@@ def get_runs(runfolder, flowcell_id=Non
          # RTA BaseCalls looks enough like Bustard.
          bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
          for bustard_pathname in bustard_dirs:
 -            logging.info("Found bustard directory %s" % (bustard_pathname,))
 +            LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
              b = bustard.bustard(bustard_pathname)
              gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
 -            logging.info("Looking for gerald directories in %s" % (pathname,))
 +            LOGGER.info("Looking for gerald directories in %s" % (pathname,))
              for gerald_pathname in glob(gerald_glob):
 -                logging.info("Found gerald directory %s" % (gerald_pathname,))
 +                LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
                  try:
                      g = gerald.gerald(gerald_pathname)
-                     p = PipelineRun(runfolder)
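+                     # pass the optional flowcell_id override through to the run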
+                     p = PipelineRun(runfolder, flowcell_id)
                      p.image_analysis = image_analysis
                      p.bustard = b
                      p.gerald = g
@@@ -613,12 -611,20 +613,20 @@@ def clean_runs(runs, dry_run=True)
          calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
          rm_list(calibration_dir, dry_run)
          # rm Images/L*
 -        logging.info("Cleaning images")
 +        LOGGER.info("Cleaning images")
          image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
          rm_list(image_dirs, dry_run)
-         # cd Data/C1-*_Firecrest*
-         LOGGER.info("Cleaning intermediate files")
+         # rm ReadPrep
 -        logging.info("Cleaning ReadPrep*")
++        LOGGER.info("Cleaning ReadPrep*")
+         read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
+         rm_list(read_prep_dirs, dry_run)
+         # rm Thumbnail_Images
 -        logging.info("Cleaning Thubmnail_images")
++        LOGGER.info("Cleaning Thumbnail_Images")
+         thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
+         rm_list(thumbnail_dirs, dry_run)
          # make clean_intermediate
+         LOGGER.info("Cleaning intermediate files")
          if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
              clean_process = subprocess.Popen(['make', 'clean_intermediate'],
                                               cwd=run.image_analysis.pathname,)
diff --cc htsworkflow/pipelines/srf.py
Simple merge
diff --cc scripts/htsw-runfolder
index b9dfa9096fb7f5960db7011f61086e40e0e15f0a,7ae95b4f49ebab11f15b319647f9b19483bd1fae..53bceeff47e920b5e9ddf2696272f7a17d2ec139
@@@ -36,8 -40,93 +40,94 @@@ import sy
  from htsworkflow.pipelines import runfolder
  from htsworkflow.pipelines.runfolder import ElementTree
  
 +LOGGER = logging.getLogger(__name__)
  
+ def main(cmdlist=None):
+     parser = make_parser()
+     opts, args = parser.parse_args(cmdlist)
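+     # configure the root logger; the debug/verbose options raise its level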
+     logging.basicConfig()
+     root_log = logging.getLogger()
+     if opts.debug:
+         root_log.setLevel(logging.DEBUG)
+     elif opts.verbose:
+         root_log.setLevel(logging.INFO)
+     LOGGER.info('Starting htsworkflow Illumina runfolder processing tool.')
+     runs = []
+     runs.extend(load_run_xml_file(parser, args, opts))
+     runs.extend(load_specific_runfolder_analysis(parser, args, opts))
+     runs.extend(load_runfolders(parser, args, opts))
+     if len(runs) == 0:
+         parser.error("Please specify some run folders to process")
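+     # track whether at least one command option actually ran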
+     command_run = False
+     if opts.summary:
+         print runfolder.summary_report(runs)
+         command_run = True
+     if opts.archive:
+         runfolder.extract_run_parameters(runs)
+         command_run = True
+     if opts.extract_results:
+         command_run = True
+         extract_results(parser, args, opts, runs)
+     if opts.clean:
+         runfolder.clean_runs(runs, opts.dry_run)
+         command_run = True
+     if not command_run:
+         parser.error("No commands provided")
+     return 0
+
+ def load_run_xml_file(parser, args, opts):
+     runs = []
+     if opts.run_xml:
+         # handle ~ shortcut
+         opts.run_xml = os.path.expanduser(opts.run_xml)
+         tree = ElementTree.parse(opts.run_xml).getroot()
+         runs.append(runfolder.PipelineRun(xml=tree))
+     return runs
+
+ def load_specific_runfolder_analysis(parser, args, opts):
+     # look for manually specified run
+     runs = []
+     if opts.use_run is not None:
+         specific_run = runfolder.get_specific_run(opts.use_run)
+         if specific_run is not None:
+             runs.append(specific_run)
+         else:
+             LOGGER.warning("Couldn't find a run in %s" % (opts.use_run,))
+     return runs
+
+ def load_runfolders(parser, args, opts):
+     if opts.flowcell_id is not None:
+         if len(args) != 1:
+             parser.error(
+                 'Can only force flowcell ID when operating on one run')
+     # scan runfolders for runs
+     runs = []
+     for run_pattern in args:
+         # expand args on our own if needed
+         for run_dir in glob(run_pattern):
+             runs.extend(runfolder.get_runs(run_dir, opts.flowcell_id))
+     return runs
+
+ def extract_results(parser, args, opts, runs):
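+     # extract-results modifies the filesystem, so dry-run is rejected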
+     if opts.dry_run:
+         parser.error("Dry-run is not supported for extract-results")
+     runfolder.extract_results(runs,
+                               opts.output_dir,
+                               opts.site,
+                               opts.max_jobs,
+                               opts.raw_format)
+
  def make_parser():
      usage = 'usage: %prog [options] runfolder_root_dir'
      parser = optparse.OptionParser(usage)