Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
author Diane Trout <diane@caltech.edu>
Sat, 5 Nov 2011 00:01:41 +0000 (17:01 -0700)
committer Diane Trout <diane@caltech.edu>
Sat, 5 Nov 2011 00:01:41 +0000 (17:01 -0700)
Conflicts:
htsworkflow/pipelines/eland.py
htsworkflow/pipelines/retrieve_config.py
htsworkflow/pipelines/runfolder.py
scripts/htsw-runfolder

extra/ucsc_encode_submission/encode_find.py
htsworkflow/frontend/samples/views.py
htsworkflow/pipelines/retrieve_config.py
htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/srf.py
scripts/htsw-runfolder

diff --cc extra/ucsc_encode_submission/encode_find.py
index fdf754241de090b7ca71eea2abc859eae0b8d194,a01167047b3528f5bcc5257a898c7d1bffe04663..508291244ffea73c8cda46b7a74b875bc3ca1af3
@@@ -268,10 -277,10 +277,10 @@@ def update_submission_detail(model, sub
  
      if len(status_nodes) == 0:
          # has no status node, add one
 -        logging.info("Adding status node to {0}".format(subUrn))
 +        LOGGER.info("Adding status node to {0}".format(subUrn))
          status_node = create_status_node(subUrn, recent_update)
          add_stmt(model, subUrn, HasStatusN, status_node)
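+         # rdf:type (rdfNS) is the correct typing predicate, not rdfs:type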
-         add_stmt(model, status_node, rdfsNS['type'], StatusN)
+         add_stmt(model, status_node, rdfNS['type'], StatusN)
          add_stmt(model, status_node, StatusN, status)
          add_stmt(model, status_node, LastModifyN, recent_update)
          update_ddf(model, subUrn, status_node, cookie=cookie)
@@@ -298,9 -307,14 +307,14 @@@ def update_daf(model, submission_url, s
  
      status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
      if not model.contains_statement(status_is_daf):
 -        logging.info('Adding daf to {0}, {1}'.format(submission_url,
 +        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                       status_node))
          daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
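+         # record an md5 checksum of the fetched DAF text on the status node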
+         daf_hash = hashlib.md5(daf_text).hexdigest()
+         daf_hash_stmt = RDF.Statement(status_node,
+                                       dafTermOntology['md5sum'],
+                                       daf_hash)
+         model.add_statement(daf_hash_stmt)
          daf.fromstring_into_model(model, status_node, daf_text)
  
  
diff --cc htsworkflow/frontend/samples/views.py
Simple merge
diff --cc htsworkflow/pipelines/retrieve_config.py
index 42ff263d97d99e6f54a659b55b5d6dba0bf13fe2,48b581e72be8aecc329b90fe204f818c5295d174..94d8f5036089a47daeaad4a4b3f44a58540d0555
@@@ -317,20 -315,20 +317,20 @@@ def saveConfigFile(options)
  
    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
  
 -  logging.debug('genome_dir: %s' % ( options.genome_dir, ))
 +  LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
 -  logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
 +  LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
  
-   #config = format_gerald_config(options, flowcell_info, genome_map)
-   #
-   #if options.output_filepath is not None:
-   #    outstream = open(options.output_filepath, 'w')
-   #    LOGGER.info('Writing config file to %s' % (options.output_filepath,))
-   #else:
-   #    outstream = sys.stdout
-   #
-   #outstream.write(config)
+   config = format_gerald_config(options, flowcell_info, genome_map)
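+   # write the generated config to the requested file, or to stdout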
+   if options.output_filepath is not None:
+       outstream = open(options.output_filepath, 'w')
+       LOGGER.info('Writing config file to %s' % (options.output_filepath,))
+   else:
+       outstream = sys.stdout
+   outstream.write(config)
  
    if options.sample_sheet is None:
        pass
diff --cc htsworkflow/pipelines/runfolder.py
index 568e683ff2757bb65f9315f5059317cc900697c0,fcee65ce8d32806efbffb5abf610a0bf094d0b5c..4a2b4cdc3704fccb1b7fbdd5f48211c86851b6c4
@@@ -13,12 -13,10 +13,12 @@@ import tarfil
  import time
  
  try:
-   from xml.etree import ElementTree
+     from xml.etree import ElementTree
  except ImportError, e:
-   from elementtree import ElementTree
+     from elementtree import ElementTree
  
 +LOGGER = logging.getLogger(__name__)
 +
  EUROPEAN_STRPTIME = "%d-%m-%Y"
  EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
  VERSION_RE = "([0-9\.]+)"
@@@ -57,23 -55,23 +57,23 @@@ class PipelineRun(object)
      def _get_flowcell_id(self):
          # extract flowcell ID
          if self._flowcell_id is None:
-           config_dir = os.path.join(self.pathname, 'Config')
-           flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
-         if os.path.exists(flowcell_id_path):
-             flowcell_id_tree = ElementTree.parse(flowcell_id_path)
-             self._flowcell_id = flowcell_id_tree.findtext('Text')
-         else:
-             path_fields = self.pathname.split('_')
-             if len(path_fields) > 0:
-               # guessing last element of filename
-               flowcell_id = path_fields[-1]
+             config_dir = os.path.join(self.pathname, 'Config')
+             flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
+             if os.path.exists(flowcell_id_path):
+                 flowcell_id_tree = ElementTree.parse(flowcell_id_path)
+                 self._flowcell_id = flowcell_id_tree.findtext('Text')
              else:
-               flowcell_id = 'unknown'
+                 path_fields = self.pathname.split('_')
+                 if len(path_fields) > 0:
+                     # guessing last element of filename
 -                   flowcell_id = path_fields[-1]
++                   self._flowcell_id = path_fields[-1]
+                 else:
 -                   flowcell_id = 'unknown'
++                   self._flowcell_id = 'unknown'
++
++                 LOGGER.warning(
++                     "Flowcell id was not found, guessing %s" % (
++                     self._flowcell_id))
  
-           LOGGER.warning(
-             "Flowcell id was not found, guessing %s" % (
-                flowcell_id))
-           self._flowcell_id = flowcell_id
 -                logging.warning(
 -                  "Flowcell id was not found, guessing %s" % (
 -                     flowcell_id))
 -                self._flowcell_id = flowcell_id
          return self._flowcell_id
      flowcell_id = property(_get_flowcell_id)
  
@@@ -183,15 -181,15 +183,15 @@@ def get_runs(runfolder, flowcell_id=Non
          # RTA BaseCalls looks enough like Bustard.
          bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
          for bustard_pathname in bustard_dirs:
 -            logging.info("Found bustard directory %s" % (bustard_pathname,))
 +            LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
              b = bustard.bustard(bustard_pathname)
              gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
 -            logging.info("Looking for gerald directories in %s" % (pathname,))
 +            LOGGER.info("Looking for gerald directories in %s" % (pathname,))
              for gerald_pathname in glob(gerald_glob):
 -                logging.info("Found gerald directory %s" % (gerald_pathname,))
 +                LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
                  try:
                      g = gerald.gerald(gerald_pathname)
-                     p = PipelineRun(runfolder)
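+                     # pass the optional flowcell_id override through to the run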
+                     p = PipelineRun(runfolder, flowcell_id)
                      p.image_analysis = image_analysis
                      p.bustard = b
                      p.gerald = g
@@@ -613,12 -611,20 +613,20 @@@ def clean_runs(runs, dry_run=True)
          calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
          rm_list(calibration_dir, dry_run)
          # rm Images/L*
 -        logging.info("Cleaning images")
 +        LOGGER.info("Cleaning images")
          image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
          rm_list(image_dirs, dry_run)
-         # cd Data/C1-*_Firecrest*
-         LOGGER.info("Cleaning intermediate files")
+         # rm ReadPrep
 -        logging.info("Cleaning ReadPrep*")
++        LOGGER.info("Cleaning ReadPrep*")
+         read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
+         rm_list(read_prep_dirs, dry_run)
+         # rm Thumbnail_Images
 -        logging.info("Cleaning Thubmnail_images")
++        LOGGER.info("Cleaning Thumbnail_Images")
+         thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
+         rm_list(thumbnail_dirs, dry_run)
          # make clean_intermediate
+         LOGGER.info("Cleaning intermediate files")
          if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
              clean_process = subprocess.Popen(['make', 'clean_intermediate'],
                                               cwd=run.image_analysis.pathname,)
diff --cc htsworkflow/pipelines/srf.py
Simple merge
diff --cc scripts/htsw-runfolder
index b9dfa9096fb7f5960db7011f61086e40e0e15f0a,7ae95b4f49ebab11f15b319647f9b19483bd1fae..53bceeff47e920b5e9ddf2696272f7a17d2ec139
@@@ -36,8 -40,93 +40,94 @@@ import sy
  from htsworkflow.pipelines import runfolder
  from htsworkflow.pipelines.runfolder import ElementTree
  
 +LOGGER = logging.getLogger(__name__)
  
+ def main(cmdlist=None):
+     parser = make_parser()
+     opts, args = parser.parse_args(cmdlist)
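+     # configure the root logger; the debug/verbose options raise its level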
+     logging.basicConfig()
+     root_log = logging.getLogger()
+     if opts.debug:
+         root_log.setLevel(logging.DEBUG)
+     elif opts.verbose:
+         root_log.setLevel(logging.INFO)
+     LOGGER.info('Starting htsworkflow Illumina runfolder processing tool.')
+     runs = []
+     runs.extend(load_run_xml_file(parser, args, opts))
+     runs.extend(load_specific_runfolder_analysis(parser, args, opts))
+     runs.extend(load_runfolders(parser, args, opts))
+     if len(runs) == 0:
+         parser.error("Please specify some run folders to process")
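+     # track whether at least one command option actually ran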
+     command_run = False
+     if opts.summary:
+         print runfolder.summary_report(runs)
+         command_run = True
+     if opts.archive:
+         runfolder.extract_run_parameters(runs)
+         command_run = True
+     if opts.extract_results:
+         command_run = True
+         extract_results(parser, args, opts, runs)
+     if opts.clean:
+         runfolder.clean_runs(runs, opts.dry_run)
+         command_run = True
+     if not command_run:
+         parser.error("No commands provided")
+     return 0
+
+ def load_run_xml_file(parser, args, opts):
+     runs = []
+     if opts.run_xml:
+         # handle ~ shortcut
+         opts.run_xml = os.path.expanduser(opts.run_xml)
+         tree = ElementTree.parse(opts.run_xml).getroot()
+         runs.append(runfolder.PipelineRun(xml=tree))
+     return runs
+
+ def load_specific_runfolder_analysis(parser, args, opts):
+     # look for manually specified run
+     runs = []
+     if opts.use_run is not None:
+         specific_run = runfolder.get_specific_run(opts.use_run)
+         if specific_run is not None:
+             runs.append(specific_run)
+         else:
+             LOGGER.warning("Couldn't find a run in %s" % (opts.use_run,))
+     return runs
+
+ def load_runfolders(parser, args, opts):
+     if opts.flowcell_id is not None:
+         if len(args) != 1:
+             parser.error(
+                 'Can only force flowcell ID when operating on one run')
+     # scan runfolders for runs
+     runs = []
+     for run_pattern in args:
+         # expand args on our own if needed
+         for run_dir in glob(run_pattern):
+             runs.extend(runfolder.get_runs(run_dir, opts.flowcell_id))
+     return runs
+
+ def extract_results(parser, args, opts, runs):
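+     # extract-results modifies the filesystem, so dry-run is rejected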
+     if opts.dry_run:
+         parser.error("Dry-run is not supported for extract-results")
+     runfolder.extract_results(runs,
+                               opts.output_dir,
+                               opts.site,
+                               opts.max_jobs,
+                               opts.raw_format)
+
  def make_parser():
      usage = 'usage: %prog [options] runfolder_root_dir'
      parser = optparse.OptionParser(usage)