htsworkflow/frontend/samples/results.py

   1 from htsworkflow.frontend import settings
   2
   3 import glob
   4 import os
   5 import re
   6
   7 s_paren = re.compile("^\w+")
   8
   9 def get_flowcell_result_dict(flowcell_id):
  10     """
  11     returns a dictionary following the following pattern for
  12     a given flowcell_id:
  13
  14
  15     d['C1-33']['summary']           # Summary.htm file path
  16     d['C1-33']['eland_results'][5]  # C1-33 lane 5 file eland results file path
  17     d['C1-33']['run_xml']           # run_*.xml file path
  18     d['C1-33']['scores']            # scores.tar.gz file path
  19     """
  20     flowcell_id = flowcell_id.strip()
  21
  22     d = {}
  23
  24     ################################
  25     # Flowcell Directory
  26     fc_dir = glob.glob(os.path.join(settings.RESULT_HOME_DIR, flowcell_id))
  27
  28     # Not found
  29     if len(fc_dir) == 0:
  30         return None
  31
  32     # No duplicates!
  33     assert len(fc_dir) <= 1
  34
  35     # Found fc dir
  36     fc_dir = fc_dir[0]
  37
  38     ################################
  39     # C#-## dirs
  40     c_dir_list = glob.glob(os.path.join(fc_dir, 'C*'))
  41
  42     # Not found
  43     if len(c_dir_list) == 0:
  44         return d
  45
  46     for c_dir_path in c_dir_list:
  47         summary_file = glob.glob(os.path.join(c_dir_path, 'Summary.htm'))
  48         pathdir, c_dir = os.path.split(c_dir_path)
  49
  50         # Create sub-dictionary
  51         d[c_dir] = {}
  52
  53
  54         ###############################
  55         # Summary.htm file
  56
  57         # Not found
  58         if len(summary_file) == 0:
  59             d[c_dir]['summary'] = None
  60
  61         # Found
  62         else:
  63             # No duplicates!
  64             assert len(summary_file) == 1
  65
  66             summary_file = summary_file[0]
  67             d[c_dir]['summary'] = summary_file
  68
  69         ###############################
  70         # Result files
  71
  72         d[c_dir]['eland_results'] = {}
  73
  74         result_filepaths = glob.glob(os.path.join(c_dir_path, 's_*_eland_*'))
  75
  76         for filepath in result_filepaths:
  77
  78             junk, result_name = os.path.split(filepath)
  79
  80             #lanes 1-8, single digit, therefore s_#; # == index 2
  81             lane = int(result_name[2])
  82             d[c_dir]['eland_results'][lane] = filepath
  83
  84         ###############################
  85         # run*.xml file
  86         run_xml_filepath = glob.glob(os.path.join(c_dir_path, 'run_*.xml'))
  87
  88         if len(run_xml_filepath) == 0:
  89             d[c_dir]['run_xml'] = None
  90         else:
  91             # No duplicates
  92             assert len(run_xml_filepath) == 1
  93
  94             d[c_dir]['run_xml'] = run_xml_filepath[0]
  95
  96         ###############################
  97         # scores.tar.gz
  98         scores_filepath = glob.glob(os.path.join(c_dir_path, 'scores*'))
  99
 100         if len(scores_filepath) == 0:
 101             d[c_dir]['scores'] = None
 102         else:
 103             # No duplicates
 104             assert len(scores_filepath) == 1
 105
 106             d[c_dir]['scores'] = scores_filepath[0]
 107
 108     return d
 109
 110
 111 def cn_mTobp(cn_m):
 112     """
 113     Converts CN-M (i.e. C1-33, C1-26, C4-28) cycle information into
 114     number of base pairs.
 115     """
 116     pass
 117
 118
 119 def parse_flowcell_id(flowcell_id):
 120     """
 121     Return flowcell id and any status encoded in the id
 122
 123     We stored the status information in the flowcell id name.
 124     this was dumb, but database schemas are hard to update.
 125     """
 126     fields = flowcell_id.split()
 127     fcid = None
 128     status = None
 129     if len(fields) > 0:
 130         fcid = fields[0]
 131     if len(fields) > 1:
 132         status = fields[1]
 133     return fcid, status
 134