Fix ticket:145: this patch includes the tar.bz2 extension in the scores pattern.
[htsworkflow.git] htsworkflow/frontend/samples/results.py
from htsworkflow.frontend import settings

import glob
import os
import re

# Matches the leading run of word characters in a string.
s_paren = re.compile(r"^\w+")

def get_flowcell_result_dict(flowcell_id):
    """
    Returns a dictionary with the following structure for a given
    flowcell_id:

    d['C1-33']['summary']           # Summary.htm file path
    d['C1-33']['eland_results'][5]  # C1-33 lane 5 eland results file path
    d['C1-33']['run_xml']           # run_*.xml file path
    d['C1-33']['scores']            # scores.tar.gz file path
    """
    flowcell_id = flowcell_id.strip()

    d = {}

    ################################
    # Flowcell Directory
    fc_dir = glob.glob(os.path.join(settings.RESULT_HOME_DIR, flowcell_id))

    # Not found
    if len(fc_dir) == 0:
        return None

    # No duplicates!
    assert len(fc_dir) <= 1

    # Found fc dir
    fc_dir = fc_dir[0]

    ################################
    # C#-## dirs
    c_dir_list = glob.glob(os.path.join(fc_dir, 'C*'))

    # Not found
    if len(c_dir_list) == 0:
        return d
    for c_dir_path in c_dir_list:
        summary_file = glob.glob(os.path.join(c_dir_path, 'Summary.htm'))
        pathdir, c_dir = os.path.split(c_dir_path)

        # Create sub-dictionary
        d[c_dir] = {}

        ###############################
        # Summary.htm file

        # Not found
        if len(summary_file) == 0:
            d[c_dir]['summary'] = None

        # Found
        else:
            # No duplicates!
            assert len(summary_file) == 1

            summary_file = summary_file[0]
            d[c_dir]['summary'] = summary_file

        ###############################
        # Result files

        d[c_dir]['eland_results'] = {}
        result_filepaths = glob.glob(os.path.join(c_dir_path, 's_*_eland_*'))

        for filepath in result_filepaths:

            junk, result_name = os.path.split(filepath)

            # Lanes are 1-8 (a single digit), so in an 's_#_eland_*' filename
            # the lane number is the character at index 2.
            lane = int(result_name[2])
            d[c_dir]['eland_results'][lane] = filepath

        ###############################
        # run*.xml file
        run_xml_filepath = glob.glob(os.path.join(c_dir_path, 'run_*.xml'))

        if len(run_xml_filepath) == 0:
            d[c_dir]['run_xml'] = None
        else:
            # No duplicates
            assert len(run_xml_filepath) == 1

            d[c_dir]['run_xml'] = run_xml_filepath[0]
        ###############################
        # scores.tar.gz
        # Restrict the match to compressed archives so that any *.md5
        # checksum files alongside them are not picked up by mistake.
        scores_filepath = []
        for pattern in ['scores*.tar.bz2', 'scores*.tar.gz', 'scores*.tgz']:
            scores_filepath += glob.glob(os.path.join(c_dir_path, pattern))

        if len(scores_filepath) == 0:
            d[c_dir]['scores'] = None
        else:
            # No duplicates
            assert len(scores_filepath) == 1

            d[c_dir]['scores'] = scores_filepath[0]

    return d

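# Usage sketch (illustrative only, not part of the original module): the
# flowcell id '42JTT' and the 'C1-33' key below are hypothetical example values.
#
#     results = get_flowcell_result_dict('42JTT')
#     if results is not None:
#         for c_dir, info in results.items():
#             summary = info['summary']              # Summary.htm path or None
#             lane_5 = info['eland_results'].get(5)  # lane 5 eland file, if present
#             scores = info['scores']                # scores tarball path or None
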
def cn_mTobp(cn_m):
    """
    Converts CN-M (e.g. C1-33, C1-26, C4-28) cycle information into
    a number of base pairs.
    """
    pass

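# Possible implementation sketch (assumption: 'CN-M' names a cycle range running
# from cycle N to cycle M, so the read length would be M - N + 1 base pairs,
# e.g. 'C1-33' -> 33 bp).  Not part of the original module.
#
#     def _cn_m_to_bp(cn_m):
#         match = re.match(r"^C(\d+)-(\d+)$", cn_m)
#         if match is None:
#             return None
#         start, end = int(match.group(1)), int(match.group(2))
#         return end - start + 1
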
def parse_flowcell_id(flowcell_id):
    """
    Return the flowcell id and any status encoded in the id.

    We stored the status information in the flowcell id name.
    This was dumb, but database schemas are hard to update.
    """
    fields = flowcell_id.split()
    fcid = None
    status = None
    if len(fields) > 0:
        fcid = fields[0]
    if len(fields) > 1:
        status = fields[1]
    return fcid, status
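
# Usage sketch (the id and status values below are hypothetical examples):
#
#     parse_flowcell_id('42JTT')           # -> ('42JTT', None)
#     parse_flowcell_id('42JTT (failed)')  # -> ('42JTT', '(failed)')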