7650a9468e96df9486ec98e3e0721d90a46ff3de
[htsworkflow.git] / extra / ucsc_encode_submission / ucsc_gather.py
#!/usr/bin/env python
from ConfigParser import SafeConfigParser
import fnmatch
import getpass
from glob import glob
import json
import logging
import netrc
from optparse import OptionParser
import os
from pprint import pprint, pformat
import shlex
from StringIO import StringIO
import stat
from subprocess import Popen, PIPE
import sys
import time
import types
import urllib
import urllib2
import urlparse

from htsworkflow.util import api
from htsworkflow.pipelines.sequences import \
    create_sequence_table, \
    scan_for_sequences
from htsworkflow.pipelines import qseq2fastq
from htsworkflow.pipelines import srf2fastq
28
def main(cmdline=None):
    """Entry point: parse options and run the requested submission steps.

    cmdline: optional argument list; None lets OptionParser read sys.argv.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)
    
    # Pick the logging level from the command line flags.
    if opts.debug:
        logging.basicConfig(level = logging.DEBUG )
    elif opts.verbose:
        logging.basicConfig(level = logging.INFO )
    else:
        logging.basicConfig(level = logging.WARNING )        
    
    # Credentials posted with every htsworkflow API request.
    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey }

    if opts.host is None or opts.apiid is None or opts.apikey is None:
        parser.error("Please specify host url, apiid, apikey")

    if opts.makeddf and opts.daf is None:
        parser.error("Please specify your daf when making ddf files")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")
        
    # Each input file maps library ids to per-library result directories.
    library_result_map = []
    for a in args:
        library_result_map.extend(read_library_result_map(a))

    if opts.make_tree_from is not None:
        make_tree_from(opts.make_tree_from, library_result_map)
            
    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host, 
                     apidata, 
                     opts.sequence, 
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
73
74
def make_parser():
    """Build the option parser, seeding defaults from the config files.

    Reads ~/.htsworkflow.ini and /etc/htsworkflow.ini for default values
    of the sequence archive location and API credentials.

    Returns an optparse.OptionParser.
    """
    # Load defaults from the config files
    config = SafeConfigParser()
    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])
    
    sequence_archive = None
    apiid = None
    apikey = None
    apihost = None
    SECTION = 'sequence_archive'
    if config.has_section(SECTION):
        # ConfigParser.get()'s third positional argument is the 'raw'
        # flag, not a default value; the old code passed the fallback
        # there, so a missing option raised NoOptionError instead of
        # returning the fallback. Guard each option explicitly.
        if config.has_option(SECTION, 'sequence_archive'):
            sequence_archive = config.get(SECTION, 'sequence_archive')
            sequence_archive = os.path.expanduser(sequence_archive)
        if config.has_option(SECTION, 'apiid'):
            apiid = config.get(SECTION, 'apiid')
        if config.has_option(SECTION, 'apikey'):
            apikey = config.get(SECTION, 'apikey')
        if config.has_option(SECTION, 'host'):
            apihost = config.get(SECTION, 'host')

    parser = OptionParser()

    # commands
    parser.add_option('--make-tree-from',
                      help="create directories & link data files",
                      default=None)
    parser.add_option('--fastq', help="generate scripts for making fastq files",
                      default=False, action="store_true")

    parser.add_option('--ini', help="generate submission ini file", default=False,
                      action="store_true")

    parser.add_option('--makeddf', help='make the ddfs', default=False,
                      action="store_true")
    
    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # configuration options
    parser.add_option('--apiid', default=apiid, help="Specify API ID")
    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
    parser.add_option('--host',  default=apihost,
                      help="specify HTSWorkflow host",)
    parser.add_option('--sequence', default=sequence_archive,
                      help="sequence repository")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    return parser
126
127
def make_tree_from(source_path, library_result_map):
    """Create a tree using data files from source path.

    For each (library_id, result_dir) pair, create result_dir (relative
    to the current directory) when needed and symlink in every file
    found under source_path/result_dir.

    Raises IOError when the source directory or a source file is missing.
    """
    for lib_id, lib_path in library_result_map:
        if not os.path.exists(lib_path):
            logging.info("Making dir {0}".format(lib_path))
            os.mkdir(lib_path)
        source_lib_dir = os.path.join(source_path, lib_path)
        # Fail with a clear message instead of letting os.listdir below
        # raise on a missing directory (the old check was a no-op 'pass').
        if not os.path.exists(source_lib_dir):
            raise IOError("{0} does not exist".format(source_lib_dir))
        for filename in os.listdir(source_lib_dir):
            source_pathname = os.path.join(source_lib_dir, filename)
            target_pathname = os.path.join(lib_path, filename)
            if not os.path.exists(source_pathname):
                raise IOError("{0} does not exist".format(source_pathname))
            if not os.path.exists(target_pathname):
                os.symlink(source_pathname, target_pathname)
                logging.info(
                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
147     
148 def build_fastqs(host, apidata, sequences_path, library_result_map, 
149                  force=False ):
150     """
151     Generate condor scripts to build any needed fastq files
152     
153     Args:
154       host (str): root of the htsworkflow api server
155       apidata (dict): id & key to post to the server
156       sequences_path (str): root of the directory tree to scan for files
157       library_result_map (list):  [(library_id, destination directory), ...]
158     """
159     qseq_condor_header = """
160 Universe=vanilla
161 executable=%(exe)s
162 error=log/qseq2fastq.err.$(process).log
163 output=log/qseq2fastq.out.$(process).log
164 log=log/qseq2fastq.log
165
166 """ % {'exe': sys.executable }
167     qseq_condor_entries = []
168     srf_condor_header = """
169 Universe=vanilla
170 executable=%(exe)s
171 output=log/srf_pair_fastq.out.$(process).log
172 error=log/srf_pair_fastq.err.$(process).log
173 log=log/srf_pair_fastq.log
174 environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"
175
176 """ % {'exe': sys.executable }
177     srf_condor_entries = []
178     lib_db = find_archive_sequence_files(host, 
179                                          apidata, 
180                                          sequences_path, 
181                                          library_result_map)
182
183     needed_targets = find_missing_targets(library_result_map, lib_db, force)
184
185     for target_pathname, available_sources in needed_targets.items():
186         logging.debug(' target : %s' % (target_pathname,))
187         logging.debug(' candidate sources: %s' % (available_sources,))
188         if available_sources.has_key('qseq'):
189             source = available_sources['qseq']
190             qseq_condor_entries.append(
191                 condor_qseq_to_fastq(source.path, 
192                                      target_pathname, 
193                                      source.flowcell,
194                                      force=force)
195             )
196         elif available_sources.has_key('srf'):
197             source = available_sources['srf']
198             mid = getattr(source, 'mid_point', None)
199             srf_condor_entries.append(
200                 condor_srf_to_fastq(source.path, 
201                                     target_pathname,
202                                     source.paired,
203                                     source.flowcell,
204                                     mid,
205                                     force=force)
206             )
207         else:
208             print " need file", target_pathname
209
210     if len(srf_condor_entries) > 0:
211         make_submit_script('srf.fastq.condor', 
212                            srf_condor_header,
213                            srf_condor_entries)
214
215     if len(qseq_condor_entries) > 0:
216         make_submit_script('qseq.fastq.condor', 
217                            qseq_condor_header,
218                            qseq_condor_entries)
219
220
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Check if the sequence file exists.
    This requires computing what the sequence name is and checking
    to see if it can be found in the sequence location.

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']

    Returns a dict mapping target fastq pathname ->
    {filetype: sequence object} for every target that does not exist
    (or for every target when force is True).
    """
    fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    # find what targets we're missing
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lib = lib_db[lib_id]
        lane_dict = make_lane_dict(lib_db, lib_id)
        
        for lane_key, sequences in lib['lanes'].items():
            for seq in sequences:
                # Side effect: record paired-end status on the sequence
                # object itself; later stages rely on it.
                seq.paired = lane_dict[seq.flowcell]['paired_end']
                lane_status = lane_dict[seq.flowcell]['status']

                # Paired runs default to read 1 when the read is unknown.
                if seq.paired and seq.read is None:
                    seq.read = 1
                filename_attributes = { 
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle
                    }
                # skip bad runs
                if lane_status == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                if seq.paired:
                    target_name = fastq_paired_template % filename_attributes
                else:
                    target_name = fastq_single_template % filename_attributes

                target_pathname = os.path.join(result_dir, target_name)
                if force or not os.path.exists(target_pathname):
                    # Index candidate sources by filetype (e.g. 'qseq',
                    # 'srf') so the caller can pick its preferred source.
                    t = needed_targets.setdefault(target_pathname, {})
                    t[seq.filetype] = seq

    return needed_targets
271
272
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into each library result directory.

    daf_path: path to the daf file to link.
    library_result_map: [(library_id, result_dir), ...]

    Raises RuntimeError when the daf or a result directory is missing.
    """
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)
    
    for lib_id, result_dir in library_result_map:
        if not os.path.exists(result_dir):
            raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
        submission_daf = os.path.join(result_dir, base_daf)
        # daf_path was already verified above; only skip libraries that
        # already have the link in place.
        if not os.path.exists(submission_daf):
            os.link(daf_path, submission_daf)
287
288
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a <result_dir>.ini submission description for each library.

    Scans each result directory, classifies every file through
    NameToViewMap, and writes one ini section per submitted file.
    Fastqs sharing an extension pattern are grouped into one section.

    Raises ValueError for files no pattern recognizes.
    """
    view_map = NameToViewMap(host, apidata)

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
                    'readType', 'mapAlgorithm', 'insertLength', 'md5sum' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write other lines
        submission_files = os.listdir(result_dir)
        fastqs = {}
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))
            attributes['md5sum'] = "None"
            
            ext = attributes["extension"]
            if attributes['view'] is None:                   
                continue               
            elif attributes.get("type", None) == 'fastq':
                # Collect fastqs per extension pattern; they are emitted
                # as grouped sections after this loop.
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                md5sum = make_md5sum(os.path.join(result_dir,f))
                if md5sum is not None:
                    attributes['md5sum']=md5sum
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line.
        for extension, fastq_files in fastqs.items():
            # sort for deterministic output; fastq_files is a set.
            inifile.extend(
                make_submission_section(line_counter, 
                                        sorted(fastq_files),
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1
            
        # Close the stream explicitly; the original leaked the handle.
        stream = open(result_ini,'w')
        try:
            stream.write(os.linesep.join(inifile))
        finally:
            stream.close()
345
346         
def make_lane_dict(lib_db, lib_id):
    """
    Convert the lane_set in a lib_db to a dictionary
    indexed by flowcell ID
    """
    lane_set = lib_db[lib_id]['lane_set']
    return dict((lane['flowcell'], lane) for lane in lane_set)
356
357
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Generate a ddf for each library, plus a dagman driving the uploads.

    Only libraries whose result directory contains a previously generated
    <result_dir>.ini are processed. When make_condor is set, the collected
    dag fragments are written to 'submission.dagman' in the current
    directory, unless that file already exists (force overwrites it).
    """
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        ininame = result_dir+'.ini'
        inipathname = os.path.join(result_dir, ininame)
        if os.path.exists(inipathname):
            dag_fragment.extend(
                make_ddf(ininame, daf_name, True, make_condor, result_dir)
            )

    if make_condor and len(dag_fragment) > 0:
        dag_filename = 'submission.dagman'
        # Refuse to clobber an existing dagman unless forced.
        if not force and os.path.exists(dag_filename):
            logging.warn("%s exists, please delete" % (dag_filename,))
        else:
            f = open(dag_filename,'w')
            f.write( os.linesep.join(dag_fragment))
            f.write( os.linesep )
            f.close()
377             
378
379 def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
380     """
381     Make ddf files, and bonus condor file
382     """
383     dag_fragments = []
384     curdir = os.getcwd()
385     if outdir is not None:
386         os.chdir(outdir)
387     output = sys.stdout
388     ddf_name = None
389     if guess_ddf:
390         ddf_name = make_ddf_name(ininame)
391         print ddf_name
392         output = open(ddf_name,'w')
393
394     file_list = read_ddf_ini(ininame, output)
395     logging.info(
396         "Read config {0}, found files: {1}".format(
397             ininame, ", ".join(file_list)))
398
399     file_list.append(daf_name)
400     if ddf_name is not None:
401         file_list.append(ddf_name)
402
403     if make_condor:
404         archive_condor = make_condor_archive_script(ininame, file_list)
405         upload_condor = make_condor_upload_script(ininame)
406         
407         dag_fragments.extend( 
408             make_dag_fragment(ininame, archive_condor, upload_condor)
409         ) 
410         
411     os.chdir(curdir)
412     
413     return dag_fragments
414
415
def read_ddf_ini(filename, output=None):
    """
    Read a ini file and dump out a tab delimited text file

    filename: submission ini file to read.
    output: writable stream; defaults to sys.stdout, resolved at call
      time (the old default bound sys.stdout at import, so later
      redirection was ignored).

    Returns the list of filenames collected from 'files' options.
    """
    if output is None:
        output = sys.stdout
    file_list = []
    config = SafeConfigParser()
    config.read(filename)

    # The [config] section's order_by option lists the ddf columns in order.
    order_by = shlex.split(config.get("config", "order_by"))

    output.write("\t".join(order_by))
    output.write(os.linesep)
    sections = config.sections()
    sections.sort()
    for section in sections:
        if section == "config":
            # skip the config block
            continue
        values = []
        for key in order_by:
            v = config.get(section, key)
            values.append(v)
            if key == 'files':
                file_list.extend(parse_filelist(v))
                
        output.write("\t".join(values))
        output.write(os.linesep)
    return file_list
444
445
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames. 
    
    For example:
      10000 result/foo/bar

    Blank lines and lines starting with '#' are ignored.
    Returns a list of (library_id, result_dir) tuples.
    """
    results = []
    stream = open(filename,'r')
    # try/finally so the handle is released even on malformed lines
    # (the original never closed the file).
    try:
        for line in stream:
            line = line.rstrip()
            if not line.startswith('#') and len(line) > 0 :
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    finally:
        stream.close()
    return results
463
464
def make_condor_archive_script(ininame, files):
    """Write a condor submit file that tars up one submission's files.

    ininame: submission ini name, used to derive the archive and script
      names.
    files: filenames (relative to the current directory) to archive.

    Raises RuntimeError if any listed file is missing.
    Returns the name of the generated condor submit file.
    """
    script = """Universe = vanilla

Executable = /bin/tar
arguments = czvhf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s
environment="GZIP=-3"
request_memory = 20

queue 
"""
    for f in files:
        if not os.path.exists(f):
            raise RuntimeError("Missing %s" % (f,))

    # getpass.getuser() works without a controlling terminal;
    # os.getlogin() raises OSError under cron/condor.
    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               'user': getpass.getuser()}

    condor_script = make_condor_name(ininame, 'archive')
    condor_stream = open(condor_script,'w')
    try:
        condor_stream.write(script % context)
    finally:
        condor_stream.close()
    return condor_script
494
495
def make_condor_upload_script(ininame):
    """Write a condor submit file that ftps one submission archive to UCSC.

    ininame: submission ini name, used to derive the archive and script
      names.

    Credentials for encodeftp.cse.ucsc.edu are read from ~diane/.netrc.
    Returns the name of the generated condor submit file.
    """
    script = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    # NOTE(review): the netrc path is hard-coded to user diane's home
    # directory; confirm whether this should be ~/.netrc instead.
    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
    
    encodeftp = 'encodeftp.cse.ucsc.edu'
    ftpuser = auth.hosts[encodeftp][0]
    ftppassword = auth.hosts[encodeftp][2]
    # getpass.getuser() works without a controlling terminal;
    # os.getlogin() raises OSError under cron/condor.
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               'user': getpass.getuser(),
               'ftpuser': ftpuser,
               'ftppassword': ftppassword,
               'ftphost': encodeftp}

    condor_script = make_condor_name(ininame, 'upload')
    condor_stream = open(condor_script,'w')
    try:
        condor_stream.write(script % context)
    finally:
        condor_stream.close()
    # The generated file embeds the ftp password; make it owner r/w only.
    os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)

    return condor_script
528
529
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of fragments compress and then upload the data.
    """
    base_dir = os.getcwd()
    job_basename = make_base_name(ininame)
    archive_path = os.path.join(base_dir, archive_condor)
    upload_path = os.path.join(base_dir, upload_condor)

    # archive must finish before the upload job may start.
    return [
        'JOB %s_archive %s' % (job_basename, archive_path),
        'JOB %s_upload %s' % (job_basename, upload_path),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
545
546
def get_library_info(host, apidata, library_id):
    """Fetch one library's metadata from the htsworkflow API server.

    Returns whatever api.retrieve_info yields for the library endpoint;
    callers treat it as a dict with keys like 'lane_set' and 'cell_line'
    -- confirm against htsworkflow.util.api.
    """
    url = api.library_url(host, library_id)
    contents = api.retrieve_info(url, apidata)
    return contents
551
552
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Build one condor queue entry that runs srf2fastq on srf_file."""
    args = [srf2fastq.__file__, srf_file]
    if paired:
        args += ['--left', target_pathname]
        # this is ugly. I did it because I was pregenerating the target
        # names before I tried to figure out what sources could generate
        # those targets, and everything up to this point had been
        # one-to-one. So I couldn't figure out how to pair the 
        # target names. 
        # With this at least the command will run correctly.
        # however if we rename the default targets, this'll break
        # also I think it'll generate it twice.
        args += ['--right', target_pathname.replace('_r1.fastq', '_r2.fastq')]
    else:
        args += ['--single', target_pathname]
    if flowcell is not None:
        args += ['--flowcell', flowcell]
    if mid is not None:
        args += ['-m', str(mid)]
    if force:
        args += ['--force']

    return """
arguments="%s"
queue
""" % (" ".join(args),)
586
587
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Build one condor queue entry that runs qseq2fastq on qseq_file."""
    args = [qseq2fastq.__file__, '-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args += ['-f', flowcell]

    return """
arguments="%s"
queue
""" % (" ".join(args))
599
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    Queries the server for each library's lane_set, scans the matching
    flowcell directories under sequences_path, and attaches the found
    sequence objects to lib_db[lib_id]['lanes'].

    Returns lib_db: library_id -> library info dict whose 'lanes' key
    maps (flowcell, lane) -> set of sequence objects.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        for lane in lib_info['lane_set']:
            lane_key = (lane['flowcell'], lane['lane_number'])
            candidate_lanes[lane_key] = lib_id
            seq_dirs.add(os.path.join(sequences_path, 
                                         'flowcells', 
                                         lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
    candidate_seq_list = scan_for_sequences(seq_dirs)

    # at this point we have too many sequences as scan_for_sequences
    # returns all the sequences in a flowcell directory
    # so lets filter out the extras
    
    for seq in candidate_seq_list:
        lane_key = (seq.flowcell, seq.lane)
        # only keep sequences from lanes our libraries actually ran on
        lib_id = candidate_lanes.get(lane_key, None)
        if lib_id is not None:
            lib_info = lib_db[lib_id]
            lib_info['lanes'].setdefault(lane_key, set()).add(seq)
    
    return lib_db
638
639
class NameToViewMap(object):
    """Determine view attributes for a given submission file name
    """
    def __init__(self, root_url, apidata):
        """root_url: htsworkflow API server root; apidata: credential dict."""
        self.root_url = root_url
        self.apidata = apidata
        
        # per-library caches of API metadata and paired-end decisions
        self.lib_cache = {}
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # Ordered (glob pattern, view) pairs; the first match wins.
        # A view of None means "recognized but not submitted".
        self.patterns = [
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('junctions.bed',           'Junctions'),
            ('*.jnct',                  'Junctions'),
            ('*unique.bigwig',         None),
            ('*plus.bigwig',           'PlusSignal'),
            ('*minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),
            ('*.?ufflinks-0.9.0?genes.expr',       'GeneDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.expr', 'TranscriptDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.gtf',  'GeneModel'),
            ('*.GENCODE-v3c?genes.expr',       'GeneGCV3c'),
            ('*.GENCODE-v3c?transcript*.expr', 'TranscriptGCV3c'),
            ('*.GENCODE-v3c?transcript*.gtf',  'TranscriptGencV3c'),
            ('*.GENCODE-v4?genes.expr',        None), #'GeneGCV4'),
            ('*.GENCODE-v4?transcript*.expr',  None), #'TranscriptGCV4'),
            ('*.GENCODE-v4?transcript*.gtf',   None), #'TranscriptGencV4'),
            ('*_1.75mers.fastq',              'FastqRd1'),
            ('*_2.75mers.fastq',              'FastqRd2'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('*.md5',                   None),
            ('paired-end-distribution*', 'InsLength'),
            ('*.stats.txt',              'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ]

        # Attributes contributed by each view.
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Aligns": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "GeneGCV3c": {"MapAlgorithm": ma},
            "TranscriptGCV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGCV4": {"MapAlgorithm": ma},
            "TranscriptGCV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v
            
    def find_attributes(self, pathname, lib_id):
        """Looking for the best extension
        The 'best' is the longest match
        
        :Args:
        pathname (str): the filename to classify against self.patterns
        lib_id (str): library id used to look up cell/replicate info

        Returns an attribute dict for recognized files, or None when
        no pattern matches (callers treat None as "unrecognized file").
        """
        if lib_id not in self.lib_cache:
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warn("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)
        
        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))
            
        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                # A view may be a callable that chooses the name from
                # library attributes (e.g. *.bam -> Paired or Aligns).
                if callable(view):
                    view = view(is_paired=is_paired)
                    
                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes
        return None


    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes
        """
        if is_paired:
            return "Paired"
        else:
            return "Aligns"


    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end"""
        # treat an empty lane set as single ended
        if len(lib_info["lane_set"]) == 0:
            return False

        if lib_id not in self.lib_paired:
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison 
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1
                    
            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            # majority vote of the non-failed lanes decides paired status
            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")
            
        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Attributes for paired-end libraries; insert_size defaults to 200."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warn(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        # NOTE(review): read length 2x75 is hard-coded; confirm for new runs.
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Attributes for single-ended libraries."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
813
def make_submission_section(line_counter, files, attributes):
    """
    Create a section in the submission ini file
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files))]
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
824
825
def make_base_name(pathname):
    """Return the filename of pathname with its last extension removed."""
    filename = os.path.basename(pathname)
    return os.path.splitext(filename)[0]


def make_submission_name(ininame):
    """Name of the tar archive derived from an ini file name."""
    return make_base_name(ininame) + ".tgz"


def make_ddf_name(pathname):
    """Name of the ddf file derived from an ini file name."""
    return make_base_name(pathname) + ".ddf"


def make_condor_name(pathname, run_type=None):
    """Name of a condor submit file, optionally tagged with a run type."""
    parts = [make_base_name(pathname)]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
849
850
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream): 
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str):
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # isinstance also accepts str/unicode subclasses, which the old
    # 'type(target) in types.StringTypes' test rejected.
    opened_here = isinstance(target, types.StringTypes)
    if opened_here:
        f = open(target,"w")
    else:
        f = target
    try:
        f.write(header)
        for entry in body_list:
            f.write(entry)
    finally:
        # only close streams we opened; caller-owned streams stay open
        if opened_here:
            f.close()
876
def parse_filelist(file_string):
    """Split a comma separated 'files' option value into a list."""
    return file_string.split(",")
879
880
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    for pathname in files:
        if os.path.exists(pathname):
            continue
        raise RuntimeError("%s does not exist" % (pathname,))
888
def make_md5sum(filename):
    """Quickly find the md5sum of a file

    Reuses the cached sum stored in filename + '.md5' when present;
    otherwise runs md5sum and caches its output there.
    Returns the checksum string, or None if it could not be computed.
    """
    # filename + '.md5' directly; the old single-argument os.path.join
    # was a no-op, and a leftover debug print spammed stdout.
    md5_cache = filename + ".md5"
    if os.path.exists(md5_cache):
        logging.debug("Found md5sum in {0}".format(md5_cache))
        stream = open(md5_cache,'r')
        try:
            lines = stream.readlines()
        finally:
            stream.close()
        md5sum = parse_md5sum_line(lines, filename)
    else:
        md5sum = make_md5sum_unix(filename, md5_cache)
    return md5sum
902     
def make_md5sum_unix(filename, md5_cache):
    """Run the md5sum utility on filename and cache its raw output.

    Writes md5sum's output to md5_cache on success.
    Returns the checksum string, or None when md5sum fails or its
    output names a different file.
    """
    cmd = ["md5sum", filename]
    logging.debug("Running {0}".format(" ".join(cmd)))
    p = Popen(cmd, stdout=PIPE)
    # communicate() returns (stdout, stderr); the old names
    # 'stdin, stdout' were misleading.
    stdout_data, _ = p.communicate()
    retcode = p.wait()
    logging.debug("Finished {0} retcode {1}".format(" ".join(cmd), retcode))
    if retcode != 0:
        logging.error("Trouble with md5sum for {0}".format(filename))
        return None
    lines = stdout_data.split(os.linesep)
    md5sum = parse_md5sum_line(lines, filename)
    if md5sum is not None:
        logging.debug("Caching sum in {0}".format(md5_cache))
        stream = open(md5_cache, "w")
        try:
            stream.write(stdout_data)
        finally:
            stream.close()
    return md5sum
921
def parse_md5sum_line(lines, filename):
    """Extract the checksum from md5sum-style output for filename.

    lines: list of output lines from md5sum (or a cached .md5 file).
    Returns the checksum string, or None when the output is empty,
    malformed, or names a different file.
    """
    # Guard against empty or truncated cache files instead of crashing
    # with IndexError/ValueError.
    if not lines:
        logging.error("Empty md5sum output for {0}".format(filename))
        return None
    fields = lines[0].split()
    if len(fields) != 2:
        logging.error("Unparseable md5sum line: {0!r}".format(lines[0]))
        return None
    md5sum, md5sum_filename = fields
    if md5sum_filename != filename:
        errmsg = "MD5sum and I disagree about filename. {0} != {1}"
        logging.error(errmsg.format(filename, md5sum_filename))
        return None
    return md5sum
929
# Script entry point: run main() against sys.argv when executed directly.
if __name__ == "__main__":
    main()