# ucsc_gather.py -- gather and package files for a UCSC ENCODE submission.
# History: view names updated as per Cricket for the Jan 21st encode submission.
# Origin: htsworkflow.git / extra/ucsc_encode_submission/ucsc_gather.py
1 #!/usr/bin/env python
from ConfigParser import SafeConfigParser
import fnmatch
import getpass
from glob import glob
import json
import logging
from optparse import OptionParser
import os
from pprint import pprint, pformat
import shlex
from StringIO import StringIO
import sys
import time
import types
import urllib
import urllib2
import urlparse

from htsworkflow.util import api
from htsworkflow.pipelines.sequences import \
    create_sequence_table, \
    scan_for_sequences
from htsworkflow.pipelines import qseq2fastq
from htsworkflow.pipelines import srf2fastq
25
def main(cmdline=None):
    """Command line entry point for gathering UCSC ENCODE submission files.

    :Args:
      cmdline (list of str or None): arguments for OptionParser;
        None means parse sys.argv[1:].

    Each positional argument is a file mapping library ids to result
    directories (see read_library_result_map).  The --make-tree-from,
    --daf, --fastq, --ini and --makeddf options each trigger one stage
    of the submission pipeline, in that order.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # configure logging before anything else emits messages
    if opts.debug:
        logging.basicConfig(level = logging.DEBUG )
    elif opts.verbose:
        logging.basicConfig(level = logging.INFO )
    else:
        logging.basicConfig(level = logging.WARNING )

    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey }

    if opts.host is None or opts.apiid is None or opts.apikey is None:
        parser.error("Please specify host url, apiid, apikey")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")

    # merge all the input map files into one (library_id, result_dir) list
    library_result_map = []
    for a in args:
        library_result_map.extend(read_library_result_map(a))

    if opts.make_tree_from is not None:
        make_tree_from(opts.make_tree_from, library_result_map)

    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host, 
                     apidata, 
                     opts.sequence, 
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
67
68
def make_parser():
    """Build the OptionParser, seeding defaults from the htsworkflow config.

    Reads ~/.htsworkflow.ini and /etc/htsworkflow.ini for the
    [sequence_archive] section to provide defaults for --sequence,
    --apiid, --apikey and --host.

    Returns:
      OptionParser ready to parse the command line.
    """
    # Load defaults from the config files
    config = SafeConfigParser()
    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])

    sequence_archive = None
    apiid = None
    apikey = None
    apihost = None
    SECTION = 'sequence_archive'
    if config.has_section(SECTION):
        # Bug fix: ConfigParser.get()'s third positional argument is the
        # `raw` interpolation flag, not a default value, so the old calls
        # like config.get(SECTION, 'apiid', apiid) provided no fallback
        # and raised NoOptionError for any missing option.  Guard each
        # lookup with has_option() instead.
        if config.has_option(SECTION, 'sequence_archive'):
            sequence_archive = config.get(SECTION, 'sequence_archive')
            sequence_archive = os.path.expanduser(sequence_archive)
        if config.has_option(SECTION, 'apiid'):
            apiid = config.get(SECTION, 'apiid')
        if config.has_option(SECTION, 'apikey'):
            apikey = config.get(SECTION, 'apikey')
        if config.has_option(SECTION, 'host'):
            apihost = config.get(SECTION, 'host')

    parser = OptionParser()

    # commands
    parser.add_option('--make-tree-from',
                      help="create directories & link data files",
                      default=None)
    parser.add_option('--fastq', help="generate scripts for making fastq files",
                      default=False, action="store_true")

    parser.add_option('--ini', help="generate submission ini file", default=False,
                      action="store_true")

    parser.add_option('--makeddf', help='make the ddfs', default=False,
                      action="store_true")

    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # configuration options
    parser.add_option('--apiid', default=apiid, help="Specify API ID")
    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
    parser.add_option('--host',  default=apihost,
                      help="specify HTSWorkflow host",)
    parser.add_option('--sequence', default=sequence_archive,
                      help="sequence repository")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    return parser
120
121
122 def make_tree_from(source_path, library_result_map):
123     """Create a tree using data files from source path.
124     """
125     for lib_id, lib_path in library_result_map:
126         if not os.path.exists(lib_path):
127             logging.info("Making dir {0}".format(lib_path))
128             os.mkdir(lib_path)
129         source_lib_dir = os.path.join(source_path, lib_path)
130         if os.path.exists(source_lib_dir):
131             pass
132         for filename in os.listdir(source_lib_dir):
133             source_pathname = os.path.join(source_lib_dir, filename)
134             target_pathname = os.path.join(lib_path, filename)
135             if not os.path.exists(source_pathname):
136                 raise IOError("{0} does not exist".format(source_pathname))
137             if not os.path.exists(target_pathname):
138                 os.symlink(source_pathname, target_pathname)
139                 logging.info(
140                     'LINK {0} to {1}'.format(source_pathname, target_pathname))
141     
def build_fastqs(host, apidata, sequences_path, library_result_map, 
                 force=False ):
    """
    Generate condor scripts to build any needed fastq files
    
    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list):  [(library_id, destination directory), ...]
      force (bool): regenerate fastqs even if the target already exists
    """
    # submit-file headers for the two converter jobs; per-target "queue"
    # entries are appended below and written out via make_submit_script()
    qseq_condor_header = """
Universe=vanilla
executable=%(exe)s
error=log/qseq2fastq.err.$(process).log
output=log/qseq2fastq.out.$(process).log
log=log/qseq2fastq.log

""" % {'exe': sys.executable }
    qseq_condor_entries = []
    srf_condor_header = """
Universe=vanilla
executable=%(exe)s
output=log/srf_pair_fastq.out.$(process).log
error=log/srf_pair_fastq.err.$(process).log
log=log/srf_pair_fastq.log
environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"

""" % {'exe': sys.executable }
    srf_condor_entries = []
    lib_db = find_archive_sequence_files(host, 
                                         apidata, 
                                         sequences_path, 
                                         library_result_map)

    # map of target fastq pathname -> {filetype: source sequence}
    needed_targets = find_missing_targets(library_result_map, lib_db, force)

    for target_pathname, available_sources in needed_targets.items():
        logging.debug(' target : %s' % (target_pathname,))
        logging.debug(' candidate sources: %s' % (available_sources,))
        # qseq sources are preferred over srf when both are available
        if available_sources.has_key('qseq'):
            source = available_sources['qseq']
            qseq_condor_entries.append(
                condor_qseq_to_fastq(source.path, 
                                     target_pathname, 
                                     source.flowcell,
                                     force=force)
            )
        elif available_sources.has_key('srf'):
            source = available_sources['srf']
            # mid_point is only set for the special-cased 30DY0AAXX flowcell
            mid = getattr(source, 'mid_point', None)
            srf_condor_entries.append(
                condor_srf_to_fastq(source.path, 
                                    target_pathname,
                                    source.paired,
                                    source.flowcell,
                                    mid,
                                    force=force)
            )
        else:
            # no usable source found; report so a human can track it down
            print " need file", target_pathname

    if len(srf_condor_entries) > 0:
        make_submit_script('srf.fastq.condor', 
                           srf_condor_header,
                           srf_condor_entries)

    if len(qseq_condor_entries) > 0:
        make_submit_script('qseq.fastq.condor', 
                           qseq_condor_header,
                           qseq_condor_entries)
213
214
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Check if the sequence file exists.
    This requires computing what the sequence name is and checking
    to see if it can be found in the sequence location.

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']

    :Args:
      library_result_map (list): [(library_id, result_dir), ...]
      lib_db (dict): library info as built by find_archive_sequence_files
      force (bool): treat every target as missing even if it exists

    Returns a dict mapping target fastq pathname to a dict of
    {seq.filetype: seq} candidate sources.
    """
    fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    # find what targets we're missing
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lib = lib_db[lib_id]
        lane_dict = make_lane_dict(lib_db, lib_id)
        
        for lane_key, sequences in lib['lanes'].items():
            for seq in sequences:
                # annotate the sequence with flowcell-level information
                seq.paired = lane_dict[seq.flowcell]['paired_end']
                lane_status = lane_dict[seq.flowcell]['status']

                # paired-end sequences without an explicit read number
                # are assumed to be read 1
                if seq.paired and seq.read is None:
                    seq.read = 1
                filename_attributes = { 
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle
                    }
                # skip bad runs
                if lane_status == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                if seq.paired:
                    target_name = fastq_paired_template % filename_attributes
                else:
                    target_name = fastq_single_template % filename_attributes

                target_pathname = os.path.join(result_dir, target_name)
                if force or not os.path.exists(target_pathname):
                    t = needed_targets.setdefault(target_pathname, {})
                    t[seq.filetype] = seq

    return needed_targets
265
266
def link_daf(daf_path, library_result_map):
    """Hard-link the DAF file into every submission result directory.

    Existing links are left untouched.  Raises RuntimeError when
    daf_path itself does not exist.
    """
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    daf_filename = os.path.basename(daf_path)

    destinations = (os.path.join(result_dir, daf_filename)
                    for _lib_id, result_dir in library_result_map)
    for destination in destinations:
        if not os.path.exists(destination):
            os.link(daf_path, destination)
277
278
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a <result_dir>.ini submission description into each result dir.

    Every file in a result directory is classified via NameToViewMap;
    files whose view is None are skipped, fastqs with the same extension
    pattern are grouped onto one section, and everything else gets its
    own [lineN] section.

    Raises ValueError when a file in a result directory matches no
    known pattern.
    """
    #attributes = get_filename_attribute_map(paired)
    view_map = NameToViewMap(host, apidata)
    
    candidate_fastq_src = {}

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
                    'readType', 'mapAlgorithm', 'insertLength' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write other lines
        submission_files = os.listdir(result_dir)
        fastqs = {}
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))
            
            ext = attributes["extension"]
            if attributes['view'] is None:                   
                continue               
            elif attributes.get("type", None) == 'fastq':
                # collect fastqs per extension so they share one section
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line.
        for extension, fastq_files in fastqs.items():
            inifile.extend(
                make_submission_section(line_counter, 
                                        fastq_files,
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1
            
        f = open(result_ini,'w')
        f.write(os.linesep.join(inifile))
        # bug fix: the handle was previously never closed, so the ini
        # contents were not guaranteed to be flushed before later stages
        # (make_all_ddfs) read the file back.
        f.close()
331
332         
def make_lane_dict(lib_db, lib_id):
    """
    Convert the lane_set in a lib_db to a dictionary
    indexed by flowcell ID
    """
    return dict((lane['flowcell'], lane)
                for lane in lib_db[lib_id]['lane_set'])
342
343
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Generate a ddf for every submission directory that has an ini file.

    When make_condor is true, also collects the per-submission DAG
    fragments and writes them to 'submission.dagman' in the current
    directory (refusing to overwrite an existing one unless force).
    """
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        ininame = result_dir + '.ini'
        inipathname = os.path.join(result_dir, ininame)
        if not os.path.exists(inipathname):
            continue
        dag_fragment.extend(
            make_ddf(ininame, daf_name, True, make_condor, result_dir)
        )

    if make_condor and len(dag_fragment) > 0:
        dag_filename = 'submission.dagman'
        if os.path.exists(dag_filename) and not force:
            logging.warn("%s exists, please delete" % (dag_filename,))
        else:
            stream = open(dag_filename, 'w')
            stream.write(os.linesep.join(dag_fragment))
            stream.write(os.linesep)
            stream.close()
363             
364
def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
    """
    Make ddf files, and bonus condor file

    :Args:
      ininame (str): submission ini filename (relative to outdir)
      daf_name (str): daf filename to append to the archive file list
      guess_ddf (bool): derive a ddf filename from ininame and write to
        it instead of stdout
      make_condor (bool): also write archive/upload condor scripts and
        return the DAG fragment tying them together
      outdir (str or None): directory to chdir into while working

    Returns a list of DAGMan fragment lines (empty unless make_condor).
    """
    dag_fragments = []
    curdir = os.getcwd()
    # work inside the submission directory so relative paths in the ini
    # resolve; restored below.
    # NOTE(review): the chdir is not wrapped in try/finally, so an
    # exception leaves the process in outdir -- confirm callers tolerate this.
    if outdir is not None:
        os.chdir(outdir)
    output = sys.stdout
    ddf_name = None
    if guess_ddf:
        ddf_name = make_ddf_name(ininame)
        print ddf_name
        output = open(ddf_name,'w')

    file_list = read_ddf_ini(ininame, output)

    # the daf (and the ddf itself, if written) go into the archive too
    file_list.append(daf_name)
    if ddf_name is not None:
        file_list.append(ddf_name)

    if make_condor:
        archive_condor = make_condor_archive_script(ininame, file_list)
        upload_condor = make_condor_upload_script(ininame)
        
        dag_fragments.extend( 
            make_dag_fragment(ininame, archive_condor, upload_condor)
        ) 
        
    os.chdir(curdir)
    
    return dag_fragments
397
398
def read_ddf_ini(filename, output=sys.stdout):
    """
    Read a ini file and dump out a tab delmited text file

    :Args:
      filename (str): submission ini file to read
      output (stream): where the tab-delimited ddf rows are written

    The [config] section's 'order_by' option names the columns; every
    other section becomes one row, emitted in sorted section order.

    Returns the list of filenames found in the 'files' columns.
    """
    file_list = []
    config = SafeConfigParser()
    config.read(filename)

    order_by = shlex.split(config.get("config", "order_by"))

    # header row is the column names themselves
    output.write("\t".join(order_by))
    output.write(os.linesep)
    sections = config.sections()
    sections.sort()
    for section in sections:
        if section == "config":
            # skip the config block
            continue
        values = []
        for key in order_by:
            v = config.get(section, key)
            values.append(v)
            if key == 'files':
                # remember every file mentioned so the caller can archive them
                file_list.extend(parse_filelist(v))
                
        output.write("\t".join(values))
        output.write(os.linesep)
    return file_list
427
428
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames.
    Blank lines and lines starting with '#' are ignored.

    For example:
      10000 result/foo/bar

    Returns a list of (library_id, result_dir) tuples.
    """
    results = []
    # bug fix: use a with-block so the file handle is closed (the old
    # code opened the stream and never closed it)
    with open(filename, 'r') as stream:
        for line in stream:
            line = line.rstrip()
            if not line.startswith('#') and len(line) > 0:
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    return results
446
447
def make_condor_archive_script(ininame, files):
    """Write a condor submit script that tars `files` into ../<name>.tgz.

    Raises RuntimeError if any file in the list is missing.
    Returns the condor submit script's filename.
    """
    script = """Universe = vanilla

Executable = /bin/tar
arguments = czvhf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s
environment="GZIP=-3"
request_memory = 20

queue 
"""
    for f in files:
        if not os.path.exists(f):
            raise RuntimeError("Missing %s" % (f,))

    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               # bug fix: os.getlogin() raises OSError when there is no
               # controlling terminal (cron/condor jobs); getpass.getuser()
               # falls back to the environment and works everywhere.
               'user': getpass.getuser()}

    condor_script = make_condor_name(ininame, 'archive')
    condor_stream = open(condor_script,'w')
    condor_stream.write(script % context)
    condor_stream.close()
    return condor_script
477
478
def make_condor_upload_script(ininame):
    """Write a condor submit script that lftp-uploads ../<name>.tgz to UCSC.

    Returns the condor submit script's filename.
    """
    script = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://detrout@encodeftp.cse.ucsc.edu/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               # bug fix: os.getlogin() raises OSError when there is no
               # controlling terminal (cron/condor jobs); getpass.getuser()
               # falls back to the environment and works everywhere.
               'user': getpass.getuser()}

    condor_script = make_condor_name(ininame, 'upload')
    condor_stream = open(condor_script,'w')
    condor_stream.write(script % context)
    condor_stream.close()
    return condor_script
501
502
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of fragments compress and then upload the data.

    Returns DAGMan lines declaring the archive job, the upload job,
    and the parent/child edge between them.
    """
    cur_dir = os.getcwd()
    job_basename = make_base_name(ininame)
    archive_path = os.path.join(cur_dir, archive_condor)
    upload_path = os.path.join(cur_dir, upload_condor)

    return [
        'JOB %s_archive %s' % (job_basename, archive_path),
        'JOB %s_upload %s' % (job_basename, upload_path),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
518
519
def get_library_info(host, apidata, library_id):
    """Fetch one library's metadata dict from the htsworkflow API server."""
    return api.retrieve_info(api.library_url(host, library_id), apidata)
524
525
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Return a condor queue entry that converts one srf file to fastq.

    For paired data the right-read filename is derived by rewriting
    '_r1.fastq' to '_r2.fastq' in target_pathname.  (Historical note from
    the original author: target names were generated before sources were
    matched up, so there was no clean way to pair them; renaming works as
    long as the default target names stay the same, though the second
    read may end up generated twice.)
    """
    args = [srf2fastq.__file__, srf_file]
    if paired:
        args += ['--left', target_pathname,
                 '--right',
                 target_pathname.replace('_r1.fastq', '_r2.fastq')]
    else:
        args += ['--single', target_pathname]

    if flowcell is not None:
        args += ['--flowcell', flowcell]
    if mid is not None:
        args += ['-m', str(mid)]
    if force:
        args += ['--force']

    return """
arguments="%s"
queue
""" % (" ".join(args),)
559
560
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Return a condor queue entry converting one qseq file to fastq.

    NOTE(review): `force` is accepted for signature parity with
    condor_srf_to_fastq but no flag is passed through to qseq2fastq,
    matching the original behavior -- confirm whether that is intended.
    """
    args = [qseq2fastq.__file__, '-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args += ['-f', flowcell]

    return """
arguments="%s"
queue
""" % (" ".join(args))
572
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    :Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the sequence archive tree
      library_result_map (list): [(library_id, result_dir), ...]

    Returns a dict of library_id -> library info; each entry gains a
    'lanes' dict mapping (flowcell, lane) to the set of sequence files
    found for that lane.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        # remember which (flowcell, lane) pairs belong to which library,
        # and collect the flowcell directories to scan
        for lane in lib_info['lane_set']:
            lane_key = (lane['flowcell'], lane['lane_number'])
            candidate_lanes[lane_key] = lib_id
            seq_dirs.add(os.path.join(sequences_path, 
                                         'flowcells', 
                                         lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
    candidate_seq_list = scan_for_sequences(seq_dirs)

    # at this point we have too many sequences as scan_for_sequences
    # returns all the sequences in a flowcell directory
    # so lets filter out the extras
    
    for seq in candidate_seq_list:
        lane_key = (seq.flowcell, seq.lane)
        lib_id = candidate_lanes.get(lane_key, None)
        if lib_id is not None:
            lib_info = lib_db[lib_id]
            lib_info['lanes'].setdefault(lane_key, set()).add(seq)
    
    return lib_db
611
612
class NameToViewMap(object):
    """Determine view attributes for a given submission file name.

    Maps filenames to UCSC ENCODE "view" names via an ordered list of
    fnmatch patterns, and merges in per-library attributes (cell line,
    replicate, paired-end read info) fetched from the htsworkflow server.
    """
    def __init__(self, root_url, apidata):
        # root_url: htsworkflow api server root; apidata: id/key credentials
        self.root_url = root_url
        self.apidata = apidata
        
        # caches so each library's metadata is fetched at most once
        self.lib_cache = {}
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # (fnmatch pattern, view) pairs, checked in order -- more specific
        # patterns must come before the general ones (e.g. '*.splices.bam'
        # before '*.bam').  A view of None means "recognized but not
        # submitted"; a callable is invoked with is_paired to pick the view.
        self.patterns = [
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('junctions.bed',           'Junctions'),
            ('*.jnct',                  'Junctions'),
            ('*.unique.bigwig',         None),
            ('*.plus.bigwig',           'PlusSignal'),
            ('*.minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),
            ('*.?ufflinks-0.9.0?genes.expr',       'GeneDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.expr', 'TranscriptDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.gtf',  'GeneModel'),
            ('*.GENCODE-v3c?genes.expr',       'GeneGCV3c'),
            ('*.GENCODE-v3c?transcript*.expr', 'TranscriptGCV3c'),
            ('*.GENCODE-v3c?transcript*.gtf',  'TranscriptGencV3c'),
            ('*.GENCODE-v4?genes.expr',        None), #'GeneGCV4'),
            ('*.GENCODE-v4?transcript*.expr',  None), #'TranscriptGCV4'),
            ('*.GENCODE-v4?transcript*.gtf',   None), #'TranscriptGencV4'),
            ('*_1.75mers.fastq',              'FastqRd1'),
            ('*_2.75mers.fastq',              'FastqRd2'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('paired-end-distribution*', 'InsLength'),
            ('*.stats.txt',              'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ]

        # per-view attribute dicts merged into the result by find_attributes
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Aligns": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "GeneGCV3c": {"MapAlgorithm": ma},
            "TranscriptGCV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGCV4": {"MapAlgorithm": ma},
            "TranscriptGCV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v
            
    def find_attributes(self, pathname, lib_id):
        """Return the attribute dict for a submission file, or None.

        The first pattern in self.patterns that matches wins; the result
        merges library attributes, the view's attributes, and the matched
        pattern under the 'extension' key.  Falls off the end (implicitly
        returning None) when no pattern matches.
        """
        # NOTE(review): splitext returns (root, ext), so these names are
        # misleading -- and neither value is actually used below; matching
        # is done against the full pathname.
        path, filename = os.path.splitext(pathname)
        if not self.lib_cache.has_key(lib_id):
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warn("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)
        
        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))
            
        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                if callable(view):
                    view = view(is_paired=is_paired)
                    
                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes


    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes
        """
        if is_paired:
            return "Paired"
        else:
            return "Aligns"


    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end"""
        # libraries without lanes are treated as single-ended
        if len(lib_info["lane_set"]) == 0:
            return False

        if not self.lib_paired.has_key(lib_id):
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison 
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                    pass
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1
                    
            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            # majority vote decides; a tie is an error we can't resolve
            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")
            
        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Return insertLength/readType attributes for a paired library."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warn(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Return insertLength/readType attributes for a single-ended library."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
785
def make_submission_section(line_counter, files, attributes):
    """
    Create a section in the submission ini file

    Returns the section as a list of lines: a [lineN] header, a
    comma-joined files= line, and one key=value line per attribute.
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files))]
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
796
797
def make_base_name(pathname):
    """Return the filename with directory and final extension stripped."""
    return os.path.splitext(os.path.basename(pathname))[0]
802
803
def make_submission_name(ininame):
    """Map a submission ini filename to its .tgz archive name."""
    return make_base_name(ininame) + ".tgz"
807
808
def make_ddf_name(pathname):
    """Map a submission ini filename to its .ddf filename."""
    return make_base_name(pathname) + ".ddf"
812
813
def make_condor_name(pathname, run_type=None):
    """Build '<base>[.<run_type>].condor' from pathname."""
    parts = [make_base_name(pathname)]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
821
822
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream):
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str):
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # Python 2 idiom: types.StringTypes is (str, unicode); exact type
    # membership (not isinstance) matches the original behavior.
    own_handle = type(target) in types.StringTypes
    stream = open(target, "w") if own_handle else target
    stream.write(header)
    for block in body_list:
        stream.write(block)
    if own_handle:
        stream.close()
848
def parse_filelist(file_string):
    """Split a comma-delimited file list string into a list of names."""
    return file_string.split(',')
851
852
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list

    Raises RuntimeError naming the first missing file.
    """
    missing = [f for f in files if not os.path.exists(f)]
    if missing:
        raise RuntimeError("%s does not exist" % (missing[0],))
860
861
if __name__ == "__main__":
    # script entry point; guarded so the module can be imported for reuse
    main()