3ccfea6ef39c748516de1c179b382879f56b4db3
[htsworkflow.git] / extra / ucsc_encode_submission / ucsc_gather.py
1 #!/usr/bin/env python
from ConfigParser import SafeConfigParser
import fnmatch
import getpass
from glob import glob
import json
import logging
from optparse import OptionParser
import os
from pprint import pprint, pformat
import shlex
from StringIO import StringIO
import sys
import time
import types
import urllib
import urllib2
import urlparse

from htsworkflow.util import api
from htsworkflow.pipelines.sequences import \
    create_sequence_table, \
    scan_for_sequences
23
def main(cmdline=None):
    """Entry point: parse the command line and run the requested steps."""
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # pick a logging verbosity from the flags, most verbose wins
    if opts.debug:
        level = logging.DEBUG
    elif opts.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level)

    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey}

    if opts.host is None or opts.apiid is None or opts.apikey is None:
        parser.error("Please specify host url, apiid, apikey")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")

    # each positional argument names a library-id -> result-dir map file
    library_result_map = []
    for map_filename in args:
        library_result_map.extend(read_library_result_map(map_filename))

    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host,
                     apidata,
                     opts.sequence,
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
62
63
def make_parser():
    """Build the command line option parser.

    Defaults are seeded from the [sequence_archive] section of
    ~/.htsworkflow.ini and /etc/htsworkflow.ini when present.

    Returns an optparse.OptionParser.
    """
    # Load defaults from the config files
    config = SafeConfigParser()
    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])

    sequence_archive = None
    apiid = None
    apikey = None
    apihost = None
    SECTION = 'sequence_archive'
    if config.has_section(SECTION):
        # ConfigParser.get()'s third positional argument is `raw`, not a
        # default value, so each option must be tested with has_option()
        # first; otherwise a missing option raises NoOptionError.
        if config.has_option(SECTION, 'sequence_archive'):
            sequence_archive = config.get(SECTION, 'sequence_archive')
            sequence_archive = os.path.expanduser(sequence_archive)
        if config.has_option(SECTION, 'apiid'):
            apiid = config.get(SECTION, 'apiid')
        if config.has_option(SECTION, 'apikey'):
            apikey = config.get(SECTION, 'apikey')
        if config.has_option(SECTION, 'host'):
            apihost = config.get(SECTION, 'host')

    parser = OptionParser()

    # commands
    parser.add_option('--fastq', help="generate scripts for making fastq files",
                      default=False, action="store_true")

    parser.add_option('--ini', help="generate submission ini file", default=False,
                      action="store_true")

    parser.add_option('--makeddf', help='make the ddfs', default=False,
                      action="store_true")

    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # configuration options
    parser.add_option('--apiid', default=apiid, help="Specify API ID")
    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
    parser.add_option('--host',  default=apihost,
                      help="specify HTSWorkflow host",)
    parser.add_option('--sequence', default=sequence_archive,
                      help="sequence repository")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    return parser
112
113
def build_fastqs(host, apidata, sequences_path, library_result_map, 
                 force=False ):
    """
    Generate condor scripts to build any needed fastq files

    Writes srf.fastq.condor and/or qseq.fastq.condor into the current
    directory when there are srf or qseq sources needing conversion.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list):  [(library_id, destination directory), ...]
      force (bool): regenerate fastqs even when the target already exists
    """
    # condor submit-file headers; one "arguments=... queue" entry per
    # conversion is appended below and written via make_submit_script()
    qseq_condor_header = """
Universe=vanilla
executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/qseq2fastq
error=log/qseq2fastq.err.$(process).log
output=log/qseq2fastq.out.$(process).log
log=log/qseq2fastq.log

"""
    qseq_condor_entries = []
    srf_condor_header = """
Universe=vanilla
executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2fastq
output=log/srf_pair_fastq.out.$(process).log
error=log/srf_pair_fastq.err.$(process).log
log=log/srf_pair_fastq.log
environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"

"""
    srf_condor_entries = []
    lib_db = find_archive_sequence_files(host, 
                                         apidata, 
                                         sequences_path, 
                                         library_result_map)

    # map of target fastq pathname -> {filetype: source sequence}
    needed_targets = find_missing_targets(library_result_map, lib_db, force)

    for target_pathname, available_sources in needed_targets.items():
        logging.debug(' target : %s' % (target_pathname,))
        logging.debug(' candidate sources: %s' % (available_sources,))
        # prefer qseq sources over srf when both are available
        if available_sources.has_key('qseq'):
            source = available_sources['qseq']
            qseq_condor_entries.append(
                condor_qseq_to_fastq(source.path, 
                                     target_pathname, 
                                     source.flowcell,
                                     force=force)
            )
        elif available_sources.has_key('srf'):
            source = available_sources['srf']
            # mid_point is only set for flowcells that need a non-default
            # read split (see find_missing_targets)
            mid = getattr(source, 'mid_point', None)
            srf_condor_entries.append(
                condor_srf_to_fastq(source.path, 
                                    target_pathname,
                                    source.paired,
                                    source.flowcell,
                                    mid,
                                    force=force)
            )
        else:
            # no usable source for this target; report it for the operator
            print " need file", target_pathname

    if len(srf_condor_entries) > 0:
        make_submit_script('srf.fastq.condor', 
                           srf_condor_header,
                           srf_condor_entries)

    if len(qseq_condor_entries) > 0:
        make_submit_script('qseq.fastq.condor', 
                           qseq_condor_header,
                           qseq_condor_entries)
185
186
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Check if the sequence file exists.
    This requires computing what the sequence name is and checking
    to see if it can be found in the sequence location.

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']

    Returns a dict mapping target fastq pathname -> {filetype: sequence},
    i.e. the candidate source files that could generate each missing fastq.
    """
    fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    # find what targets we're missing
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lib = lib_db[lib_id]
        lane_dict = make_lane_dict(lib_db, lib_id)

        for lane_key, sequences in lib['lanes'].items():
            for seq in sequences:
                # annotate the sequence with the lane's paired-end status
                seq.paired = lane_dict[seq.flowcell]['paired_end']
                lane_status = lane_dict[seq.flowcell]['status']

                # paired-end sequences with no read number default to read 1
                if seq.paired and seq.read is None:
                    seq.read = 1
                filename_attributes = { 
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle
                    }
                # skip bad runs
                if lane_status == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                if seq.paired:
                    target_name = fastq_paired_template % filename_attributes
                else:
                    target_name = fastq_single_template % filename_attributes

                target_pathname = os.path.join(result_dir, target_name)
                # only record targets that are missing (or force-rebuilt);
                # several source filetypes may map to one target
                if force or not os.path.exists(target_pathname):
                    t = needed_targets.setdefault(target_pathname, {})
                    t[seq.filetype] = seq

    return needed_targets
237
238
def link_daf(daf_path, library_result_map):
    """Hard link the DAF file into every library result directory."""
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)

    for lib_id, result_dir in library_result_map:
        submission_daf = os.path.join(result_dir, base_daf)
        # leave existing links alone so re-runs are cheap
        if os.path.exists(submission_daf):
            continue
        os.link(daf_path, submission_daf)
249
250
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a submission ini file into each library result directory.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      library_result_map (list): [(library_id, result directory), ...]
      paired (bool): accepted for backward compatibility; not used here.

    Raises ValueError when a file in a result directory matches none of
    the view map patterns.
    """
    view_map = NameToViewMap(host, apidata)

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
                    'readType', 'mapAlgorithm', 'insertLength' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write other lines
        submission_files = os.listdir(result_dir)
        fastqs = {}
        # remember the attribute dict for each fastq extension; previously
        # the code indexed the *last examined file's* attributes with
        # attributes[extension], which was wrong
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))

            ext = attributes["extension"]
            if attributes['view'] is None:
                continue
            elif attributes.get("type", None) == 'fastq':
                # group fastqs so each extension becomes one submission line
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line.
        for extension, fastq_set in fastqs.items():
            inifile.extend(
                make_submission_section(line_counter, 
                                        fastq_set,
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1

        stream = open(result_ini, 'w')
        try:
            stream.write(os.linesep.join(inifile))
        finally:
            # make sure the ini file is flushed and closed
            stream.close()
301
302         
def make_lane_dict(lib_db, lib_id):
    """
    Return the lane_set of lib_db[lib_id] as a dictionary
    keyed by flowcell ID.
    """
    return dict((lane['flowcell'], lane)
                for lane in lib_db[lib_id]['lane_set'])
312
313
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Create a ddf for every library that has a submission ini file.

    When make_condor is true a submission.dagman file tying together the
    archive/upload condor jobs is also written (unless one already exists
    and force is false).
    """
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        ininame = result_dir + '.ini'
        inipathname = os.path.join(result_dir, ininame)
        # libraries without an ini file are simply skipped
        if not os.path.exists(inipathname):
            continue
        dag_fragment.extend(
            make_ddf(ininame, daf_name, True, make_condor, result_dir))

    if make_condor and len(dag_fragment) > 0:
        dag_filename = 'submission.dagman'
        if os.path.exists(dag_filename) and not force:
            logging.warn("%s exists, please delete" % (dag_filename,))
        else:
            stream = open(dag_filename, 'w')
            stream.write(os.linesep.join(dag_fragment))
            stream.write(os.linesep)
            stream.close()
333             
334
def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
    """
    Make ddf files, and bonus condor file

    Args:
      ininame (str): submission ini file to convert
      daf_name (str): daf file to include in the submission file list
      guess_ddf (bool): derive the ddf filename from ininame and write to it
        (otherwise the ddf text goes to stdout)
      make_condor (bool): also write condor archive/upload scripts
      outdir (str or None): directory to work in; restored afterwards

    Returns a list of dagman fragment lines (empty unless make_condor).
    """
    dag_fragments = []
    curdir = os.getcwd()
    if outdir is not None:
        os.chdir(outdir)
    try:
        output = sys.stdout
        ddf_name = None
        if guess_ddf:
            ddf_name = make_ddf_name(ininame)
            print(ddf_name)
            output = open(ddf_name,'w')

        try:
            file_list = read_ddf_ini(ininame, output)
        finally:
            # close the ddf we opened, but never close sys.stdout
            if output is not sys.stdout:
                output.close()

        file_list.append(daf_name)
        if ddf_name is not None:
            file_list.append(ddf_name)

        if make_condor:
            archive_condor = make_condor_archive_script(ininame, file_list)
            upload_condor = make_condor_upload_script(ininame)

            dag_fragments.extend(
                make_dag_fragment(ininame, archive_condor, upload_condor)
            )
    finally:
        # always restore the working directory, even if something failed
        os.chdir(curdir)

    return dag_fragments
367
368
def read_ddf_ini(filename, output=sys.stdout):
    """
    Read an ini file and dump out a tab delimited text file.

    Returns the list of data files referenced by the 'files' keys.
    """
    file_list = []
    config = SafeConfigParser()
    config.read(filename)

    # the [config] section's order_by holds the column order
    order_by = shlex.split(config.get("config", "order_by"))

    output.write("\t".join(order_by))
    output.write(os.linesep)
    for section in sorted(config.sections()):
        # skip the config block, it is metadata rather than a data row
        if section == "config":
            continue
        row = []
        for key in order_by:
            value = config.get(section, key)
            row.append(value)
            if key == 'files':
                file_list.extend(parse_filelist(value))

        output.write("\t".join(row))
        output.write(os.linesep)
    return file_list
397
398
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames. 

    Comment lines (starting with #) and blank lines are skipped.

    For example:
      10000 result/foo/bar

    Returns a list of (library_id, result_dir) tuples.
    """
    results = []
    stream = open(filename,'r')
    try:
        for line in stream:
            line = line.rstrip()
            if not line.startswith('#') and len(line) > 0 :
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    finally:
        # previously the stream was never closed
        stream.close()
    return results
416
417
def make_condor_archive_script(ininame, files):
    """Write a condor submit script that tars up one submission.

    Args:
      ininame (str): submission ini name; used to derive the archive name
      files (list of str): files to include in the tarball; every one
        must exist or RuntimeError is raised.

    Returns the name of the condor submit script that was written.
    """
    script = """Universe = vanilla

Executable = /bin/tar
arguments = czvf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    for f in files:
        if not os.path.exists(f):
            raise RuntimeError("Missing %s" % (f,))

    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               # getpass.getuser() works without a controlling terminal,
               # unlike os.getlogin() which raises OSError under cron/condor
               'user': getpass.getuser()}

    condor_script = make_condor_name(ininame, 'archive')
    condor_stream = open(condor_script,'w')
    condor_stream.write(script % context)
    condor_stream.close()
    return condor_script
445
446
def make_condor_upload_script(ininame):
    """Write a condor submit script that uploads one submission archive
    to the UCSC encode ftp server.

    Returns the name of the condor submit script that was written.
    """
    script = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://detrout@encodeftp.cse.ucsc.edu/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               # getpass.getuser() works without a controlling terminal,
               # unlike os.getlogin() which raises OSError under cron/condor
               'user': getpass.getuser()}

    condor_script = make_condor_name(ininame, 'upload')
    condor_stream = open(condor_script,'w')
    condor_stream.write(script % context)
    condor_stream.close()
    return condor_script
469
470
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of dagman fragments that compress and then upload
    the data for one submission.
    """
    cur_dir = os.getcwd()
    job_basename = make_base_name(ininame)
    archive_path = os.path.join(cur_dir, archive_condor)
    upload_path = os.path.join(cur_dir, upload_condor)

    return [
        'JOB %s_archive %s' % (job_basename, archive_path),
        'JOB %s_upload %s' % (job_basename, upload_path),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
486
487
def get_library_info(host, apidata, library_id):
    """Fetch the library metadata dict for library_id from the server."""
    return api.retrieve_info(api.library_url(host, library_id), apidata)
492
493
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Return a condor queue entry converting one srf file to fastq.

    For paired-end data the right read's filename is derived from
    target_pathname by replacing _r1.fastq with _r2.fastq; this assumes
    the default target naming and will generate the pair twice if both
    read targets are requested.
    """
    args = [srf_file]
    if paired:
        args += ['--left', target_pathname]
        args += ['--right',
                 target_pathname.replace('_r1.fastq', '_r2.fastq')]
    else:
        args += ['--single', target_pathname]

    if flowcell is not None:
        args += ['--flowcell', flowcell]
    if mid is not None:
        args += ['-m', str(mid)]
    if force:
        args += ['--force']

    return """
arguments="%s"
queue
""" % (" ".join(args),)
526
527
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Return a condor queue entry converting one qseq tarball to fastq.

    Note: force is accepted for interface symmetry with the srf
    converter but qseq2fastq takes no such flag.
    """
    args = ['-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args += ['-f', flowcell]

    return """
arguments="%s"
queue
""" % (" ".join(args))
538
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list): [(library_id, result directory), ...]

    Returns a dict of library_id -> library info; each library's 'lanes'
    entry maps (flowcell, lane) -> set of sequence objects found for it.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        # collect every flowcell directory any of this library's lanes
        # ran on, and remember which library each lane belongs to
        for lane in lib_info['lane_set']:
            lane_key = (lane['flowcell'], lane['lane_number'])
            candidate_lanes[lane_key] = lib_id
            seq_dirs.add(os.path.join(sequences_path, 
                                         'flowcells', 
                                         lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
    candidate_seq_list = scan_for_sequences(seq_dirs)

    # at this point we have too many sequences as scan_for_sequences
    # returns all the sequences in a flowcell directory
    # so lets filter out the extras
    
    for seq in candidate_seq_list:
        lane_key = (seq.flowcell, seq.lane)
        lib_id = candidate_lanes.get(lane_key, None)
        if lib_id is not None:
            lib_info = lib_db[lib_id]
            lib_info['lanes'].setdefault(lane_key, set()).add(seq)
    
    return lib_db
577
578
class NameToViewMap(object):
    """Determine view attributes for a given submission file name
    """
    def __init__(self, root_url, apidata):
        """
        Args:
          root_url (str): root of the htsworkflow api server
          apidata (dict): id & key to post to the server
        """
        self.root_url = root_url
        self.apidata = apidata

        self.lib_cache = {}
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # (filename pattern, view) pairs; a view of None marks files that
        # are recognized but not submitted, and a callable is invoked to
        # pick the view.  Patterns are tried in order, so more specific
        # patterns must precede the generic ones that would also match
        # (e.g. *.splices.bam before *.bam, *_r1.fastq before *.fastq).
        self.patterns = [
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('*.jnct',                  'Junctions'),
            ('*.plus.bigwig',           'PlusSignal'),
            ('*.minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),
            ('cufflinks-0.9.0-genes.expr',       'GeneDeNovo'),
            ('cufflinks-0.9.0-transcripts.expr', 'TranscriptDeNovo'),
            ('cufflinks-0.9.0-transcripts.gtf',  'GeneModel'),
            ('GENCODE-v3c-genes.expr',       'GeneGencV3c'),
            ('GENCODE-v3c-transcripts.expr', 'TranscriptGencV3c'),
            ('GENCODE-v4-genes.expr',       'GeneGencV4'),
            ('GENCODE-v4-transcripts.expr', 'TranscriptGencV4'),
            ('GENCODE-v4-transcript.expr', 'TranscriptGencV4'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('*.stats.txt',             'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ]

        # per-view attribute blocks merged into a file's attributes
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "GeneGencV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGencV4": {"MapAlgorithm": ma},
            "TranscriptGencV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v

    def find_attributes(self, pathname, lib_id):
        """Return the submission attribute dict for one file.

        The filename patterns are tried in order and the first match
        wins; returns None when no pattern recognizes the file.

        :Args:
        pathname (str): the filename whose extension we are about to examine
        lib_id (str): library id, used to look up cell line and replicate
        """
        if lib_id not in self.lib_cache:
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warning("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)

        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))

        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                if callable(view):
                    view = view(is_paired=is_paired)

                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes

    def _guess_bam_view(self, is_paired=True):
        """Guess the alignment view name based on library attributes."""
        if is_paired:
            return "Paired"
        else:
            # was "Align", which has no entry in self.views and would
            # have raised KeyError in find_attributes
            return "Single"

    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end, caching the answer."""
        if len(lib_info["lane_set"]) == 0:
            return False

        if lib_id not in self.lib_paired:
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1

            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")

        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Return ini attributes for a paired end library."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warning(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Return ini attributes for a single ended library."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
744
def make_submission_section(line_counter, files, attributes):
    """
    Create one [lineN] section of the submission ini file.
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files),)]
    section.extend("%s=%s" % item for item in attributes.items())
    return section
755
756
def make_base_name(pathname):
    """Return the filename without its directory or final extension."""
    return os.path.splitext(os.path.basename(pathname))[0]
761
762
def make_submission_name(ininame):
    """Return the archive (.tgz) name corresponding to an ini file."""
    base, _ext = os.path.splitext(os.path.basename(ininame))
    return base + ".tgz"
766
767
def make_ddf_name(pathname):
    """Return the ddf name corresponding to a submission ini file."""
    base, _ext = os.path.splitext(os.path.basename(pathname))
    return base + ".ddf"
771
772
def make_condor_name(pathname, run_type=None):
    """Return a condor submit script name, e.g. foo.archive.condor."""
    base, _ext = os.path.splitext(os.path.basename(pathname))
    parts = [base]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
780
781
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream): 
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str);
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # types.StringTypes is the Python 2 (str, unicode) tuple, so this
    # distinguishes filenames from already-open streams
    if type(target) in types.StringTypes:
        f = open(target,"w")
    else:
        f = target
    f.write(header)
    for entry in body_list:
        f.write(entry)
    # only close files we opened ourselves; caller-owned streams stay open
    if type(target) in types.StringTypes:
        f.close()
807
def parse_filelist(file_string):
    """Split a comma separated list of filenames into a list."""
    return file_string.split(",")
810
811
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    missing = [f for f in files if not os.path.exists(f)]
    if missing:
        raise RuntimeError("%s does not exist" % (missing[0],))
819
820
if __name__ == "__main__":
    # run the command line interface when executed as a script
    main()