Map junctions.bed to the Junctions view.
[htsworkflow.git] / extra / ucsc_encode_submission / ucsc_gather.py
1 #!/usr/bin/env python
2 from ConfigParser import SafeConfigParser
3 import fnmatch
4 from glob import glob
5 import json
6 import logging
7 from optparse import OptionParser
8 import os
9 from pprint import pprint, pformat
10 import shlex
11 from StringIO import StringIO
12 import time
13 import sys
14 import types
15 import urllib
16 import urllib2
17 import urlparse
18
19 from htsworkflow.util import api
20 from htsworkflow.pipelines.sequences import \
21     create_sequence_table, \
22     scan_for_sequences
23 from htsworkflow.pipelines import qseq2fastq
24 from htsworkflow.pipelines import srf2fastq
25
def main(cmdline=None):
    """Entry point: parse options, then run the requested submission steps."""
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # pick the logging verbosity requested on the command line
    if opts.debug:
        log_level = logging.DEBUG
    elif opts.verbose:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey}

    if opts.host is None or opts.apiid is None or opts.apikey is None:
        parser.error("Please specify host url, apiid, apikey")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")

    # merge all of the library/result-directory map files into one list
    library_result_map = []
    for map_file in args:
        library_result_map.extend(read_library_result_map(map_file))

    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host,
                     apidata,
                     opts.sequence,
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
64
65
def make_parser():
    """Build the OptionParser, seeding defaults from htsworkflow ini files.

    Reads ~/.htsworkflow.ini and /etc/htsworkflow.ini (silently skipping
    missing files) to pick up defaults for the api id/key, host, and
    sequence archive location.
    """
    config = SafeConfigParser()
    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])

    sequence_archive = None
    apiid = None
    apikey = None
    apihost = None
    SECTION = 'sequence_archive'
    if config.has_section(SECTION):
        # Note: ConfigParser.get()'s third positional argument is 'raw',
        # not a default value, so each option must be guarded with
        # has_option to keep a missing key from raising NoOptionError.
        if config.has_option(SECTION, 'sequence_archive'):
            sequence_archive = config.get(SECTION, 'sequence_archive')
            sequence_archive = os.path.expanduser(sequence_archive)
        if config.has_option(SECTION, 'apiid'):
            apiid = config.get(SECTION, 'apiid')
        if config.has_option(SECTION, 'apikey'):
            apikey = config.get(SECTION, 'apikey')
        if config.has_option(SECTION, 'host'):
            apihost = config.get(SECTION, 'host')

    parser = OptionParser()

    # commands
    parser.add_option('--fastq', help="generate scripts for making fastq files",
                      default=False, action="store_true")

    parser.add_option('--ini', help="generate submission ini file", default=False,
                      action="store_true")

    parser.add_option('--makeddf', help='make the ddfs', default=False,
                      action="store_true")

    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # configuration options
    parser.add_option('--apiid', default=apiid, help="Specify API ID")
    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
    parser.add_option('--host',  default=apihost,
                      help="specify HTSWorkflow host",)
    parser.add_option('--sequence', default=sequence_archive,
                      help="sequence repository")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    return parser
114
115
def build_fastqs(host, apidata, sequences_path, library_result_map, 
                 force=False ):
    """
    Generate condor scripts to build any needed fastq files
    
    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list):  [(library_id, destination directory), ...]
      force (bool): if true, queue conversions even when the target exists

    Writes 'srf.fastq.condor' and/or 'qseq.fastq.condor' submit scripts
    into the current directory when there is work to do.
    """
    # Condor submit header for qseq conversions.
    # NOTE(review): log paths assume a 'log/' subdirectory exists in the
    # submit directory — confirm it is created elsewhere.
    qseq_condor_header = """
Universe=vanilla
executable=%(exe)s
error=log/qseq2fastq.err.$(process).log
output=log/qseq2fastq.out.$(process).log
log=log/qseq2fastq.log

""" % {'exe': sys.executable }
    qseq_condor_entries = []
    # Condor submit header for srf conversions.
    # NOTE(review): the environment line hard-codes one user's
    # home-directory paths — verify before reusing on another system.
    srf_condor_header = """
Universe=vanilla
executable=%(exe)s
output=log/srf_pair_fastq.out.$(process).log
error=log/srf_pair_fastq.err.$(process).log
log=log/srf_pair_fastq.log
environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"

""" % {'exe': sys.executable }
    srf_condor_entries = []
    # library id -> library info dict (with sequences attached per lane)
    lib_db = find_archive_sequence_files(host, 
                                         apidata, 
                                         sequences_path, 
                                         library_result_map)

    # target fastq pathname -> {filetype: source sequence object}
    needed_targets = find_missing_targets(library_result_map, lib_db, force)

    for target_pathname, available_sources in needed_targets.items():
        logging.debug(' target : %s' % (target_pathname,))
        logging.debug(' candidate sources: %s' % (available_sources,))
        # prefer a qseq source when one exists; fall back to srf
        if available_sources.has_key('qseq'):
            source = available_sources['qseq']
            qseq_condor_entries.append(
                condor_qseq_to_fastq(source.path, 
                                     target_pathname, 
                                     source.flowcell,
                                     force=force)
            )
        elif available_sources.has_key('srf'):
            source = available_sources['srf']
            # mid_point is only set for special-cased flowcells
            # (see find_missing_targets); None means no read split override
            mid = getattr(source, 'mid_point', None)
            srf_condor_entries.append(
                condor_srf_to_fastq(source.path, 
                                    target_pathname,
                                    source.paired,
                                    source.flowcell,
                                    mid,
                                    force=force)
            )
        else:
            # no usable source format found; report for manual follow-up
            print " need file", target_pathname

    # only write submit scripts that actually have queue entries
    if len(srf_condor_entries) > 0:
        make_submit_script('srf.fastq.condor', 
                           srf_condor_header,
                           srf_condor_entries)

    if len(qseq_condor_entries) > 0:
        make_submit_script('qseq.fastq.condor', 
                           qseq_condor_header,
                           qseq_condor_entries)
187
188
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Check if the sequence file exists.
    This requires computing what the sequence name is and checking
    to see if it can be found in the sequence location.

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
    (and may also set seq.read and seq.mid_point — the sequence
    objects are mutated in place).

    Returns:
      dict mapping target fastq pathname -> {filetype: sequence};
      if several sequences of the same filetype map to one target,
      the last one seen wins.
    """
    fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    # find what targets we're missing
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lib = lib_db[lib_id]
        lane_dict = make_lane_dict(lib_db, lib_id)
        
        for lane_key, sequences in lib['lanes'].items():
            for seq in sequences:
                # annotate the sequence with flowcell-level information
                seq.paired = lane_dict[seq.flowcell]['paired_end']
                lane_status = lane_dict[seq.flowcell]['status']

                # paired-end sequences with no read number are read 1
                if seq.paired and seq.read is None:
                    seq.read = 1
                filename_attributes = { 
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle
                    }
                # skip bad runs
                if lane_status == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                if seq.paired:
                    target_name = fastq_paired_template % filename_attributes
                else:
                    target_name = fastq_single_template % filename_attributes

                target_pathname = os.path.join(result_dir, target_name)
                # queue the conversion when missing (or always, if forced)
                if force or not os.path.exists(target_pathname):
                    t = needed_targets.setdefault(target_pathname, {})
                    t[seq.filetype] = seq

    return needed_targets
239
240
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into every result directory that lacks it."""
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)

    for lib_id, result_dir in library_result_map:
        destination = os.path.join(result_dir, base_daf)
        if os.path.exists(destination):
            continue
        os.link(daf_path, destination)
251
252
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a <result_dir>.ini submission file into each result directory.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      library_result_map (list): [(library_id, result directory), ...]
      paired (bool): kept for interface compatibility; currently unused.

    Raises:
      ValueError: when a file in a result directory matches none of the
        NameToViewMap patterns.
    """
    view_map = NameToViewMap(host, apidata)

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
                    'readType', 'mapAlgorithm', 'insertLength' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write other lines
        submission_files = os.listdir(result_dir)
        fastqs = {}
        # remember the attribute dict for each fastq extension so the
        # grouped fastq sections below can reuse it.  (Previously this
        # indexed the last file's attribute dict by the extension
        # string, which raised KeyError.)
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))

            ext = attributes["extension"]
            if attributes['view'] is None:
                # recognized but not part of the submission
                continue
            elif attributes.get("type", None) == 'fastq':
                # defer fastqs; all files sharing an extension are
                # emitted as one comma separated files= line
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line.
        for extension, fastq_set in fastqs.items():
            inifile.extend(
                make_submission_section(line_counter, 
                                        fastq_set,
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1

        stream = open(result_ini, 'w')
        try:
            stream.write(os.linesep.join(inifile))
        finally:
            # the handle was previously leaked; always close it
            stream.close()
303
304         
def make_lane_dict(lib_db, lib_id):
    """
    Convert the lane_set in a lib_db to a dictionary
    indexed by flowcell ID
    """
    return dict((lane['flowcell'], lane)
                for lane in lib_db[lib_id]['lane_set'])
314
315
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Generate a ddf for every submission ini, plus one dagman driver file."""
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        ininame = result_dir + '.ini'
        inipathname = os.path.join(result_dir, ininame)
        if not os.path.exists(inipathname):
            continue
        dag_fragment.extend(
            make_ddf(ininame, daf_name, True, make_condor, result_dir))

    if not make_condor or len(dag_fragment) == 0:
        return

    dag_filename = 'submission.dagman'
    if not force and os.path.exists(dag_filename):
        # refuse to clobber an existing dag unless forced
        logging.warn("%s exists, please delete" % (dag_filename,))
        return

    stream = open(dag_filename, 'w')
    stream.write(os.linesep.join(dag_fragment))
    stream.write(os.linesep)
    stream.close()
335             
336
def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
    """
    Make ddf files, and bonus condor file

    Args:
      ininame (str): submission ini file to convert into a ddf
      daf_name (str): daf filename to append to the submission file list
      guess_ddf (bool): if true, derive the ddf name from ininame and
        write there instead of stdout
      make_condor (bool): also write condor archive/upload submit files
      outdir (str or None): directory to chdir into while working

    Returns:
      list of dagman fragment lines chaining the archive and upload jobs
      (empty when make_condor is false).
    """
    dag_fragments = []
    # remember where we started so we can chdir back when done
    curdir = os.getcwd()
    if outdir is not None:
        os.chdir(outdir)
    output = sys.stdout
    ddf_name = None
    if guess_ddf:
        ddf_name = make_ddf_name(ininame)
        print ddf_name
        output = open(ddf_name,'w')

    # dump the ini as a tab-delimited ddf, collecting the data file names
    file_list = read_ddf_ini(ininame, output)

    # the daf (and the ddf itself, when written) ship with the archive
    file_list.append(daf_name)
    if ddf_name is not None:
        file_list.append(ddf_name)

    if make_condor:
        archive_condor = make_condor_archive_script(ininame, file_list)
        upload_condor = make_condor_upload_script(ininame)
        
        dag_fragments.extend( 
            make_dag_fragment(ininame, archive_condor, upload_condor)
        ) 
        
    os.chdir(curdir)
    
    return dag_fragments
369
370
def read_ddf_ini(filename, output=sys.stdout):
    """
    Read a ini file and dump out a tab delimited text file

    Returns the list of data files named in the sections' files= keys.
    """
    file_list = []
    config = SafeConfigParser()
    config.read(filename)

    order_by = shlex.split(config.get("config", "order_by"))

    # header row comes from the configured column ordering
    output.write("\t".join(order_by))
    output.write(os.linesep)
    for section in sorted(config.sections()):
        if section == "config":
            # the config block is metadata, not a data row
            continue
        row = []
        for key in order_by:
            value = config.get(section, key)
            row.append(value)
            if key == 'files':
                file_list.extend(parse_filelist(value))

        output.write("\t".join(row))
        output.write(os.linesep)
    return file_list
399
400
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames.

    For example:
      10000 result/foo/bar

    Blank lines and lines starting with '#' are skipped.

    Returns:
      list of (library_id, result_dir) tuples.
    """
    results = []
    stream = open(filename, 'r')
    try:
        for line in stream:
            line = line.rstrip()
            # skip comments and blank lines
            if not line.startswith('#') and len(line) > 0:
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    finally:
        # the stream was previously leaked; always close it
        stream.close()
    return results
418
419
def make_condor_archive_script(ininame, files):
    """Write a condor submit file that tars the submission files.

    Raises RuntimeError if any listed file is missing.
    Returns the name of the submit file written.
    """
    template = """Universe = vanilla

Executable = /bin/tar
arguments = czvf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    # refuse to write a submit file referencing files that do not exist
    for pathname in files:
        if not os.path.exists(pathname):
            raise RuntimeError("Missing %s" % (pathname,))

    # NOTE(review): os.getlogin() fails without a controlling terminal;
    # getpass.getuser() would be more robust — confirm before changing.
    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               'user': os.getlogin()}

    condor_script = make_condor_name(ininame, 'archive')
    stream = open(condor_script, 'w')
    stream.write(template % context)
    stream.close()
    return condor_script
447
448
def make_condor_upload_script(ininame):
    """Write a condor submit file that ftps the archive to UCSC.

    Returns the name of the submit file written.
    """
    template = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://detrout@encodeftp.cse.ucsc.edu/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    # NOTE(review): os.getlogin() fails without a controlling terminal;
    # getpass.getuser() would be more robust — confirm before changing.
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               'user': os.getlogin()}

    condor_script = make_condor_name(ininame, 'upload')
    stream = open(condor_script, 'w')
    stream.write(template % context)
    stream.close()
    return condor_script
471
472
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of fragments compress and then upload the data.
    """
    cwd = os.getcwd()
    job_basename = make_base_name(ininame)
    archive_path = os.path.join(cwd, archive_condor)
    upload_path = os.path.join(cwd, upload_condor)

    return [
        'JOB %s_archive %s' % (job_basename, archive_path),
        'JOB %s_upload %s' % (job_basename, upload_path),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
488
489
def get_library_info(host, apidata, library_id):
    """Fetch the library info dictionary from the htsworkflow server."""
    return api.retrieve_info(api.library_url(host, library_id), apidata)
494
495
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Return a condor queue fragment running srf2fastq for one srf file."""
    args = [srf2fastq.__file__, srf_file]
    if paired:
        args += ['--left', target_pathname]
        # this is ugly. I did it because I was pregenerating the target
        # names before I tried to figure out what sources could generate
        # those targets, and everything up to this point had been
        # one-to-one. So I couldn't figure out how to pair the 
        # target names. 
        # With this at least the command will run correctly.
        # however if we rename the default targets, this'll break
        # also I think it'll generate it twice.
        args += ['--right', target_pathname.replace('_r1.fastq', '_r2.fastq')]
    else:
        args += ['--single', target_pathname]

    if flowcell is not None:
        args += ['--flowcell', flowcell]
    if mid is not None:
        args += ['-m', str(mid)]
    if force:
        args += ['--force']

    return """
arguments="%s"
queue
""" % (" ".join(args),)
529
530
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Return a condor queue fragment running qseq2fastq for one qseq file."""
    args = [qseq2fastq.__file__, '-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args += ['-f', flowcell]

    return """
arguments="%s"
queue
""" % (" ".join(args))
542
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list): [(library_id, result directory), ...]

    Returns:
      dict of library_id -> library info dict; each entry gains a
      'lanes' key mapping (flowcell, lane_number) -> set of sequences.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
    # (flowcell, lane_number) -> library id that wants that lane
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        # collect the flowcell directories that need scanning
        for lane in lib_info['lane_set']:
            lane_key = (lane['flowcell'], lane['lane_number'])
            candidate_lanes[lane_key] = lib_id
            seq_dirs.add(os.path.join(sequences_path, 
                                         'flowcells', 
                                         lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
    candidate_seq_list = scan_for_sequences(seq_dirs)

    # at this point we have too many sequences as scan_for_sequences
    # returns all the sequences in a flowcell directory
    # so lets filter out the extras
    
    for seq in candidate_seq_list:
        lane_key = (seq.flowcell, seq.lane)
        lib_id = candidate_lanes.get(lane_key, None)
        if lib_id is not None:
            lib_info = lib_db[lib_id]
            lib_info['lanes'].setdefault(lane_key, set()).add(seq)
    
    return lib_db
581
582
class NameToViewMap(object):
    """Determine view attributes for a given submission file name
    """
    def __init__(self, root_url, apidata):
        """
        Args:
          root_url (str): root of the htsworkflow api server
          apidata (dict): id & key to post to the server
        """
        self.root_url = root_url
        self.apidata = apidata

        # cache of library info dictionaries, keyed by library id
        self.lib_cache = {}
        # cache of paired-end decisions, keyed by library id
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # (glob pattern, view name) pairs.  A view of None means the file
        # is recognized but not submitted.  The value may also be a
        # callable that picks the view from library attributes.
        # Patterns are tried in order, so more specific patterns must
        # come before general ones that would also match — in particular
        # '*.splices.bam' must precede '*.bam' or the Splices view can
        # never be selected.
        self.patterns = [
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('junctions.bed',           'Junctions'),
            ('*.jnct',                  'Junctions'),
            ('*.plus.bigwig',           'PlusSignal'),
            ('*.minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),
            ('cufflinks-0.9.0-genes.expr',       'GeneDeNovo'),
            ('cufflinks-0.9.0-transcripts.expr', 'TranscriptDeNovo'),
            ('cufflinks-0.9.0-transcripts.gtf',  'GeneModel'),
            ('GENCODE-v3c-genes.expr',       'GeneGencV3c'),
            ('GENCODE-v3c-transcripts.expr', 'TranscriptGencV3c'),
            ('GENCODE-v4-genes.expr',       'GeneGencV4'),
            ('GENCODE-v4-transcripts.expr', 'TranscriptGencV4'),
            ('GENCODE-v4-transcript.expr', 'TranscriptGencV4'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('*.stats.txt',             'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ]

        # attributes contributed by each view
        # (a duplicate "GeneModel" entry was removed)
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "GeneGencV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGencV4": {"MapAlgorithm": ma},
            "TranscriptGencV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v

    def find_attributes(self, pathname, lib_id):
        """Return the submission attribute dict for pathname, or None.

        The first entry in self.patterns that matches wins.

        :Args:
        pathname (str): the filename whose extension we are about to examine
        lib_id (str): library id used to look up cell line / replicate

        Returns None when no pattern recognizes the file.
        """
        if lib_id not in self.lib_cache:
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warn("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)

        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))

        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                if callable(view):
                    view = view(is_paired=is_paired)

                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes

        # fall through: no pattern matched
        return None

    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes
        """
        if is_paired:
            return "Paired"
        else:
            # previously returned "Align", but self.views has no "Align"
            # entry, so find_attributes raised KeyError for single ended
            # bam files; "Single" is the defined view.
            return "Single"

    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end"""
        # a library without lanes is treated as single ended
        if len(lib_info["lane_set"]) == 0:
            return False

        if lib_id not in self.lib_paired:
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison 
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1

            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            # majority vote; a tie is ambiguous and fatal
            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")

        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Return insertLength/readType attributes for a paired library."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warn(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Return insertLength/readType attributes for a single ended library."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
749
def make_submission_section(line_counter, files, attributes):
    """
    Create a section in the submission ini file
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files))]
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
760
761
def make_base_name(pathname):
    """Return the filename with directory and final extension removed."""
    return os.path.splitext(os.path.basename(pathname))[0]
766
767
def make_submission_name(ininame):
    """Return the tar archive name for a submission ini file."""
    stem, _ext = os.path.splitext(os.path.basename(ininame))
    return stem + ".tgz"
771
772
def make_ddf_name(pathname):
    """Return the ddf name corresponding to a submission ini file."""
    stem, _ext = os.path.splitext(os.path.basename(pathname))
    return stem + ".ddf"
776
777
def make_condor_name(pathname, run_type=None):
    """Return a condor submit file name: <base>[.<run_type>].condor"""
    stem = os.path.splitext(os.path.basename(pathname))[0]
    parts = [stem]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
785
786
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream): 
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str):
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # only open/close the file when we were handed a name, not a stream
    opened_here = type(target) in types.StringTypes
    if opened_here:
        f = open(target, "w")
    else:
        f = target

    f.write(header)
    for entry in body_list:
        f.write(entry)

    if opened_here:
        f.close()
812
def parse_filelist(file_string):
    """Split a comma separated file list into individual names."""
    return file_string.split(",")
815
816
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    for pathname in files:
        if os.path.exists(pathname):
            continue
        raise RuntimeError("%s does not exist" % (pathname,))
824
825
# allow running this module as a command line script
if __name__ == "__main__":
    main()