Cache the attributes for each type of fastq file.
[htsworkflow.git] / extra / ucsc_encode_submission / ucsc_gather.py
1 #!/usr/bin/env python
2 from ConfigParser import SafeConfigParser
3 import fnmatch
4 from glob import glob
5 import json
6 import logging
7 from optparse import OptionParser
8 import os
9 from pprint import pprint, pformat
10 import shlex
11 from StringIO import StringIO
12 import time
13 import sys
14 import types
15 import urllib
16 import urllib2
17 import urlparse
18
19 from htsworkflow.util import api
20 from htsworkflow.pipelines.sequences import \
21     create_sequence_table, \
22     scan_for_sequences
23 from htsworkflow.pipelines import qseq2fastq
24 from htsworkflow.pipelines import srf2fastq
25
def main(cmdline=None):
    """Command line entry point: parse options, then run the requested steps."""
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # choose the logging verbosity from the command line flags
    if opts.debug:
        level = logging.DEBUG
    elif opts.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level)

    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey}

    if opts.host is None or opts.apiid is None or opts.apikey is None:
        parser.error("Please specify host url, apiid, apikey")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")

    # merge every input file into a single (library id, result dir) list
    library_result_map = []
    for mapfile in args:
        library_result_map.extend(read_library_result_map(mapfile))

    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host,
                     apidata,
                     opts.sequence,
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
64
65
def make_parser():
    """Build the option parser, seeding defaults from the htsworkflow ini files.

    Reads ~/.htsworkflow.ini and /etc/htsworkflow.ini for the sequence
    archive location and API credentials, then returns an
    optparse.OptionParser with command, configuration and debug options.
    """
    # Load defaults from the config files
    config = SafeConfigParser()
    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])

    sequence_archive = None
    apiid = None
    apikey = None
    apihost = None
    SECTION = 'sequence_archive'
    if config.has_section(SECTION):
        # ConfigParser.get() has no positional "default" argument; the old
        # code passed the default where the raw flag goes, so a missing
        # option raised NoOptionError. Guard each lookup instead.
        if config.has_option(SECTION, 'sequence_archive'):
            sequence_archive = config.get(SECTION, 'sequence_archive')
            sequence_archive = os.path.expanduser(sequence_archive)
        if config.has_option(SECTION, 'apiid'):
            apiid = config.get(SECTION, 'apiid')
        if config.has_option(SECTION, 'apikey'):
            apikey = config.get(SECTION, 'apikey')
        if config.has_option(SECTION, 'host'):
            apihost = config.get(SECTION, 'host')

    parser = OptionParser()

    # commands
    parser.add_option('--fastq', help="generate scripts for making fastq files",
                      default=False, action="store_true")

    parser.add_option('--ini', help="generate submission ini file", default=False,
                      action="store_true")

    parser.add_option('--makeddf', help='make the ddfs', default=False,
                      action="store_true")

    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # configuration options
    parser.add_option('--apiid', default=apiid, help="Specify API ID")
    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
    parser.add_option('--host',  default=apihost,
                      help="specify HTSWorkflow host",)
    parser.add_option('--sequence', default=sequence_archive,
                      help="sequence repository")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    return parser
114
115
def build_fastqs(host, apidata, sequences_path, library_result_map, 
                 force=False ):
    """
    Generate condor scripts to build any needed fastq files
    
    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list):  [(library_id, destination directory), ...]
      force (bool): regenerate targets even when they already exist

    Side effects: writes srf.fastq.condor and/or qseq.fastq.condor submit
    scripts into the current directory when there is work to queue.
    """
    qseq_condor_header = """
Universe=vanilla
executable=%(exe)s
error=log/qseq2fastq.err.$(process).log
output=log/qseq2fastq.out.$(process).log
log=log/qseq2fastq.log

""" % {'exe': sys.executable }
    qseq_condor_entries = []
    srf_condor_header = """
Universe=vanilla
executable=%(exe)s
output=log/srf_pair_fastq.out.$(process).log
error=log/srf_pair_fastq.err.$(process).log
log=log/srf_pair_fastq.log
environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"

""" % {'exe': sys.executable }
    srf_condor_entries = []
    # every archive sequence file for these libraries, keyed by library id
    lib_db = find_archive_sequence_files(host, 
                                         apidata, 
                                         sequences_path, 
                                         library_result_map)

    needed_targets = find_missing_targets(library_result_map, lib_db, force)

    for target_pathname, available_sources in needed_targets.items():
        logging.debug(' target : %s' % (target_pathname,))
        logging.debug(' candidate sources: %s' % (available_sources,))
        # qseq sources are checked first, so they win over srf sources
        if available_sources.has_key('qseq'):
            source = available_sources['qseq']
            qseq_condor_entries.append(
                condor_qseq_to_fastq(source.path, 
                                     target_pathname, 
                                     source.flowcell,
                                     force=force)
            )
        elif available_sources.has_key('srf'):
            source = available_sources['srf']
            # mid_point is only set for the truncated 30DY0AAXX flowcell
            mid = getattr(source, 'mid_point', None)
            srf_condor_entries.append(
                condor_srf_to_fastq(source.path, 
                                    target_pathname,
                                    source.paired,
                                    source.flowcell,
                                    mid,
                                    force=force)
            )
        else:
            print " need file", target_pathname

    # only write a submit script when there is something to queue
    if len(srf_condor_entries) > 0:
        make_submit_script('srf.fastq.condor', 
                           srf_condor_header,
                           srf_condor_entries)

    if len(qseq_condor_entries) > 0:
        make_submit_script('qseq.fastq.condor', 
                           qseq_condor_header,
                           qseq_condor_entries)
188
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Work out which fastq target files still need to be generated.

    This computes each expected fastq name and checks whether it already
    exists in the result directory.

    Returns {target_pathname: {filetype: sequence, ...}, ...} for every
    missing target (or all targets when force is set).

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
    """
    paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lane_dict = make_lane_dict(lib_db, lib_id)

        for sequences in lib_db[lib_id]['lanes'].values():
            for seq in sequences:
                lane_info = lane_dict[seq.flowcell]
                seq.paired = lane_info['paired_end']
                if seq.paired and seq.read is None:
                    seq.read = 1

                # skip bad runs
                if lane_info['status'] == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                template = paired_template if seq.paired else single_template
                target_name = template % {
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle,
                    }

                target_pathname = os.path.join(result_dir, target_name)
                if force or not os.path.exists(target_pathname):
                    per_type = needed_targets.setdefault(target_pathname, {})
                    per_type[seq.filetype] = seq

    return needed_targets
239
240
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into every submission result directory.

    Raises RuntimeError if daf_path does not exist; directories that
    already contain the daf are left alone.
    """
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)

    for _lib_id, result_dir in library_result_map:
        destination = os.path.join(result_dir, base_daf)
        if os.path.exists(destination):
            continue
        os.link(daf_path, destination)
251
252
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a <result_dir>.ini submission description into each result dir.

    host/apidata are handed to NameToViewMap so per-library attributes can
    be looked up.  Files whose view maps to None are skipped; fastq files
    sharing an extension are grouped onto a single submission line.

    Raises ValueError when a file matches no known pattern.
    """
    view_map = NameToViewMap(host, apidata)

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell',
                    'readType', 'mapAlgorithm', 'insertLength' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write other lines
        submission_files = os.listdir(result_dir)
        fastqs = {}
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))

            ext = attributes["extension"]
            if attributes['view'] is None:
                continue
            elif attributes.get("type", None) == 'fastq':
                # collect fastqs so each extension becomes one line later
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line.
        for extension, fastq_files in fastqs.items():
            inifile.extend(
                make_submission_section(line_counter,
                                        fastq_files,
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1

        # the original leaked this handle; close so the ini is flushed
        stream = open(result_ini, 'w')
        stream.write(os.linesep.join(inifile))
        stream.close()
305
306         
def make_lane_dict(lib_db, lib_id):
    """
    Convert the lane_set in a lib_db to a dictionary
    indexed by flowcell ID
    """
    return dict((lane['flowcell'], lane)
                for lane in lib_db[lib_id]['lane_set'])
316
317
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Make a ddf for each submission directory that has an ini file.

    When make_condor is set, also collects the dag fragments from each
    ddf and writes them to submission.dagman (refusing to overwrite an
    existing file unless force is set).
    """
    dag_fragment = []
    for _lib_id, result_dir in library_result_map:
        ininame = result_dir + '.ini'
        inipathname = os.path.join(result_dir, ininame)
        if not os.path.exists(inipathname):
            continue
        dag_fragment.extend(
            make_ddf(ininame, daf_name, True, make_condor, result_dir)
        )

    if not make_condor or len(dag_fragment) == 0:
        return

    dag_filename = 'submission.dagman'
    if not force and os.path.exists(dag_filename):
        logging.warn("%s exists, please delete" % (dag_filename,))
        return

    stream = open(dag_filename, 'w')
    stream.write(os.linesep.join(dag_fragment))
    stream.write(os.linesep)
    stream.close()
337             
338
339 def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
340     """
341     Make ddf files, and bonus condor file
342     """
343     dag_fragments = []
344     curdir = os.getcwd()
345     if outdir is not None:
346         os.chdir(outdir)
347     output = sys.stdout
348     ddf_name = None
349     if guess_ddf:
350         ddf_name = make_ddf_name(ininame)
351         print ddf_name
352         output = open(ddf_name,'w')
353
354     file_list = read_ddf_ini(ininame, output)
355
356     file_list.append(daf_name)
357     if ddf_name is not None:
358         file_list.append(ddf_name)
359
360     if make_condor:
361         archive_condor = make_condor_archive_script(ininame, file_list)
362         upload_condor = make_condor_upload_script(ininame)
363         
364         dag_fragments.extend( 
365             make_dag_fragment(ininame, archive_condor, upload_condor)
366         ) 
367         
368     os.chdir(curdir)
369     
370     return dag_fragments
371
372
def read_ddf_ini(filename, output=sys.stdout):
    """
    Read an ini file and dump it to output as tab delimited text.

    The [config] section's order_by key lists the columns; every other
    section becomes one data row.  Returns the list of files named in
    the sections' "files" values.
    """
    config = SafeConfigParser()
    config.read(filename)

    order_by = shlex.split(config.get("config", "order_by"))

    output.write("\t".join(order_by))
    output.write(os.linesep)

    file_list = []
    for section in sorted(config.sections()):
        # the config block only holds the column order, not a data row
        if section == "config":
            continue
        row = []
        for key in order_by:
            value = config.get(section, key)
            row.append(value)
            if key == 'files':
                file_list.extend(parse_filelist(value))

        output.write("\t".join(row))
        output.write(os.linesep)
    return file_list
401
402
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames. 
    
    For example:
      10000 result/foo/bar

    Blank lines and lines starting with '#' are ignored.
    Returns a list of (library_id, result_dir) string tuples.
    """
    results = []
    stream = open(filename, 'r')
    try:
        for line in stream:
            line = line.rstrip()
            # skip comments and blank lines
            if not line.startswith('#') and len(line) > 0:
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    finally:
        # the original leaked the file handle; always close it
        stream.close()
    return results
420
421
def make_condor_archive_script(ininame, files):
    """Write a condor submit file that tars up the submission's files.

    Raises RuntimeError if any listed file is missing.
    Returns the name of the condor script that was written.
    """
    template = """Universe = vanilla

Executable = /bin/tar
arguments = czvf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    # refuse to write a script referencing files that are not there
    missing = [f for f in files if not os.path.exists(f)]
    if missing:
        raise RuntimeError("Missing %s" % (missing[0],))

    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               'user': os.getlogin()}

    condor_script = make_condor_name(ininame, 'archive')
    stream = open(condor_script, 'w')
    stream.write(template % context)
    stream.close()
    return condor_script
449
450
def make_condor_upload_script(ininame):
    """Write a condor submit file that lftp-uploads the submission archive.

    Returns the name of the condor script that was written.
    """
    template = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://detrout@encodeftp.cse.ucsc.edu/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               'user': os.getlogin()}

    condor_script = make_condor_name(ininame, 'upload')
    stream = open(condor_script, 'w')
    stream.write(template % context)
    stream.close()
    return condor_script
473
474
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of fragments compress and then upload the data.
    """
    cur_dir = os.getcwd()
    job_basename = make_base_name(ininame)

    # archive job, upload job, then an ordering edge between them
    return [
        'JOB %s_archive %s' % (job_basename,
                               os.path.join(cur_dir, archive_condor)),
        'JOB %s_upload %s' % (job_basename,
                              os.path.join(cur_dir, upload_condor)),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
490
491
def get_library_info(host, apidata, library_id):
    """Fetch a library's metadata dictionary from the htsworkflow server.

    host is the server root url, apidata carries the id/key credentials,
    library_id selects the library.  Returns whatever api.retrieve_info
    produces for the library url.
    """
    url = api.library_url(host, library_id)
    contents = api.retrieve_info(url, apidata)
    return contents
496
497
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Return a condor queue entry that converts an srf file to fastq."""
    args = [srf2fastq.__file__, srf_file]
    if paired:
        args += ['--left', target_pathname]
        # this is ugly. I did it because I was pregenerating the target
        # names before I tried to figure out what sources could generate
        # those targets, and everything up to this point had been
        # one-to-one. So I couldn't figure out how to pair the 
        # target names. 
        # With this at least the command will run correctly.
        # however if we rename the default targets, this'll break
        # also I think it'll generate it twice.
        args += ['--right', target_pathname.replace('_r1.fastq', '_r2.fastq')]
    else:
        args += ['--single', target_pathname]

    if flowcell is not None:
        args += ['--flowcell', flowcell]
    if mid is not None:
        args += ['-m', str(mid)]
    if force:
        args += ['--force']

    return """
arguments="%s"
queue
""" % (" ".join(args),)
531
532
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Return a condor queue entry that converts a qseq file to fastq.

    Note: force is accepted for signature parity with the srf converter
    but qseq2fastq takes no such flag, so it is unused.
    """
    args = [qseq2fastq.__file__, '-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args += ['-f', flowcell]

    return """
arguments="%s"
queue
""" % (" ".join(args))
544
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list): [(library_id, result directory), ...]

    Returns a dict of library_id -> library info dict, where each info
    dict gains a 'lanes' key mapping (flowcell, lane) to the set of
    sequence objects found for that lane.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        # record which flowcell directories could hold this library's data
        for lane in lib_info['lane_set']:
            lane_key = (lane['flowcell'], lane['lane_number'])
            candidate_lanes[lane_key] = lib_id
            seq_dirs.add(os.path.join(sequences_path, 
                                         'flowcells', 
                                         lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
    candidate_seq_list = scan_for_sequences(seq_dirs)

    # at this point we have too many sequences as scan_for_sequences
    # returns all the sequences in a flowcell directory
    # so lets filter out the extras
    
    for seq in candidate_seq_list:
        lane_key = (seq.flowcell, seq.lane)
        lib_id = candidate_lanes.get(lane_key, None)
        if lib_id is not None:
            lib_info = lib_db[lib_id]
            lib_info['lanes'].setdefault(lane_key, set()).add(seq)
    
    return lib_db
583
584
class NameToViewMap(object):
    """Determine view attributes for a given submission file name"""
    def __init__(self, root_url, apidata):
        """root_url: htsworkflow server root; apidata: id/key credentials."""
        self.root_url = root_url
        self.apidata = apidata

        # caches: library metadata and paired-end decisions per library id
        self.lib_cache = {}
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # Patterns are tried in order and the first fnmatch hit wins, so
        # more specific patterns must come before more general ones.
        # (Previously '*.splices.bam' was listed after '*.bam', so it
        # could never match.)
        self.patterns = [
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('junctions.bed',           'Junctions'),
            ('*.jnct',                  'Junctions'),
            ('*.plus.bigwig',           'PlusSignal'),
            ('*.minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),
            ('cufflinks-0.9.0-genes.expr',       'GeneDeNovo'),
            ('cufflinks-0.9.0-transcripts.expr', 'TranscriptDeNovo'),
            ('cufflinks-0.9.0-transcripts.gtf',  'GeneModel'),
            ('GENCODE-v3c-genes.expr',       'GeneGencV3c'),
            ('GENCODE-v3c-transcripts.expr', 'TranscriptGencV3c'),
            ('GENCODE-v4-genes.expr',       'GeneGencV4'),
            ('GENCODE-v4-transcripts.expr', 'TranscriptGencV4'),
            ('GENCODE-v4-transcript.expr', 'TranscriptGencV4'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('*.stats.txt',             'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ]

        # (the original listed "GeneModel" twice with identical values;
        # the duplicate has been removed)
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "GeneGencV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGencV4": {"MapAlgorithm": ma},
            "TranscriptGencV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v

    def find_attributes(self, pathname, lib_id):
        """Return the attribute dictionary for a submission file.

        The patterns list is scanned in order; the first match supplies
        the view and extension.  Returns None when nothing matches so
        the caller can report an unrecognized file.
        """
        if lib_id not in self.lib_cache:
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warn("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)

        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))

        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                if callable(view):
                    view = view(is_paired=is_paired)

                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes

        # no pattern matched
        return None

    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes.

        Note: the original returned "Align" for single-ended libraries,
        but self.views has no "Align" entry, so unpaired bam files raised
        KeyError; "Single" is the matching view.
        """
        if is_paired:
            return "Paired"
        else:
            return "Single"

    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end, caching the answer.

        Failed flowcells are ignored.  Raises RuntimeError when the
        remaining lanes are evenly split between paired and single.
        """
        if len(lib_info["lane_set"]) == 0:
            return False

        if lib_id not in self.lib_paired:
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison 
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1

            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")

        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Attributes for paired end libraries; insert_size defaults to 200."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warn(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Attributes for single ended libraries."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
751
def make_submission_section(line_counter, files, attributes):
    """
    Create a section in the submission ini file
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files))]
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
762
763
def make_base_name(pathname):
    """Return the filename with directory and extension stripped."""
    return os.path.splitext(os.path.basename(pathname))[0]
768
769
def make_submission_name(ininame):
    """Return the tar archive name derived from a submission ini name."""
    return make_base_name(ininame) + ".tgz"
773
774
def make_ddf_name(pathname):
    """Return the ddf filename derived from pathname."""
    return make_base_name(pathname) + ".ddf"
778
779
def make_condor_name(pathname, run_type=None):
    """Return the condor submit script name for pathname.

    e.g. foo.ini with run_type 'archive' becomes foo.archive.condor.
    """
    parts = [make_base_name(pathname)]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
787
788
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream): 
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str);
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # we only own the file handle when given a filename
    own_handle = type(target) in types.StringTypes
    stream = open(target, "w") if own_handle else target

    stream.write(header)
    for block in body_list:
        stream.write(block)

    if own_handle:
        stream.close()
814
def parse_filelist(file_string):
    """Split a comma separated file list into a list of names."""
    return file_string.split(",")
817
818
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    missing = [f for f in files if not os.path.exists(f)]
    if missing:
        # report the first missing file, matching the original behavior
        raise RuntimeError("%s does not exist" % (missing[0],))
826
827
# Run the command line interface when executed as a script.
if __name__ == "__main__":
    main()