Map junctions.bed to the Junctions view.
[htsworkflow.git] / extra / ucsc_encode_submission / ucsc_gather.py
1 #!/usr/bin/env python
2 from ConfigParser import SafeConfigParser
3 import fnmatch
4 from glob import glob
5 import json
6 import logging
7 from optparse import OptionParser
8 import os
9 from pprint import pprint, pformat
10 import shlex
11 from StringIO import StringIO
12 import time
13 import sys
14 import types
15 import urllib
16 import urllib2
17 import urlparse
18
19 from htsworkflow.util import api
20 from htsworkflow.pipelines.sequences import \
21     create_sequence_table, \
22     scan_for_sequences
23 from htsworkflow.pipelines import qseq2fastq
24 from htsworkflow.pipelines import srf2fastq
25
def main(cmdline=None):
    """Entry point: parse options, then run the requested submission steps."""
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # pick the logging verbosity requested on the command line
    if opts.debug:
        log_level = logging.DEBUG
    elif opts.verbose:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey}

    if opts.host is None or opts.apiid is None or opts.apikey is None:
        parser.error("Please specify host url, apiid, apikey")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")

    # merge all of the library/result-directory map files into one list
    library_result_map = []
    for map_file in args:
        library_result_map.extend(read_library_result_map(map_file))

    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host,
                     apidata,
                     opts.sequence,
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
64
65
def make_parser():
    """Build the OptionParser, seeding defaults from htsworkflow ini files.

    Reads ~/.htsworkflow.ini and /etc/htsworkflow.ini (silently skipping
    missing files) to pick up defaults for the api id/key, host, and
    sequence archive location.
    """
    config = SafeConfigParser()
    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])

    sequence_archive = None
    apiid = None
    apikey = None
    apihost = None
    SECTION = 'sequence_archive'
    if config.has_section(SECTION):
        # Note: ConfigParser.get()'s third positional argument is 'raw',
        # not a default value, so each option must be guarded with
        # has_option to keep a missing key from raising NoOptionError.
        if config.has_option(SECTION, 'sequence_archive'):
            sequence_archive = config.get(SECTION, 'sequence_archive')
            sequence_archive = os.path.expanduser(sequence_archive)
        if config.has_option(SECTION, 'apiid'):
            apiid = config.get(SECTION, 'apiid')
        if config.has_option(SECTION, 'apikey'):
            apikey = config.get(SECTION, 'apikey')
        if config.has_option(SECTION, 'host'):
            apihost = config.get(SECTION, 'host')

    parser = OptionParser()

    # commands
    parser.add_option('--fastq', help="generate scripts for making fastq files",
                      default=False, action="store_true")

    parser.add_option('--ini', help="generate submission ini file", default=False,
                      action="store_true")

    parser.add_option('--makeddf', help='make the ddfs', default=False,
                      action="store_true")

    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # configuration options
    parser.add_option('--apiid', default=apiid, help="Specify API ID")
    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
    parser.add_option('--host',  default=apihost,
                      help="specify HTSWorkflow host",)
    parser.add_option('--sequence', default=sequence_archive,
                      help="sequence repository")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    return parser
114
115
def build_fastqs(host, apidata, sequences_path, library_result_map, 
                 force=False ):
    """
    Generate condor scripts to build any needed fastq files
    
    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list):  [(library_id, destination directory), ...]
      force (bool): if true, queue conversions even when the target exists

    Writes 'srf.fastq.condor' and/or 'qseq.fastq.condor' submit scripts
    into the current directory when there is work to do.
    """
    # Condor submit header for qseq conversions.
    # NOTE(review): log paths assume a 'log/' subdirectory exists in the
    # submit directory — confirm it is created elsewhere.
    qseq_condor_header = """
Universe=vanilla
executable=%(exe)s
error=log/qseq2fastq.err.$(process).log
output=log/qseq2fastq.out.$(process).log
log=log/qseq2fastq.log

""" % {'exe': sys.executable }
    qseq_condor_entries = []
    # Condor submit header for srf conversions.
    # NOTE(review): the environment line hard-codes one user's
    # home-directory paths — verify before reusing on another system.
    srf_condor_header = """
Universe=vanilla
executable=%(exe)s
output=log/srf_pair_fastq.out.$(process).log
error=log/srf_pair_fastq.err.$(process).log
log=log/srf_pair_fastq.log
environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"

""" % {'exe': sys.executable }
    srf_condor_entries = []
    # library id -> library info dict (with sequences attached per lane)
    lib_db = find_archive_sequence_files(host, 
                                         apidata, 
                                         sequences_path, 
                                         library_result_map)

    # target fastq pathname -> {filetype: source sequence object}
    needed_targets = find_missing_targets(library_result_map, lib_db, force)

    for target_pathname, available_sources in needed_targets.items():
        logging.debug(' target : %s' % (target_pathname,))
        logging.debug(' candidate sources: %s' % (available_sources,))
        # prefer a qseq source when one exists; fall back to srf
        if available_sources.has_key('qseq'):
            source = available_sources['qseq']
            qseq_condor_entries.append(
                condor_qseq_to_fastq(source.path, 
                                     target_pathname, 
                                     source.flowcell,
                                     force=force)
            )
        elif available_sources.has_key('srf'):
            source = available_sources['srf']
            # mid_point is only set for special-cased flowcells
            # (see find_missing_targets); None means no read split override
            mid = getattr(source, 'mid_point', None)
            srf_condor_entries.append(
                condor_srf_to_fastq(source.path, 
                                    target_pathname,
                                    source.paired,
                                    source.flowcell,
                                    mid,
                                    force=force)
            )
        else:
            # no usable source format found; report for manual follow-up
            print " need file", target_pathname

    # only write submit scripts that actually have queue entries
    if len(srf_condor_entries) > 0:
        make_submit_script('srf.fastq.condor', 
                           srf_condor_header,
                           srf_condor_entries)

    if len(qseq_condor_entries) > 0:
        make_submit_script('qseq.fastq.condor', 
                           qseq_condor_header,
                           qseq_condor_entries)
187
188
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Check if the sequence file exists.
    This requires computing what the sequence name is and checking
    to see if it can be found in the sequence location.

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
    (and may also set seq.read and seq.mid_point — the sequence
    objects are mutated in place).

    Returns:
      dict mapping target fastq pathname -> {filetype: sequence};
      if several sequences of the same filetype map to one target,
      the last one seen wins.
    """
    fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    # find what targets we're missing
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lib = lib_db[lib_id]
        lane_dict = make_lane_dict(lib_db, lib_id)
        
        for lane_key, sequences in lib['lanes'].items():
            for seq in sequences:
                # annotate the sequence with flowcell-level information
                seq.paired = lane_dict[seq.flowcell]['paired_end']
                lane_status = lane_dict[seq.flowcell]['status']

                # paired-end sequences with no read number are read 1
                if seq.paired and seq.read is None:
                    seq.read = 1
                filename_attributes = { 
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle
                    }
                # skip bad runs
                if lane_status == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                if seq.paired:
                    target_name = fastq_paired_template % filename_attributes
                else:
                    target_name = fastq_single_template % filename_attributes

                target_pathname = os.path.join(result_dir, target_name)
                # queue the conversion when missing (or always, if forced)
                if force or not os.path.exists(target_pathname):
                    t = needed_targets.setdefault(target_pathname, {})
                    t[seq.filetype] = seq

    return needed_targets
239
240
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into every result directory that lacks it."""
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)

    for lib_id, result_dir in library_result_map:
        destination = os.path.join(result_dir, base_daf)
        if os.path.exists(destination):
            continue
        os.link(daf_path, destination)
251
252
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a <result_dir>.ini submission file into each result directory.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      library_result_map (list): [(library_id, result directory), ...]
      paired (bool): kept for interface compatibility; currently unused.

    Raises:
      ValueError: when a file in a result directory matches none of the
        NameToViewMap patterns.
    """
    view_map = NameToViewMap(host, apidata)

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
                    'readType', 'mapAlgorithm', 'insertLength' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write other lines
        submission_files = os.listdir(result_dir)
        fastqs = {}
        # remember the attribute dict for each fastq extension so the
        # grouped fastq sections below can reuse it.  (Previously this
        # indexed the last file's attribute dict by the extension
        # string, which raised KeyError.)
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))

            ext = attributes["extension"]
            if attributes['view'] is None:
                # recognized but not part of the submission
                continue
            elif attributes.get("type", None) == 'fastq':
                # defer fastqs; all files sharing an extension are
                # emitted as one comma separated files= line
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line.
        for extension, fastq_set in fastqs.items():
            inifile.extend(
                make_submission_section(line_counter, 
                                        fastq_set,
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1

        stream = open(result_ini, 'w')
        try:
            stream.write(os.linesep.join(inifile))
        finally:
            # the handle was previously leaked; always close it
            stream.close()
303
304         
def make_lane_dict(lib_db, lib_id):
    """
    Convert the lane_set in a lib_db to a dictionary
    indexed by flowcell ID
    """
    return dict((lane['flowcell'], lane)
                for lane in lib_db[lib_id]['lane_set'])
314
315
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Generate a ddf for every submission ini, plus one dagman driver file."""
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        ininame = result_dir + '.ini'
        inipathname = os.path.join(result_dir, ininame)
        if not os.path.exists(inipathname):
            continue
        dag_fragment.extend(
            make_ddf(ininame, daf_name, True, make_condor, result_dir))

    if not make_condor or len(dag_fragment) == 0:
        return

    dag_filename = 'submission.dagman'
    if not force and os.path.exists(dag_filename):
        # refuse to clobber an existing dag unless forced
        logging.warn("%s exists, please delete" % (dag_filename,))
        return

    stream = open(dag_filename, 'w')
    stream.write(os.linesep.join(dag_fragment))
    stream.write(os.linesep)
    stream.close()
335             
336
def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
    """
    Make ddf files, and bonus condor file

    Args:
      ininame (str): submission ini file to convert into a ddf
      daf_name (str): daf filename to append to the submission file list
      guess_ddf (bool): if true, derive the ddf name from ininame and
        write there instead of stdout
      make_condor (bool): also write condor archive/upload submit files
      outdir (str or None): directory to chdir into while working

    Returns:
      list of dagman fragment lines chaining the archive and upload jobs
      (empty when make_condor is false).
    """
    dag_fragments = []
    # remember where we started so we can chdir back when done
    curdir = os.getcwd()
    if outdir is not None:
        os.chdir(outdir)
    output = sys.stdout
    ddf_name = None
    if guess_ddf:
        ddf_name = make_ddf_name(ininame)
        print ddf_name
        output = open(ddf_name,'w')

    # dump the ini as a tab-delimited ddf, collecting the data file names
    file_list = read_ddf_ini(ininame, output)

    # the daf (and the ddf itself, when written) ship with the archive
    file_list.append(daf_name)
    if ddf_name is not None:
        file_list.append(ddf_name)

    if make_condor:
        archive_condor = make_condor_archive_script(ininame, file_list)
        upload_condor = make_condor_upload_script(ininame)
        
        dag_fragments.extend( 
            make_dag_fragment(ininame, archive_condor, upload_condor)
        ) 
        
    os.chdir(curdir)
    
    return dag_fragments
369
370
def read_ddf_ini(filename, output=sys.stdout):
    """
    Read a ini file and dump out a tab delimited text file

    Returns the list of data files named in the sections' files= keys.
    """
    file_list = []
    config = SafeConfigParser()
    config.read(filename)

    order_by = shlex.split(config.get("config", "order_by"))

    # header row comes from the configured column ordering
    output.write("\t".join(order_by))
    output.write(os.linesep)
    for section in sorted(config.sections()):
        if section == "config":
            # the config block is metadata, not a data row
            continue
        row = []
        for key in order_by:
            value = config.get(section, key)
            row.append(value)
            if key == 'files':
                file_list.extend(parse_filelist(value))

        output.write("\t".join(row))
        output.write(os.linesep)
    return file_list
399
400
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames.

    For example:
      10000 result/foo/bar

    Blank lines and lines starting with '#' are skipped.

    Returns:
      list of (library_id, result_dir) tuples.
    """
    results = []
    stream = open(filename, 'r')
    try:
        for line in stream:
            line = line.rstrip()
            # skip comments and blank lines
            if not line.startswith('#') and len(line) > 0:
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    finally:
        # the stream was previously leaked; always close it
        stream.close()
    return results
418
419
def make_condor_archive_script(ininame, files):
    """Write a condor submit file that tars the submission files.

    Raises RuntimeError if any listed file is missing.
    Returns the name of the submit file written.
    """
    template = """Universe = vanilla

Executable = /bin/tar
arguments = czvf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    # refuse to write a submit file referencing files that do not exist
    for pathname in files:
        if not os.path.exists(pathname):
            raise RuntimeError("Missing %s" % (pathname,))

    # NOTE(review): os.getlogin() fails without a controlling terminal;
    # getpass.getuser() would be more robust — confirm before changing.
    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               'user': os.getlogin()}

    condor_script = make_condor_name(ininame, 'archive')
    stream = open(condor_script, 'w')
    stream.write(template % context)
    stream.close()
    return condor_script
447
448
def make_condor_upload_script(ininame):
    """Write a condor submit file that ftps the archive to UCSC.

    Returns the name of the submit file written.
    """
    template = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://detrout@encodeftp.cse.ucsc.edu/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    # NOTE(review): os.getlogin() fails without a controlling terminal;
    # getpass.getuser() would be more robust — confirm before changing.
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               'user': os.getlogin()}

    condor_script = make_condor_name(ininame, 'upload')
    stream = open(condor_script, 'w')
    stream.write(template % context)
    stream.close()
    return condor_script
471
472
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of fragments compress and then upload the data.
    """
    cwd = os.getcwd()
    job_basename = make_base_name(ininame)
    archive_path = os.path.join(cwd, archive_condor)
    upload_path = os.path.join(cwd, upload_condor)

    return [
        'JOB %s_archive %s' % (job_basename, archive_path),
        'JOB %s_upload %s' % (job_basename, upload_path),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
488
489
def get_library_info(host, apidata, library_id):
    """Fetch the library info dictionary from the htsworkflow server."""
    return api.retrieve_info(api.library_url(host, library_id), apidata)
494
495
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Return a condor queue fragment running srf2fastq for one srf file."""
    args = [srf2fastq.__file__, srf_file]
    if paired:
        args += ['--left', target_pathname]
        # this is ugly. I did it because I was pregenerating the target
        # names before I tried to figure out what sources could generate
        # those targets, and everything up to this point had been
        # one-to-one. So I couldn't figure out how to pair the 
        # target names. 
        # With this at least the command will run correctly.
        # however if we rename the default targets, this'll break
        # also I think it'll generate it twice.
        args += ['--right', target_pathname.replace('_r1.fastq', '_r2.fastq')]
    else:
        args += ['--single', target_pathname]

    if flowcell is not None:
        args += ['--flowcell', flowcell]
    if mid is not None:
        args += ['-m', str(mid)]
    if force:
        args += ['--force']

    return """
arguments="%s"
queue
""" % (" ".join(args),)
529
530
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Return a condor queue fragment running qseq2fastq for one qseq file."""
    args = [qseq2fastq.__file__, '-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args += ['-f', flowcell]

    return """
arguments="%s"
queue
""" % (" ".join(args))
542
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list): [(library_id, result directory), ...]

    Returns:
      dict of library_id -> library info dict; each entry gains a
      'lanes' key mapping (flowcell, lane_number) -> set of sequences.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
    # (flowcell, lane_number) -> library id that wants that lane
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        # collect the flowcell directories that need scanning
        for lane in lib_info['lane_set']:
            lane_key = (lane['flowcell'], lane['lane_number'])
            candidate_lanes[lane_key] = lib_id
            seq_dirs.add(os.path.join(sequences_path, 
                                         'flowcells', 
                                         lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
    candidate_seq_list = scan_for_sequences(seq_dirs)

    # at this point we have too many sequences as scan_for_sequences
    # returns all the sequences in a flowcell directory
    # so lets filter out the extras
    
    for seq in candidate_seq_list:
        lane_key = (seq.flowcell, seq.lane)
        lib_id = candidate_lanes.get(lane_key, None)
        if lib_id is not None:
            lib_info = lib_db[lib_id]
            lib_info['lanes'].setdefault(lane_key, set()).add(seq)
    
    return lib_db
581
582
class NameToViewMap(object):
    """Determine view attributes for a given submission file name
    """
    def __init__(self, root_url, apidata):
        """
        Args:
          root_url (str): root of the htsworkflow api server
          apidata (dict): id & key to post to the server
        """
        self.root_url = root_url
        self.apidata = apidata

        # cache of library info dictionaries, keyed by library id
        self.lib_cache = {}
        # cache of paired-end decisions, keyed by library id
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # (glob pattern, view name) pairs.  A view of None means the file
        # is recognized but not submitted.  The value may also be a
        # callable that picks the view from library attributes.
        # Patterns are tried in order, so more specific patterns must
        # come before general ones that would also match — in particular
        # '*.splices.bam' must precede '*.bam' or the Splices view can
        # never be selected.
        self.patterns = [
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('junctions.bed',           'Junctions'),
            ('*.jnct',                  'Junctions'),
            ('*.plus.bigwig',           'PlusSignal'),
            ('*.minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),
            ('cufflinks-0.9.0-genes.expr',       'GeneDeNovo'),
            ('cufflinks-0.9.0-transcripts.expr', 'TranscriptDeNovo'),
            ('cufflinks-0.9.0-transcripts.gtf',  'GeneModel'),
            ('GENCODE-v3c-genes.expr',       'GeneGencV3c'),
            ('GENCODE-v3c-transcripts.expr', 'TranscriptGencV3c'),
            ('GENCODE-v4-genes.expr',       'GeneGencV4'),
            ('GENCODE-v4-transcripts.expr', 'TranscriptGencV4'),
            ('GENCODE-v4-transcript.expr', 'TranscriptGencV4'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('*.stats.txt',             'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ]

        # attributes contributed by each view
        # (a duplicate "GeneModel" entry was removed)
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "GeneGencV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGencV4": {"MapAlgorithm": ma},
            "TranscriptGencV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v

    def find_attributes(self, pathname, lib_id):
        """Return the submission attribute dict for pathname, or None.

        The first entry in self.patterns that matches wins.

        :Args:
        pathname (str): the filename whose extension we are about to examine
        lib_id (str): library id used to look up cell line / replicate

        Returns None when no pattern recognizes the file.
        """
        if lib_id not in self.lib_cache:
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warn("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)

        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))

        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                if callable(view):
                    view = view(is_paired=is_paired)

                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes

        # fall through: no pattern matched
        return None

    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes
        """
        if is_paired:
            return "Paired"
        else:
            # previously returned "Align", but self.views has no "Align"
            # entry, so find_attributes raised KeyError for single ended
            # bam files; "Single" is the defined view.
            return "Single"

    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end"""
        # a library without lanes is treated as single ended
        if len(lib_info["lane_set"]) == 0:
            return False

        if lib_id not in self.lib_paired:
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison 
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1

            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            # majority vote; a tie is ambiguous and fatal
            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")

        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Return insertLength/readType attributes for a paired library."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warn(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Return insertLength/readType attributes for a single ended library."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
749
def make_submission_section(line_counter, files, attributes):
    """
    Create a section in the submission ini file
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files))]
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
760
761
def make_base_name(pathname):
    """Return the filename with directory and final extension removed."""
    return os.path.splitext(os.path.basename(pathname))[0]
766
767
def make_submission_name(ininame):
    """Return the tar archive name for a submission ini file."""
    stem, _ext = os.path.splitext(os.path.basename(ininame))
    return stem + ".tgz"
771
772
def make_ddf_name(pathname):
    """Return the ddf name corresponding to a submission ini file."""
    stem, _ext = os.path.splitext(os.path.basename(pathname))
    return stem + ".ddf"
776
777
def make_condor_name(pathname, run_type=None):
    """Return a condor submit file name: <base>[.<run_type>].condor"""
    stem = os.path.splitext(os.path.basename(pathname))[0]
    parts = [stem]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
785
786
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream): 
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str):
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # only open/close the file when we were handed a name, not a stream
    opened_here = type(target) in types.StringTypes
    if opened_here:
        f = open(target, "w")
    else:
        f = target

    f.write(header)
    for entry in body_list:
        f.write(entry)

    if opened_here:
        f.close()
812
def parse_filelist(file_string):
    """Split a comma separated file list into individual names."""
    return file_string.split(",")
815
816
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    for pathname in files:
        if os.path.exists(pathname):
            continue
        raise RuntimeError("%s does not exist" % (pathname,))
824
825
# allow running this module as a command line script
if __name__ == "__main__":
    main()