Update single vs paired guessing code
[htsworkflow.git] / extra / ucsc_encode_submission / ucsc_gather.py
1 #!/usr/bin/env python
2 from ConfigParser import SafeConfigParser
3 import fnmatch
4 from glob import glob
5 import json
6 import logging
7 import netrc
8 from optparse import OptionParser
9 import os
10 from pprint import pprint, pformat
11 import shlex
12 from StringIO import StringIO
13 import stat
14 from subprocess import Popen, PIPE
15 import sys
16 import time
17 import types
18 import urllib
19 import urllib2
20 import urlparse
21
22 from htsworkflow.util import api
23 from htsworkflow.pipelines.sequences import \
24     create_sequence_table, \
25     scan_for_sequences
26 from htsworkflow.pipelines import qseq2fastq
27 from htsworkflow.pipelines import srf2fastq
28
def main(cmdline=None):
    """Entry point: parse the command line and run the requested steps."""
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # choose a log level from the debugging flags
    if opts.debug:
        level = logging.DEBUG
    elif opts.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level)

    apidata = api.make_auth_from_opts(opts, parser)

    if opts.makeddf and opts.daf is None:
        parser.error("Please specify your daf when making ddf files")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")

    # each positional argument is a library-id -> result-dir map file
    library_result_map = []
    for map_filename in args:
        library_result_map.extend(read_library_result_map(map_filename))

    if opts.make_tree_from is not None:
        make_tree_from(opts.make_tree_from, library_result_map)

    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host,
                     apidata,
                     opts.sequence,
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
70
71
def make_parser():
    """Build the OptionParser describing this tool's command line."""
    parser = OptionParser()

    # commands
    parser.add_option('--make-tree-from',
                      help="create directories & link data files",
                      default=None)
    parser.add_option('--fastq', default=False, action="store_true",
                      help="generate scripts for making fastq files")
    parser.add_option('--ini', default=False, action="store_true",
                      help="generate submission ini file")
    parser.add_option('--makeddf', default=False, action="store_true",
                      help='make the ddfs')

    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    # htsworkflow server credentials (adds --host etc.)
    api.add_auth_options(parser)

    return parser
101
102
def make_tree_from(source_path, library_result_map):
    """Create a result tree by symlinking data files from source_path.

    Args:
      source_path (str): directory holding one subdirectory per result dir
      library_result_map (list): [(library_id, result_dir), ...]

    Raises:
      IOError: if an expected source directory or source file is missing.
    """
    for lib_id, lib_path in library_result_map:
        if not os.path.exists(lib_path):
            logging.info("Making dir {0}".format(lib_path))
            os.mkdir(lib_path)
        source_lib_dir = os.path.join(source_path, lib_path)
        # The original tested this path and did nothing with the result
        # (dead `if exists: pass`); fail early with a clear message
        # instead of letting os.listdir raise a less obvious error.
        if not os.path.exists(source_lib_dir):
            raise IOError("{0} does not exist".format(source_lib_dir))
        for filename in os.listdir(source_lib_dir):
            source_pathname = os.path.join(source_lib_dir, filename)
            target_pathname = os.path.join(lib_path, filename)
            if not os.path.exists(source_pathname):
                raise IOError("{0} does not exist".format(source_pathname))
            if not os.path.exists(target_pathname):
                os.symlink(source_pathname, target_pathname)
                logging.info(
                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
122     
def build_fastqs(host, apidata, sequences_path, library_result_map, 
                 force=False ):
    """
    Generate condor scripts to build any needed fastq files
    
    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      sequences_path (str): root of the directory tree to scan for files
      library_result_map (list):  [(library_id, destination directory), ...]
      force (bool): regenerate targets even if they already exist
    """
    qseq_condor_header = """
Universe=vanilla
executable=%(exe)s
error=log/qseq2fastq.err.$(process).log
output=log/qseq2fastq.out.$(process).log
log=log/qseq2fastq.log

""" % {'exe': sys.executable }
    qseq_condor_entries = []
    srf_condor_header = """
Universe=vanilla
executable=%(exe)s
output=log/srf_pair_fastq.out.$(process).log
error=log/srf_pair_fastq.err.$(process).log
log=log/srf_pair_fastq.log
environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"

""" % {'exe': sys.executable }
    srf_condor_entries = []
    lib_db = find_archive_sequence_files(host, 
                                         apidata, 
                                         sequences_path, 
                                         library_result_map)

    needed_targets = find_missing_targets(library_result_map, lib_db, force)

    for target_pathname, available_sources in needed_targets.items():
        logging.debug(' target : %s' % (target_pathname,))
        logging.debug(' candidate sources: %s' % (available_sources,))
        # prefer qseq sources over srf when both are available
        if 'qseq' in available_sources:
            source = available_sources['qseq']
            qseq_condor_entries.append(
                condor_qseq_to_fastq(source.path, 
                                     target_pathname, 
                                     source.flowcell,
                                     force=force)
            )
        elif 'srf' in available_sources:
            source = available_sources['srf']
            # mid_point is only set for the special-cased flowcell in
            # find_missing_targets, hence the getattr default
            mid = getattr(source, 'mid_point', None)
            srf_condor_entries.append(
                condor_srf_to_fastq(source.path, 
                                    target_pathname,
                                    source.paired,
                                    source.flowcell,
                                    mid,
                                    force=force)
            )
        else:
            # no known source type for this target; report it
            sys.stdout.write(" need file %s\n" % (target_pathname,))

    if len(srf_condor_entries) > 0:
        make_submit_script('srf.fastq.condor', 
                           srf_condor_header,
                           srf_condor_entries)

    if len(qseq_condor_entries) > 0:
        make_submit_script('qseq.fastq.condor', 
                           qseq_condor_header,
                           qseq_condor_entries)
194
195
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Work out which fastq targets still need to be generated.

    Computes the expected fastq name for every sequence and looks for
    it in the result directory; anything absent (or everything, when
    force is set) is returned as {target_pathname: {filetype: seq}}.

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
    """
    fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lib = lib_db[lib_id]
        lane_dict = make_lane_dict(lib_db, lib_id)

        for lane_key, sequences in lib['lanes'].items():
            for seq in sequences:
                flowcell_info = lane_dict[seq.flowcell]
                seq.paired = flowcell_info['paired_end']
                lane_status = flowcell_info['status']

                # paired-end sequences default to read 1 when unset
                if seq.paired and seq.read is None:
                    seq.read = 1

                # skip bad runs
                if lane_status == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                template = fastq_paired_template if seq.paired \
                           else fastq_single_template
                target_name = template % {
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle,
                    }

                target_pathname = os.path.join(result_dir, target_name)
                if force or not os.path.exists(target_pathname):
                    sources = needed_targets.setdefault(target_pathname, {})
                    sources[seq.filetype] = seq

    return needed_targets
246
247
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into every result directory.

    Args:
      daf_path (str): the daf file to share between submissions
      library_result_map (list): [(library_id, result_dir), ...]

    Raises:
      RuntimeError: if the daf or a result directory is missing.
    """
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)

    for lib_id, result_dir in library_result_map:
        if not os.path.exists(result_dir):
            raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
        submission_daf = os.path.join(result_dir, base_daf)
        # daf_path was already validated above; no need to re-check it
        # before every link (the original re-tested it in the loop)
        if not os.path.exists(submission_daf):
            os.link(daf_path, submission_daf)
262
263
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a <result_dir>.ini submission description in each result dir.

    Args:
      host (str): root of the htsworkflow api server
      apidata (dict): id & key to post to the server
      library_result_map (list): [(library_id, result_dir), ...]
      paired (bool): unused; kept for backward compatibility

    Raises:
      ValueError: when a file in a result dir matches no known pattern.
    """
    view_map = NameToViewMap(host, apidata)

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
                    'readType', 'mapAlgorithm', 'insertLength', 'md5sum' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write one section per submission file
        submission_files = os.listdir(result_dir)
        fastqs = {}
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))
            attributes['md5sum'] = "None"

            ext = attributes["extension"]
            if attributes['view'] is None:
                # recognized but not part of the submission
                continue
            elif attributes.get("type", None) == 'fastq':
                # batch fastqs by extension so each group shares a section
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                md5sum = make_md5sum(os.path.join(result_dir,f))
                if md5sum is not None:
                    attributes['md5sum']=md5sum
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line per extension
        for extension, fastq_files in fastqs.items():
            inifile.extend(
                make_submission_section(line_counter, 
                                        fastq_files,
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1

        # close the handle so contents are flushed (the original leaked it)
        stream = open(result_ini, 'w')
        try:
            stream.write(os.linesep.join(inifile))
        finally:
            stream.close()
320
321         
def make_lane_dict(lib_db, lib_id):
    """
    Convert the lane_set in a lib_db to a dictionary
    indexed by flowcell ID
    """
    return dict((lane['flowcell'], lane)
                for lane in lib_db[lib_id]['lane_set'])
331
332
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Create a ddf for every result dir that has an ini file.

    When make_condor is set, also writes 'submission.dagman' tying the
    archive/upload jobs together (refusing to overwrite an existing one
    unless force is set).
    """
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        ininame = result_dir + '.ini'
        inipathname = os.path.join(result_dir, ininame)
        # only directories that already have a submission ini get a ddf
        if not os.path.exists(inipathname):
            continue
        dag_fragment.extend(
            make_ddf(ininame, daf_name, True, make_condor, result_dir)
        )

    if make_condor and len(dag_fragment) > 0:
        dag_filename = 'submission.dagman'
        if not force and os.path.exists(dag_filename):
            logging.warn("%s exists, please delete" % (dag_filename,))
        else:
            stream = open(dag_filename, 'w')
            stream.write(os.linesep.join(dag_fragment))
            stream.write(os.linesep)
            stream.close()
352             
353
def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
    """
    Make ddf files, and bonus condor file

    Args:
      ininame (str): submission ini file to convert
      daf_name (str): daf filename appended to the archive file list
      guess_ddf (bool): derive the ddf name from ininame and write to it
        (otherwise the ddf content goes to stdout)
      make_condor (bool): also write condor archive/upload scripts
      outdir (str): if given, chdir here while working

    Returns:
      list of dagman fragment lines (empty unless make_condor)
    """
    dag_fragments = []
    curdir = os.getcwd()
    if outdir is not None:
        os.chdir(outdir)
    try:
        output = sys.stdout
        ddf_name = None
        if guess_ddf:
            ddf_name = make_ddf_name(ininame)
            sys.stdout.write("%s\n" % (ddf_name,))
            output = open(ddf_name, 'w')

        try:
            file_list = read_ddf_ini(ininame, output)
        finally:
            # close the ddf we opened (the original leaked the handle);
            # never close sys.stdout
            if output is not sys.stdout:
                output.close()

        logging.info(
            "Read config {0}, found files: {1}".format(
                ininame, ", ".join(file_list)))

        file_list.append(daf_name)
        if ddf_name is not None:
            file_list.append(ddf_name)

        if make_condor:
            archive_condor = make_condor_archive_script(ininame, file_list)
            upload_condor = make_condor_upload_script(ininame)

            dag_fragments.extend(
                make_dag_fragment(ininame, archive_condor, upload_condor)
            )
    finally:
        # always restore the working directory, even on error
        # (the original left the process in outdir on exceptions)
        os.chdir(curdir)

    return dag_fragments
389
390
def read_ddf_ini(filename, output=sys.stdout):
    """
    Read a ini file and dump out a tab delmited text file

    The column order comes from the 'order_by' key of the [config]
    section; every other section becomes one data row.  Returns the
    list of data files referenced by the 'files' columns.
    """
    file_list = []
    config = SafeConfigParser()
    config.read(filename)

    order_by = shlex.split(config.get("config", "order_by"))

    output.write("\t".join(order_by))
    output.write(os.linesep)
    for section in sorted(config.sections()):
        # the config block holds column order, not a data row
        if section == "config":
            continue
        values = []
        for key in order_by:
            value = config.get(section, key)
            values.append(value)
            if key == 'files':
                file_list.extend(parse_filelist(value))

        output.write("\t".join(values))
        output.write(os.linesep)
    return file_list
419
420
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames. 
    
    For example:
      10000 result/foo/bar

    Blank lines and lines starting with '#' are ignored.

    Returns:
      list of (library_id, result_dir) tuples
    """
    results = []
    # close the stream when done (the original leaked the handle)
    stream = open(filename, 'r')
    try:
        for line in stream:
            line = line.rstrip()
            if not line.startswith('#') and len(line) > 0:
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    finally:
        stream.close()
    return results
438
439
def make_condor_archive_script(ininame, files):
    """Write a condor submit script that tars up the submission files.

    Raises RuntimeError if any listed file is missing.  Returns the
    name of the condor script that was written.
    """
    script = """Universe = vanilla

Executable = /bin/tar
arguments = czvhf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s
environment="GZIP=-3"
request_memory = 20

queue 
"""
    # refuse to write a script referencing files that aren't there
    missing = [f for f in files if not os.path.exists(f)]
    if missing:
        raise RuntimeError("Missing %s" % (missing[0],))

    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               'user': os.getlogin()}

    condor_script = make_condor_name(ininame, 'archive')
    condor_stream = open(condor_script, 'w')
    condor_stream.write(script % context)
    condor_stream.close()
    return condor_script
469
470
def make_condor_upload_script(ininame):
    """Write a condor submit script that lftp's the archive to UCSC.

    Credentials for encodeftp.cse.ucsc.edu are read from ~diane/.netrc
    (NOTE(review): hard-coded user path; assumes that netrc exists).
    Returns the name of the condor script that was written.
    """
    script = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    encodeftp = 'encodeftp.cse.ucsc.edu'
    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
    # netrc entries are (login, account, password) triples
    login, _account, password = auth.hosts[encodeftp]
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               'user': os.getlogin(),
               'ftpuser': login,
               'ftppassword': password,
               'ftphost': encodeftp}

    condor_script = make_condor_name(ininame, 'upload')
    condor_stream = open(condor_script, 'w')
    condor_stream.write(script % context)
    condor_stream.close()
    # the script embeds the ftp password, so restrict it to
    # owner read/write only
    os.chmod(condor_script, stat.S_IREAD | stat.S_IWRITE)

    return condor_script
503
504
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of fragments compress and then upload the data.

    Returns dagman lines declaring the archive and upload jobs, with
    the upload made a child of the archive.
    """
    cur_dir = os.getcwd()
    job_basename = make_base_name(ininame)

    return [
        'JOB %s_archive %s' % (job_basename,
                               os.path.join(cur_dir, archive_condor)),
        'JOB %s_upload %s' % (job_basename,
                              os.path.join(cur_dir, upload_condor)),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
520
521
def get_library_info(host, apidata, library_id):
    """Fetch the library metadata record from the htsworkflow server."""
    return api.retrieve_info(api.library_url(host, library_id), apidata)
526
527
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Return a condor queue entry that converts one srf file to fastq."""
    args = [srf2fastq.__file__, srf_file]
    if paired:
        args.extend(['--left', target_pathname])
        # this is ugly. I did it because I was pregenerating the target
        # names before I tried to figure out what sources could generate
        # those targets, and everything up to this point had been
        # one-to-one. So I couldn't figure out how to pair the 
        # target names. 
        # With this at least the command will run correctly.
        # however if we rename the default targets, this'll break
        # also I think it'll generate it twice.
        args.extend(['--right',
                     target_pathname.replace('_r1.fastq', '_r2.fastq')])
    else:
        args.extend(['--single', target_pathname])

    if flowcell is not None:
        args.extend(['--flowcell', flowcell])
    if mid is not None:
        args.extend(['-m', str(mid)])
    if force:
        args.extend(['--force'])

    return """
arguments="%s"
queue
""" % (" ".join(args),)
561
562
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Return a condor queue entry that converts one qseq file to fastq.

    Note: 'force' is accepted for signature parity with the srf variant
    but is not passed through to qseq2fastq.
    """
    args = [qseq2fastq.__file__, '-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args.extend(['-f', flowcell])

    return """
arguments="%s"
queue
""" % (" ".join(args),)
574
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    Returns a dict of library_id -> library info record; each record
    gains a 'lanes' dict of (flowcell, lane) -> set of sequences.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        for lane in lib_info['lane_set']:
            candidate_lanes[(lane['flowcell'], lane['lane_number'])] = lib_id
            seq_dirs.add(os.path.join(sequences_path,
                                      'flowcells',
                                      lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))

    # scan_for_sequences returns every sequence in a flowcell directory,
    # so keep only the lanes belonging to one of our libraries
    for seq in scan_for_sequences(seq_dirs):
        lib_id = candidate_lanes.get((seq.flowcell, seq.lane), None)
        if lib_id is not None:
            lib_db[lib_id]['lanes'].setdefault(
                (seq.flowcell, seq.lane), set()).add(seq)

    return lib_db
613
614
class NameToViewMap(object):
    """Determine view attributes for a given submission file name
    """
    def __init__(self, root_url, apidata):
        # root_url: base of the htsworkflow api server
        # apidata: id & key dict posted with api requests
        self.root_url = root_url
        self.apidata = apidata
        
        # cache of library_id -> library info record fetched from the server
        self.lib_cache = {}
        # cache of library_id -> paired-end guess (see _is_paired)
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # (glob pattern, view) pairs tried in order by find_attributes;
        # the FIRST match wins, so specific patterns must stay above
        # generic ones.  view None means "recognized but not submitted";
        # a callable view is invoked with is_paired to pick the name.
        self.patterns = [
            # for 2011 Feb 18 elements submission
            ('final_Cufflinks_genes_*gtf',       'GeneDeNovo'),
            ('final_Cufflinks_transcripts_*gtf', 'TranscriptDeNovo'),
            ('final_exonFPKM-Cufflinks-0.9.3-GENCODE-v3c-*.gtf',       
             'ExonsGencV3c'),
            ('final_GENCODE-v3-Cufflinks-0.9.3.genes-*gtf',          
             'GeneGencV3c'),
            ('final_GENCODE-v3-Cufflinks-0.9.3.transcripts-*gtf',    
             'TranscriptGencV3c'),
            ('final_TSS-Cufflinks-0.9.3-GENCODE-v3c-*.gtf', 'TSS'),
            ('final_junctions-*.bed6+3',                    'Junctions'),
            
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('junctions.bed',           'Junctions'),
            ('*.jnct',                  'Junctions'),
            ('*unique.bigwig',         None),
            ('*plus.bigwig',           'PlusSignal'),
            ('*minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),

            ('*ufflinks?0.9.3.genes.gtf',       'GeneDeNovo'),
            ('*ufflinks?0.9.3.transcripts.gtf', 'TranscriptDeNovo'),
            ('*GENCODE-v3c.exonFPKM.gtf',        'ExonsGencV3c'),
            ('*GENCODE-v3c.genes.gtf',           'GeneGencV3c'),
            ('*GENCODE-v3c.transcripts.gtf',     'TranscriptGencV3c'),
            ('*GENCODE-v3c.TSS.gtf',             'TSS'),
            ('*.junctions.bed6+3',                'Junctions'),
            
            ('*.?ufflinks-0.9.0?genes.expr',       'GeneDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.expr', 'TranscriptDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.gtf',  'GeneModel'),

            ('*.GENCODE-v3c?genes.expr',       'GeneGCV3c'),
            ('*.GENCODE-v3c?transcript*.expr', 'TranscriptGCV3c'),
            ('*.GENCODE-v3c?transcript*.gtf',  'TranscriptGencV3c'),
            ('*.GENCODE-v4?genes.expr',        None), #'GeneGCV4'),
            ('*.GENCODE-v4?transcript*.expr',  None), #'TranscriptGCV4'),
            ('*.GENCODE-v4?transcript*.gtf',   None), #'TranscriptGencV4'),
            ('*_1.75mers.fastq',              'FastqRd1'),
            ('*_2.75mers.fastq',              'FastqRd2'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('*.md5',                   None),
            ('paired-end-distribution*', 'InsLength'),
            ('*.stats.txt',              'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ('transfer_log',            None),
            ]

        # per-view default attributes; fastq views also carry a 'type'
        # marker so make_submission_ini can batch them up
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Aligns": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "ExonsGencV3c": {"MapAlgorithm": ma},
            "GeneGencV3c": {"MapAlgorithm": ma},
            "TSS": {"MapAlgorithm": ma},
            "GeneGCV3c": {"MapAlgorithm": ma},
            "TranscriptGCV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGCV4": {"MapAlgorithm": ma},
            "TranscriptGCV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v
            
    def find_attributes(self, pathname, lib_id):
        """Return the submission attributes for pathname, or None.

        Tries each entry of self.patterns in order and returns the
        attribute dict for the first glob that matches; falls off the
        end (returning None implicitly) when nothing matches.

        :Args:
        pathname (str): the file name to classify
        lib_id (str): library the file belongs to (used for cell,
            replicate and paired-end attributes)
        """
        path, filename = os.path.splitext(pathname)
        # fetch-and-cache the library record from the server
        if not self.lib_cache.has_key(lib_id):
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warn("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)
        
        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))
            
        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                if callable(view):
                    # e.g. _guess_bam_view picks Paired vs Aligns
                    view = view(is_paired=is_paired)
                    
                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes


    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes
        """
        if is_paired:
            return "Paired"
        else:
            return "Aligns"


    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end"""
        # TODO: encode this information in the library type page.
        single = (1,3,6)
        if len(lib_info["lane_set"]) == 0:
            # we haven't sequenced anything so guess based on library type
            if lib_info['library_type_id'] in single:
                return False
            else:
                return True

        # majority vote over the library's flowcells, cached per library
        if not self.lib_paired.has_key(lib_id):
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison 
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                    pass
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1
                    
            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                # a tie is ambiguous; refuse to guess
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")
            
        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Return insertLength/readType attributes for a paired library."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warn(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Return insertLength/readType attributes for a single-end library."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
820
def make_submission_section(line_counter, files, attributes):
    """
    Create a section in the submission ini file

    The section is returned as a list of lines: a [lineN] header,
    a comma separated files= entry, then one key=value line per attribute.
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files))]
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
831
832
def make_base_name(pathname):
    """Return the filename portion of pathname without its extension."""
    return os.path.splitext(os.path.basename(pathname))[0]
837
838
def make_submission_name(ininame):
    """Name the submission tarball after the ini file it was built from."""
    return "%s.tgz" % (make_base_name(ininame),)
842
843
def make_ddf_name(pathname):
    """Name the ddf file after the file it was derived from."""
    return "%s.ddf" % (make_base_name(pathname),)
847
848
def make_condor_name(pathname, run_type=None):
    """Build a condor script name: <base>[.<run_type>].condor"""
    parts = [make_base_name(pathname)]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
856
857
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream): 
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str);
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # isinstance is the idiomatic membership test; the original
    # "type(target) in types.StringTypes" worked but reads oddly.
    opened_here = isinstance(target, types.StringTypes)
    if opened_here:
        f = open(target, "w")
    else:
        f = target
    try:
        f.write(header)
        for entry in body_list:
            f.write(entry)
    finally:
        # only close files we opened; a failed write no longer leaks
        # the handle (the original skipped close() on exception)
        if opened_here:
            f.close()
883
def parse_filelist(file_string):
    """Split a comma separated string of filenames into a list."""
    return file_string.split(",")
886
887
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    missing = [f for f in files if not os.path.exists(f)]
    if missing:
        raise RuntimeError("%s does not exist" % (missing[0],))
895
def make_md5sum(filename):
    """Quickly find the md5sum of a file

    Reads the sum from a <filename>.md5 cache file when one exists,
    otherwise computes it with the unix md5sum tool (which also writes
    the cache).  Returns the hex digest, or None on failure.
    """
    # os.path.join with a single argument was a no-op; plain
    # concatenation says what is meant.  The stray debug
    # "print md5_cache" statement has been removed.
    md5_cache = filename + ".md5"
    if os.path.exists(md5_cache):
        logging.debug("Found md5sum in {0}".format(md5_cache))
        stream = open(md5_cache, 'r')
        try:
            lines = stream.readlines()
        finally:
            # the original never closed the cache file
            stream.close()
        md5sum = parse_md5sum_line(lines, filename)
    else:
        md5sum = make_md5sum_unix(filename, md5_cache)
    return md5sum
909     
def make_md5sum_unix(filename, md5_cache):
    """Compute filename's md5sum with the unix md5sum tool.

    On success the raw md5sum output is written to md5_cache so later
    calls can reuse it.  Returns the hex digest, or None on failure.
    """
    cmd = ["md5sum", filename]
    logging.debug("Running {0}".format(" ".join(cmd)))
    p = Popen(cmd, stdout=PIPE)
    # communicate() returns (stdout, stderr); the original mislabeled
    # the captured output "stdin", which was misleading though harmless.
    stdout, stderr = p.communicate()
    retcode = p.wait()
    logging.debug("Finished {0} retcode {1}".format(" ".join(cmd), retcode))
    if retcode != 0:
        logging.error("Trouble with md5sum for {0}".format(filename))
        return None
    lines = stdout.split(os.linesep)
    md5sum = parse_md5sum_line(lines, filename)
    if md5sum is not None:
        logging.debug("Caching sum in {0}".format(md5_cache))
        stream = open(md5_cache, "w")
        try:
            stream.write(stdout)
        finally:
            # close even if the write fails; the original leaked the
            # handle until garbage collection
            stream.close()
    return md5sum
928
def parse_md5sum_line(lines, filename):
    """Extract the md5 digest for filename from md5sum output lines.

    Expects the first line to look like "<hexdigest>  <filename>".
    Returns the digest, or None (with an error logged) when the line
    names a different file.
    """
    # split at most once so filenames containing spaces survive
    # (plain split() broke them); strip the trailing newline that
    # readlines()/split(os.linesep) may leave behind.
    md5sum, md5sum_filename = lines[0].split(None, 1)
    md5sum_filename = md5sum_filename.strip()
    if md5sum_filename != filename:
        # "disagre" typo in the original message fixed
        errmsg = "MD5sum and I disagree about filename. {0} != {1}"
        logging.error(errmsg.format(filename, md5sum_filename))
        return None
    return md5sum
936
# standard script entry point; main() parses sys.argv when run directly
if __name__ == "__main__":
    main()