2 from ConfigParser import SafeConfigParser
8 from optparse import OptionParser
10 from pprint import pprint, pformat
12 from StringIO import StringIO
14 from subprocess import Popen, PIPE
22 from htsworkflow.util import api
23 from htsworkflow.pipelines.sequences import \
24 create_sequence_table, \
26 from htsworkflow.pipelines import qseq2fastq
27 from htsworkflow.pipelines import srf2fastq
# Entry point: parse command-line options, configure logging verbosity,
# require the host/apiid/apikey triple, and run whichever submission
# steps were requested (tree building, daf linking, fastq condor script
# generation, submission .ini files, ddf files).
29 def main(cmdline=None):
30 parser = make_parser()
31 opts, args = parser.parse_args(cmdline)
# Logging escalates with --debug / --verbose; default is WARNING.
# NOTE(review): these basicConfig calls presumably sit in an if/elif
# chain in the full source (only the first call takes effect) — confirm.
34 logging.basicConfig(level = logging.DEBUG )
36 logging.basicConfig(level = logging.INFO )
38 logging.basicConfig(level = logging.WARNING )
# Credentials posted to the htsworkflow API with every request.
40 apidata = {'apiid': opts.apiid, 'apikey': opts.apikey }
42 if opts.host is None or opts.apiid is None or opts.apikey is None:
43 parser.error("Please specify host url, apiid, apikey")
45 if opts.makeddf and opts.daf is None:
46 parser.error("Please specify your daf when making ddf files")
49 parser.error("I need at least one library submission-dir input file")
# Each positional argument is a file mapping library id -> result dir;
# all of them are concatenated into one list of (lib_id, dir) tuples.
51 library_result_map = []
53 library_result_map.extend(read_library_result_map(a))
55 if opts.make_tree_from is not None:
56 make_tree_from(opts.make_tree_from, library_result_map)
58 if opts.daf is not None:
59 link_daf(opts.daf, library_result_map)
# Remaining steps are presumably gated on --fastq / --ini / --makeddf
# flags (gating lines not visible here) — confirm against full source.
62 build_fastqs(opts.host,
69 make_submission_ini(opts.host, apidata, library_result_map)
72 make_all_ddfs(library_result_map, opts.daf, force=opts.force)
# Body of make_parser(): read option defaults from the user or system
# htsworkflow ini files, then build the OptionParser describing the
# command-line interface used by main().
76 # Load defaults from the config files
77 config = SafeConfigParser()
78 config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])
80 sequence_archive = None
84 SECTION = 'sequence_archive'
85 if config.has_section(SECTION):
# NOTE(review): ConfigParser.get()'s third positional argument is 'raw',
# not a default value — passing the fallback variable here does not
# supply a default and raises NoOptionError when the option is absent.
# Should be guarded with has_option() or try/except. Applies to the
# four get() calls below.
86 sequence_archive = config.get(SECTION, 'sequence_archive',sequence_archive)
87 sequence_archive = os.path.expanduser(sequence_archive)
88 apiid = config.get(SECTION, 'apiid', apiid)
89 apikey = config.get(SECTION, 'apikey', apikey)
90 apihost = config.get(SECTION, 'host', apihost)
92 parser = OptionParser()
# Commands: each option below selects a pipeline stage to run.
95 parser.add_option('--make-tree-from',
96 help="create directories & link data files",
98 parser.add_option('--fastq', help="generate scripts for making fastq files",
99 default=False, action="store_true")
101 parser.add_option('--ini', help="generate submission ini file", default=False,
104 parser.add_option('--makeddf', help='make the ddfs', default=False,
107 parser.add_option('--daf', default=None, help='specify daf name')
108 parser.add_option('--force', default=False, action="store_true",
109 help="Force regenerating fastqs")
111 # configuration options
112 parser.add_option('--apiid', default=apiid, help="Specify API ID")
113 parser.add_option('--apikey', default=apikey, help="Specify API KEY")
114 parser.add_option('--host', default=apihost,
115 help="specify HTSWorkflow host",)
116 parser.add_option('--sequence', default=sequence_archive,
117 help="sequence repository")
# Debugging options control the logging level chosen in main().
120 parser.add_option('--verbose', default=False, action="store_true",
121 help='verbose logging')
122 parser.add_option('--debug', default=False, action="store_true",
123 help='debug logging')
# Mirror a source tree into the local result directories: for each
# (lib_id, lib_path) pair, create lib_path if needed, then symlink every
# file found in source_path/lib_path into it.
128 def make_tree_from(source_path, library_result_map):
129 """Create a tree using data files from source path.
131 for lib_id, lib_path in library_result_map:
132 if not os.path.exists(lib_path):
133 logging.info("Making dir {0}".format(lib_path))
135 source_lib_dir = os.path.join(source_path, lib_path)
136 if os.path.exists(source_lib_dir):
138 for filename in os.listdir(source_lib_dir):
139 source_pathname = os.path.join(source_lib_dir, filename)
140 target_pathname = os.path.join(lib_path, filename)
# Defensive check: listdir just returned this name, so a miss here
# indicates the source tree changed underneath us.
141 if not os.path.exists(source_pathname):
142 raise IOError("{0} does not exist".format(source_pathname))
# Existing targets are left untouched; only missing links are made.
143 if not os.path.exists(target_pathname):
144 os.symlink(source_pathname, target_pathname)
# Continuation of a logging call (opening line not visible here).
146 'LINK {0} to {1}'.format(source_pathname, target_pathname))
# Generate condor submit scripts that convert archived qseq/srf files
# into the fastq files the submission directories still need.
148 def build_fastqs(host, apidata, sequences_path, library_result_map,
151 Generate condor scripts to build any needed fastq files
154 host (str): root of the htsworkflow api server
155 apidata (dict): id & key to post to the server
156 sequences_path (str): root of the directory tree to scan for files
157 library_result_map (list): [(library_id, destination directory), ...]
# Condor header for qseq->fastq jobs; %(exe)s is this interpreter.
159 qseq_condor_header = """
162 error=log/qseq2fastq.err.$(process).log
163 output=log/qseq2fastq.out.$(process).log
164 log=log/qseq2fastq.log
166 """ % {'exe': sys.executable }
167 qseq_condor_entries = []
# Condor header for srf->fastq jobs; note the hard-coded user paths in
# the environment line (site-specific, fragile outside that host).
168 srf_condor_header = """
171 output=log/srf_pair_fastq.out.$(process).log
172 error=log/srf_pair_fastq.err.$(process).log
173 log=log/srf_pair_fastq.log
174 environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"
176 """ % {'exe': sys.executable }
177 srf_condor_entries = []
178 lib_db = find_archive_sequence_files(host,
# Map of fastq target path -> {filetype: source sequence} still missing.
183 needed_targets = find_missing_targets(library_result_map, lib_db, force)
185 for target_pathname, available_sources in needed_targets.items():
186 logging.debug(' target : %s' % (target_pathname,))
187 logging.debug(' candidate sources: %s' % (available_sources,))
# Prefer qseq sources over srf when both exist (dict.has_key: Python 2).
188 if available_sources.has_key('qseq'):
189 source = available_sources['qseq']
190 qseq_condor_entries.append(
191 condor_qseq_to_fastq(source.path,
196 elif available_sources.has_key('srf'):
197 source = available_sources['srf']
# mid_point is optional srf metadata marking the read-1/read-2 split.
198 mid = getattr(source, 'mid_point', None)
199 srf_condor_entries.append(
200 condor_srf_to_fastq(source.path,
# No usable source at all: report it (Python 2 print statement).
208 print " need file", target_pathname
# Only write the submit scripts when there is at least one job.
210 if len(srf_condor_entries) > 0:
211 make_submit_script('srf.fastq.condor',
215 if len(qseq_condor_entries) > 0:
216 make_submit_script('qseq.fastq.condor',
# Work out which fastq target files do not yet exist (or are forced to
# be regenerated) and map each target path to its candidate sources.
221 def find_missing_targets(library_result_map, lib_db, force=False):
223 Check if the sequence file exists.
224 This requires computing what the sequence name is and checking
225 to see if it can be found in the sequence location.
227 Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
# Filename templates; the paired form carries an _r%(read)s suffix.
229 fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
230 fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
231 # find what targets we're missing
233 for lib_id, result_dir in library_result_map:
235 lane_dict = make_lane_dict(lib_db, lib_id)
237 for lane_key, sequences in lib['lanes'].items():
238 for seq in sequences:
# Side effect promised in the docstring: annotate each sequence with
# its flowcell's paired_end flag.
239 seq.paired = lane_dict[seq.flowcell]['paired_end']
240 lane_status = lane_dict[seq.flowcell]['status']
# A paired flowcell sequence without a read number is presumably
# skipped or defaulted on the missing lines — confirm in full source.
242 if seq.paired and seq.read is None:
244 filename_attributes = {
245 'flowcell': seq.flowcell,
252 if lane_status == 'Failed':
254 if seq.flowcell == '30DY0AAXX':
255 # 30DY0 only ran for 151 bases instead of 152
256 # it is actually 76 1st read, 75 2nd read
261 target_name = fastq_paired_template % filename_attributes
263 target_name = fastq_single_template % filename_attributes
265 target_pathname = os.path.join(result_dir, target_name)
# force=True regenerates even existing targets.
266 if force or not os.path.exists(target_pathname):
267 t = needed_targets.setdefault(target_pathname, {})
# Keyed by filetype ('qseq'/'srf') so build_fastqs can pick a source.
268 t[seq.filetype] = seq
270 return needed_targets
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into every library result directory.

    Arguments:
        daf_path: path to the daf file that should appear in each dir
        library_result_map: iterable of (library_id, result_dir) pairs

    Raises RuntimeError if the daf or any target directory is missing.
    Existing links are left alone.
    """
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)

    for _lib_id, target_dir in library_result_map:
        if not os.path.exists(target_dir):
            raise RuntimeError("Couldn't find target directory %s" %(target_dir,))
        destination = os.path.join(target_dir, base_daf)
        # Already linked: nothing to do for this directory.
        if os.path.exists(destination):
            continue
        # Re-check the source right before linking.
        if not os.path.exists(daf_path):
            raise RuntimeError("Couldn't find daf: %s" %(daf_path,))
        os.link(daf_path, destination)
# Write a <result_dir>.ini file in each result directory describing
# every submission file (view, replicate, md5sum, ...) one section per
# file, with all fastqs of a given extension grouped onto one line.
289 def make_submission_ini(host, apidata, library_result_map, paired=True):
290 #attributes = get_filename_attribute_map(paired)
291 view_map = NameToViewMap(host, apidata)
293 candidate_fastq_src = {}
295 for lib_id, result_dir in library_result_map:
# The config section lists the attribute ordering used by read_ddf_ini.
296 order_by = ['order_by=files', 'view', 'replicate', 'cell',
297 'readType', 'mapAlgorithm', 'insertLength', 'md5sum' ]
298 inifile = ['[config]']
299 inifile += [" ".join(order_by)]
302 result_ini = os.path.join(result_dir, result_dir+'.ini')
# Every file present in the result directory is classified.
305 submission_files = os.listdir(result_dir)
307 fastq_attributes = {}
308 for f in submission_files:
309 attributes = view_map.find_attributes(f, lib_id)
310 if attributes is None:
311 raise ValueError("Unrecognized file: %s" % (f,))
312 attributes['md5sum'] = "None"
314 ext = attributes["extension"]
# view None means "recognized but not submitted" — skipped (else
# branch lines not visible here).
315 if attributes['view'] is None:
317 elif attributes.get("type", None) == 'fastq':
# Fastqs are batched per extension and emitted after this loop.
318 fastqs.setdefault(ext, set()).add(f)
319 fastq_attributes[ext] = attributes
321 md5sum = make_md5sum(os.path.join(result_dir,f))
322 if md5sum is not None:
323 attributes['md5sum']=md5sum
325 make_submission_section(line_counter,
332 # add in fastqs on a single line.
334 for extension, fastq_files in fastqs.items():
336 make_submission_section(line_counter,
338 fastq_attributes[extension])
343 f = open(result_ini,'w')
# NOTE(review): no explicit close/flush visible for this handle —
# presumably on the missing following line; confirm in full source.
344 f.write(os.linesep.join(inifile))
# Index a library's lane_set by flowcell id. Builds (flowcell, lane)
# pairs; presumably converted with dict() on the missing return line,
# so later lanes on the same flowcell would overwrite earlier ones.
347 def make_lane_dict(lib_db, lib_id):
349 Convert the lane_set in a lib_db to a dictionary
350 indexed by flowcell ID
353 for lane in lib_db[lib_id]['lane_set']:
354 result.append((lane['flowcell'], lane))
# Generate a ddf (and condor scripts) for every result directory that
# has a submission ini, then write a DAGMan file chaining the condor
# archive/upload jobs together.
358 def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
360 for lib_id, result_dir in library_result_map:
361 ininame = result_dir+'.ini'
362 inipathname = os.path.join(result_dir, ininame)
# Directories without an ini (make_submission_ini not run) are skipped.
363 if os.path.exists(inipathname):
365 make_ddf(ininame, daf_name, True, make_condor, result_dir)
368 if make_condor and len(dag_fragment) > 0:
369 dag_filename = 'submission.dagman'
# Refuse to clobber an existing dag file unless --force was given.
370 if not force and os.path.exists(dag_filename):
371 logging.warn("%s exists, please delete" % (dag_filename,))
373 f = open(dag_filename,'w')
374 f.write( os.linesep.join(dag_fragment))
375 f.write( os.linesep )
# Convert one submission ini into a tab-delimited ddf file, and emit
# the condor archive/upload scripts plus the DAG fragment tying them
# together. Returns dag_fragments (return line not visible here).
379 def make_ddf(ininame, daf_name, guess_ddf=False, make_condor=False, outdir=None):
381 Make ddf files, and bonus condor file
# When outdir is given, work is presumably done with outdir as the
# current directory (chdir lines not visible) — confirm in full source.
385 if outdir is not None:
390 ddf_name = make_ddf_name(ininame)
392 output = open(ddf_name,'w')
# read_ddf_ini writes the tab-delimited rows into `output` and returns
# the list of data files it referenced.
394 file_list = read_ddf_ini(ininame, output)
396 "Read config {0}, found files: {1}".format(
397 ininame, ", ".join(file_list)))
# The daf and the ddf itself also go into the archive tarball.
399 file_list.append(daf_name)
400 if ddf_name is not None:
401 file_list.append(ddf_name)
404 archive_condor = make_condor_archive_script(ininame, file_list)
405 upload_condor = make_condor_upload_script(ininame)
407 dag_fragments.extend(
408 make_dag_fragment(ininame, archive_condor, upload_condor)
# Read a submission ini and stream it out as a tab-delimited table:
# one header row from the config section's order_by, then one row per
# [lineN] section. Returns the accumulated file list (return line not
# visible here).
416 def read_ddf_ini(filename, output=sys.stdout):
418 Read a ini file and dump out a tab delmited text file
421 config = SafeConfigParser()
422 config.read(filename)
# order_by names the columns and their order for every data row.
424 order_by = shlex.split(config.get("config", "order_by"))
426 output.write("\t".join(order_by))
427 output.write(os.linesep)
428 sections = config.sections()
430 for section in sections:
431 if section == "config":
432 # skip the config block
436 v = config.get(section, key)
# The 'files' value is comma-separated; its entries are collected so
# make_ddf can archive them.
439 file_list.extend(parse_filelist(v))
441 output.write("\t".join(values))
442 output.write(os.linesep)
# Parse a whitespace-delimited "library_id result_dir" file into a list
# of (library_id, result_dir) tuples; '#' lines are comments.
446 def read_library_result_map(filename):
448 Read a file that maps library id to result directory.
449 Does not support spaces in filenames.
454 stream = open(filename,'r')
# Per-line loop (loop header not visible here); blank and comment
# lines are skipped.
459 if not line.startswith('#') and len(line) > 0 :
460 library_id, result_dir = line.split()
461 results.append((library_id, result_dir))
# Write a condor submit script that tars up the submission files
# (tar czvhf, following symlinks, gzip level 3) into ../<archivename>.
# Returns the script path (return line not visible here).
465 def make_condor_archive_script(ininame, files):
466 script = """Universe = vanilla
468 Executable = /bin/tar
469 arguments = czvhf ../%(archivename)s %(filelist)s
471 Error = compress.err.$(Process).log
472 Output = compress.out.$(Process).log
473 Log = /tmp/submission-compress-%(user)s.log
474 initialdir = %(initialdir)s
475 environment="GZIP=-3"
# Fail fast if any file in the archive list is missing (inside a loop
# over `files`; the loop header is not visible here).
481 if not os.path.exists(f):
482 raise RuntimeError("Missing %s" % (f,))
484 context = {'archivename': make_submission_name(ininame),
485 'filelist': " ".join(files),
486 'initialdir': os.getcwd(),
487 'user': os.getlogin()}
489 condor_script = make_condor_name(ininame, 'archive')
490 condor_stream = open(condor_script,'w')
491 condor_stream.write(script % context)
492 condor_stream.close()
# Write a condor submit script that uploads the archive tarball to the
# ENCODE FTP site via lftp, pulling credentials from a .netrc file.
# Returns the script path (return line not visible here).
496 def make_condor_upload_script(ininame):
497 script = """Universe = vanilla
499 Executable = /usr/bin/lftp
# SECURITY(review): the ftp password is interpolated into the condor
# job's argument line in plain text, visible in the script file and
# process listings.
500 arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s
502 Error = upload.err.$(Process).log
503 Output = upload.out.$(Process).log
504 Log = /tmp/submission-upload-%(user)s.log
505 initialdir = %(initialdir)s
# Credentials come from a hard-coded user's .netrc — site-specific.
509 auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
511 encodeftp = 'encodeftp.cse.ucsc.edu'
# netrc entries are (login, account, password) tuples.
512 ftpuser = auth.hosts[encodeftp][0]
513 ftppassword = auth.hosts[encodeftp][2]
514 context = {'archivename': make_submission_name(ininame),
515 'initialdir': os.getcwd(),
516 'user': os.getlogin(),
518 'ftppassword': ftppassword,
519 'ftphost': encodeftp}
521 condor_script = make_condor_name(ininame, 'upload')
522 condor_stream = open(condor_script,'w')
523 condor_stream.write(script % context)
524 condor_stream.close()
# Restrict the script (it contains the password) to owner read/write.
525 os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)
# Produce DAGMan lines declaring the archive and upload jobs for one
# submission and ordering archive before upload. Returns the fragment
# list (return line not visible here).
530 def make_dag_fragment(ininame, archive_condor, upload_condor):
532 Make the couple of fragments compress and then upload the data.
# DAGMan needs absolute paths to the submit files.
534 cur_dir = os.getcwd()
535 archive_condor = os.path.join(cur_dir, archive_condor)
536 upload_condor = os.path.join(cur_dir, upload_condor)
537 job_basename = make_base_name(ininame)
540 fragments.append('JOB %s_archive %s' % (job_basename, archive_condor))
541 fragments.append('JOB %s_upload %s' % (job_basename, upload_condor))
542 fragments.append('PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename))
# Fetch one library's metadata from the htsworkflow API; presumably
# returns `contents` (return line not visible here).
547 def get_library_info(host, apidata, library_id):
548 url = api.library_url(host, library_id)
549 contents = api.retrieve_info(url, apidata)
# Build the condor queue entry (argument string) for one srf->fastq
# conversion via the srf2fastq script. Paired runs emit --left/--right
# targets; single runs emit --single.
553 def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
554 mid=None, force=False):
555 py = srf2fastq.__file__
556 args = [ py, srf_file, ]
558 args.extend(['--left', target_pathname])
559 # this is ugly. I did it because I was pregenerating the target
560 # names before I tried to figure out what sources could generate
561 # those targets, and everything up to this point had been
562 # one-to-one. So I couldn't figure out how to pair the
564 # With this at least the command will run correctly.
565 # however if we rename the default targets, this'll break
566 # also I think it'll generate it twice.
# The right-read target is derived by string substitution on the
# left-read name — fragile, as the author's comment above admits.
567 args.extend(['--right',
568 target_pathname.replace('_r1.fastq', '_r2.fastq')])
570 args.extend(['--single', target_pathname ])
571 if flowcell is not None:
572 args.extend(['--flowcell', flowcell])
# mid: optional read-boundary position within the srf record.
575 args.extend(['-m', str(mid)])
578 args.extend(['--force'])
# Tail of the condor entry template (opening lines not visible here).
583 """ % (" ".join(args),)
# Build the condor queue entry for one qseq->fastq conversion via the
# qseq2fastq script; returns the formatted entry (template opening
# lines not visible here).
588 def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
589 py = qseq2fastq.__file__
590 args = [py, '-i', qseq_file, '-o', target_pathname ]
591 if flowcell is not None:
592 args.extend(['-f', flowcell])
596 """ % (" ".join(args))
# Scan the sequence archive for files belonging to the lanes of the
# libraries we are submitting, grouping matches into lib_db[lib_id]
# ['lanes'][(flowcell, lane)] sets. Returns lib_db (return line not
# visible here).
600 def find_archive_sequence_files(host, apidata, sequences_path,
603 Find all the archive sequence files possibly associated with our results.
606 logging.debug("Searching for sequence files in: %s" %(sequences_path,))
610 #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
612 for lib_id, result_dir in library_result_map:
613 lib_info = get_library_info(host, apidata, lib_id)
614 lib_info['lanes'] = {}
615 lib_db[lib_id] = lib_info
# Remember which (flowcell, lane) pairs belong to which library so the
# scan results can be filtered back down below.
617 for lane in lib_info['lane_set']:
618 lane_key = (lane['flowcell'], lane['lane_number'])
619 candidate_lanes[lane_key] = lib_id
620 seq_dirs.add(os.path.join(sequences_path,
623 logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
624 candidate_seq_list = scan_for_sequences(seq_dirs)
626 # at this point we have too many sequences as scan_for_sequences
627 # returns all the sequences in a flowcell directory
628 # so lets filter out the extras
630 for seq in candidate_seq_list:
631 lane_key = (seq.flowcell, seq.lane)
632 lib_id = candidate_lanes.get(lane_key, None)
# Sequences from lanes we don't care about fall through silently.
633 if lib_id is not None:
634 lib_info = lib_db[lib_id]
635 lib_info['lanes'].setdefault(lane_key, set()).add(seq)
# Classify submission filenames into ENCODE "view" names plus the
# attribute dict (cell, replicate, MapAlgorithm, insertLength, ...)
# that make_submission_ini records for each file.
640 class NameToViewMap(object):
641 """Determine view attributes for a given submission file name
643 def __init__(self, root_url, apidata):
644 self.root_url = root_url
645 self.apidata = apidata
649 # ma is "map algorithm"
# Ordered (glob pattern, view) pairs; first match wins, so specific
# patterns must precede general ones (e.g. *.splices.bam before *.bam).
# A view of None means "recognized, but not submitted"; a callable is
# invoked with is_paired to pick the view dynamically.
654 ('*.splices.bam', 'Splices'),
655 ('*.bam', self._guess_bam_view),
656 ('junctions.bed', 'Junctions'),
657 ('*.jnct', 'Junctions'),
658 ('*unique.bigwig', None),
659 ('*plus.bigwig', 'PlusSignal'),
660 ('*minus.bigwig', 'MinusSignal'),
661 ('*.bigwig', 'Signal'),
667 ('*ufflinks?0.9.3.genes.gtf', 'GeneDeNovo'),
668 ('*ufflinks?0.9.3.transcripts.gtf', 'TranscriptDeNovo'),
669 ('*GENCODE-v3c.exonFPKM.gtf', 'ExonsGencV3c'),
670 ('*GENCODE-v3c.genes.gtf', 'GeneGencV3c'),
671 ('*GENCODE-v3c.transcripts.gtf', 'TranscriptGencV3c'),
672 ('*GENCODE-v3c.TSS.gtf', 'TSS'),
673 ('*.junctions.bed6+3', 'Junctions'),
675 ('*.?ufflinks-0.9.0?genes.expr', 'GeneDeNovo'),
676 ('*.?ufflinks-0.9.0?transcripts.expr', 'TranscriptDeNovo'),
677 ('*.?ufflinks-0.9.0?transcripts.gtf', 'GeneModel'),
679 ('*.GENCODE-v3c?genes.expr', 'GeneGCV3c'),
680 ('*.GENCODE-v3c?transcript*.expr', 'TranscriptGCV3c'),
681 ('*.GENCODE-v3c?transcript*.gtf', 'TranscriptGencV3c'),
682 ('*.GENCODE-v4?genes.expr', None), #'GeneGCV4'),
683 ('*.GENCODE-v4?transcript*.expr', None), #'TranscriptGCV4'),
684 ('*.GENCODE-v4?transcript*.gtf', None), #'TranscriptGencV4'),
685 ('*_1.75mers.fastq', 'FastqRd1'),
686 ('*_2.75mers.fastq', 'FastqRd2'),
687 ('*_r1.fastq', 'FastqRd1'),
688 ('*_r2.fastq', 'FastqRd2'),
689 ('*.fastq', 'Fastq'),
690 ('*.gtf', 'GeneModel'),
694 ('paired-end-distribution*', 'InsLength'),
695 ('*.stats.txt', 'InsLength'),
699 ('transfer_log', None),
# Per-view attribute dicts; fastq views additionally carry a 'type'
# marker that make_submission_ini uses to batch them.
703 None: {"MapAlgorithm": "NA"},
704 "Paired": {"MapAlgorithm": ma},
705 "Aligns": {"MapAlgorithm": ma},
706 "Single": {"MapAlgorithm": ma},
707 "Splices": {"MapAlgorithm": ma},
708 "Junctions": {"MapAlgorithm": ma},
709 "PlusSignal": {"MapAlgorithm": ma},
710 "MinusSignal": {"MapAlgorithm": ma},
711 "Signal": {"MapAlgorithm": ma},
712 "GeneModel": {"MapAlgorithm": ma},
713 "GeneDeNovo": {"MapAlgorithm": ma},
714 "TranscriptDeNovo": {"MapAlgorithm": ma},
715 "ExonsGencV3c": {"MapAlgorithm": ma},
716 "GeneGencV3c": {"MapAlgorithm": ma},
717 "TSS": {"MapAlgorithm": ma},
718 "GeneGCV3c": {"MapAlgorithm": ma},
719 "TranscriptGCV3c": {"MapAlgorithm": ma},
720 "TranscriptGencV3c": {"MapAlgorithm": ma},
721 "GeneGCV4": {"MapAlgorithm": ma},
722 "TranscriptGCV4": {"MapAlgorithm": ma},
723 "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
724 "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
725 "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
726 "InsLength": {"MapAlgorithm": ma},
728 # view name is one of the attributes
729 for v in self.views.keys():
730 self.views[v]['view'] = v
# Classify one filename for one library; returns the attribute dict,
# or None for files no pattern matches (return lines not visible here).
732 def find_attributes(self, pathname, lib_id):
733 """Looking for the best extension
734 The 'best' is the longest match
737 filename (str): the filename whose extention we are about to examine
# NOTE(review): os.path.splitext returns (root, ext), so these names
# are misleading; neither appears to be used in the visible lines.
739 path, filename = os.path.splitext(pathname)
# Cache library metadata so each library hits the API only once
# (dict.has_key: Python 2 idiom).
740 if not self.lib_cache.has_key(lib_id):
741 self.lib_cache[lib_id] = get_library_info(self.root_url,
742 self.apidata, lib_id)
744 lib_info = self.lib_cache[lib_id]
745 if lib_info['cell_line'].lower() == 'unknown':
746 logging.warn("Library %s missing cell_line" % (lib_id,))
748 'cell': lib_info['cell_line'],
749 'replicate': lib_info['replicate'],
751 is_paired = self._is_paired(lib_id, lib_info)
754 attributes.update(self.get_paired_attributes(lib_info))
756 attributes.update(self.get_single_attributes(lib_info))
# First glob match wins; callables are resolved to a view name.
758 for pattern, view in self.patterns:
759 if fnmatch.fnmatch(pathname, pattern):
761 view = view(is_paired=is_paired)
763 attributes.update(self.views[view])
764 attributes["extension"] = pattern
# Choose a bam view from paired-end status (body not visible here).
768 def _guess_bam_view(self, is_paired=True):
769 """Guess a view name based on library attributes
# Decide paired-end status by majority vote over the library's
# non-failed lanes; result is cached per library.
777 def _is_paired(self, lib_id, lib_info):
778 """Determine if a library is paired end"""
779 if len(lib_info["lane_set"]) == 0:
782 if not self.lib_paired.has_key(lib_id):
786 # check to see if all the flowcells are the same.
787 # otherwise we might need to do something complicated
788 for flowcell in lib_info["lane_set"]:
789 # yes there's also a status code, but this comparison
791 if flowcell["status"].lower() == "failed":
792 # ignore failed flowcell
795 elif flowcell["paired_end"]:
800 logging.debug("Library %s: %d paired, %d single, %d failed" % \
801 (lib_info["library_id"], is_paired, isnot_paired, failed))
803 if is_paired > isnot_paired:
804 self.lib_paired[lib_id] = True
805 elif is_paired < isnot_paired:
806 self.lib_paired[lib_id] = False
# A tie is unresolvable — refuse to guess.
808 raise RuntimeError("Equal number of paired & unpaired lanes."\
809 "Can't guess library paired status")
811 return self.lib_paired[lib_id]
# insertLength for paired libraries; missing insert_size falls back to
# a default (assignment lines not fully visible here).
813 def get_paired_attributes(self, lib_info):
814 if lib_info['insert_size'] is None:
815 errmsg = "Library %s is missing insert_size, assuming 200"
816 logging.warn(errmsg % (lib_info["library_id"],))
819 insert_size = lib_info['insert_size']
820 return {'insertLength': insert_size,
# Single-ended libraries report the 'ilNA' placeholder.
823 def get_single_attributes(self, lib_info):
824 return {'insertLength':'ilNA',
# Render one [lineN] ini section listing a group of files plus their
# attribute key=value pairs; presumably returns the line list (return
# line not visible here).
828 def make_submission_section(line_counter, files, attributes):
830 Create a section in the submission ini file
832 inifile = [ "[line%s]" % (line_counter,) ]
833 inifile += ["files=%s" % (",".join(files))]
835 for k,v in attributes.items():
836 inifile += ["%s=%s" % (k,v)]
# Strip directory and extension from a pathname; presumably returns
# `name` (return line not visible here).
840 def make_base_name(pathname):
841 base = os.path.basename(pathname)
842 name, ext = os.path.splitext(base)
# Derive the archive tarball name from the ini name (return line with
# the appended extension not visible here).
846 def make_submission_name(ininame):
847 name = make_base_name(ininame)
# Derive the ddf filename from a pathname's base name (return line with
# the appended extension not visible here).
851 def make_ddf_name(pathname):
852 name = make_base_name(pathname)
# Build a condor submit-script name: <base>[.<run_type>].condor
# (the `elements = [name]` initialization is on a line not visible here).
856 def make_condor_name(pathname, run_type=None):
857 name = make_base_name(pathname)
859 if run_type is not None:
860 elements.append(run_type)
861 elements.append("condor")
862 return ".".join(elements)
# Write a header followed by a list of text blocks to a file or stream;
# a string target is opened and closed here, a stream target is the
# caller's responsibility.
865 def make_submit_script(target, header, body_list):
867 write out a text file
869 this was intended for condor submit scripts
872 target (str or stream):
873 if target is a string, we will open and close the file
874 if target is a stream, the caller is responsible.
877 header to write at the beginning of the file
878 body_list (list of strs):
879 a list of blocks to add to the file.
# types.StringTypes covers both str and unicode (Python 2 only).
881 if type(target) in types.StringTypes:
886 for entry in body_list:
# Mirror of the open above: only close what we opened ourselves.
888 if type(target) in types.StringTypes:
def parse_filelist(file_string):
    """Break a comma-delimited file list string into a list of names.

    Empty fields are kept exactly as str.split produces them.
    """
    separator = ","
    return file_string.split(separator)
# Raise if any path in the list does not exist (the `for f in files:`
# loop header is on a line not visible here).
895 def validate_filelist(files):
897 Die if a file doesn't exist in a file list
900 if not os.path.exists(f):
901 raise RuntimeError("%s does not exist" % (f,))
# Get a file's md5sum, preferring a cached <filename>.md5 sidecar file
# over recomputing; falls back to make_md5sum_unix and presumably
# returns md5sum (return line not visible here).
903 def make_md5sum(filename):
904 """Quickly find the md5sum of a file
# NOTE(review): os.path.join with a single argument is a no-op here;
# plain string concatenation already formed the cache path.
906 md5_cache = os.path.join(filename+".md5")
908 if os.path.exists(md5_cache):
909 logging.debug("Found md5sum in {0}".format(md5_cache))
910 stream = open(md5_cache,'r')
911 lines = stream.readlines()
912 md5sum = parse_md5sum_line(lines, filename)
914 md5sum = make_md5sum_unix(filename, md5_cache)
# Run the system md5sum binary on filename, parse its output, and cache
# the result in md5_cache for next time.
917 def make_md5sum_unix(filename, md5_cache):
918 cmd = ["md5sum", filename]
919 logging.debug("Running {0}".format(" ".join(cmd)))
920 p = Popen(cmd, stdout=PIPE)
# NOTE(review): communicate() returns (stdout, stderr) — these names
# are swapped/misleading; 'stdin' actually holds md5sum's stdout, which
# is what gets parsed below.
921 stdin, stdout = p.communicate()
923 logging.debug("Finished {0} retcode {1}".format(" ".join(cmd), retcode))
925 logging.error("Trouble with md5sum for {0}".format(filename))
927 lines = stdin.split(os.linesep)
928 md5sum = parse_md5sum_line(lines, filename)
# Only a successfully parsed sum is written back to the cache file.
929 if md5sum is not None:
930 logging.debug("Caching sum in {0}".format(md5_cache))
931 stream = open(md5_cache, "w")
# Parse the first line of md5sum output ("<hash>  <filename>") and
# sanity-check that it names the expected file; presumably returns the
# hash or None (return lines not visible here).
936 def parse_md5sum_line(lines, filename):
937 md5sum, md5sum_filename = lines[0].split()
938 if md5sum_filename != filename:
# NOTE(review): "disagre" typo in this runtime error message — left
# unchanged here since it is program output, not a comment.
939 errmsg = "MD5sum and I disagre about filename. {0} != {1}"
940 logging.error(errmsg.format(filename, md5sum_filename))
944 if __name__ == "__main__":