7650a9468e96df9486ec98e3e0721d90a46ff3de
[htsworkflow.git] / extra / ucsc_encode_submission / ucsc_gather.py
#!/usr/bin/env python
from ConfigParser import SafeConfigParser
import fnmatch
import getpass
from glob import glob
import json
import logging
import netrc
from optparse import OptionParser
import os
from pprint import pprint, pformat
import shlex
from StringIO import StringIO
import stat
from subprocess import Popen, PIPE
import sys
import time
import types
import urllib
import urllib2
import urlparse

from htsworkflow.util import api
from htsworkflow.pipelines.sequences import \
    create_sequence_table, \
    scan_for_sequences
from htsworkflow.pipelines import qseq2fastq
from htsworkflow.pipelines import srf2fastq
28
def main(cmdline=None):
    """Entry point: parse options and run the requested submission steps.

    cmdline: optional argument list; None lets OptionParser read sys.argv.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)
    
    # Pick the logging level from the command line flags.
    if opts.debug:
        logging.basicConfig(level = logging.DEBUG )
    elif opts.verbose:
        logging.basicConfig(level = logging.INFO )
    else:
        logging.basicConfig(level = logging.WARNING )        
    
    # Credentials posted with every htsworkflow API request.
    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey }

    if opts.host is None or opts.apiid is None or opts.apikey is None:
        parser.error("Please specify host url, apiid, apikey")

    if opts.makeddf and opts.daf is None:
        parser.error("Please specify your daf when making ddf files")

    if len(args) == 0:
        parser.error("I need at least one library submission-dir input file")
        
    # Each input file maps library ids to per-library result directories.
    library_result_map = []
    for a in args:
        library_result_map.extend(read_library_result_map(a))

    if opts.make_tree_from is not None:
        make_tree_from(opts.make_tree_from, library_result_map)
            
    if opts.daf is not None:
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        build_fastqs(opts.host, 
                     apidata, 
                     opts.sequence, 
                     library_result_map,
                     force=opts.force)

    if opts.ini:
        make_submission_ini(opts.host, apidata, library_result_map)

    if opts.makeddf:
        make_all_ddfs(library_result_map, opts.daf, force=opts.force)
73
74
def make_parser():
    """Build the option parser, seeding defaults from the config files.

    Reads ~/.htsworkflow.ini and /etc/htsworkflow.ini for default values
    of the sequence archive location and API credentials.

    Returns an optparse.OptionParser.
    """
    # Load defaults from the config files
    config = SafeConfigParser()
    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])
    
    sequence_archive = None
    apiid = None
    apikey = None
    apihost = None
    SECTION = 'sequence_archive'
    if config.has_section(SECTION):
        # ConfigParser.get()'s third positional argument is the 'raw'
        # flag, not a default value; the old code passed the fallback
        # there, so a missing option raised NoOptionError instead of
        # returning the fallback. Guard each option explicitly.
        if config.has_option(SECTION, 'sequence_archive'):
            sequence_archive = config.get(SECTION, 'sequence_archive')
            sequence_archive = os.path.expanduser(sequence_archive)
        if config.has_option(SECTION, 'apiid'):
            apiid = config.get(SECTION, 'apiid')
        if config.has_option(SECTION, 'apikey'):
            apikey = config.get(SECTION, 'apikey')
        if config.has_option(SECTION, 'host'):
            apihost = config.get(SECTION, 'host')

    parser = OptionParser()

    # commands
    parser.add_option('--make-tree-from',
                      help="create directories & link data files",
                      default=None)
    parser.add_option('--fastq', help="generate scripts for making fastq files",
                      default=False, action="store_true")

    parser.add_option('--ini', help="generate submission ini file", default=False,
                      action="store_true")

    parser.add_option('--makeddf', help='make the ddfs', default=False,
                      action="store_true")
    
    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")

    # configuration options
    parser.add_option('--apiid', default=apiid, help="Specify API ID")
    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
    parser.add_option('--host',  default=apihost,
                      help="specify HTSWorkflow host",)
    parser.add_option('--sequence', default=sequence_archive,
                      help="sequence repository")

    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    return parser
126
127
def make_tree_from(source_path, library_result_map):
    """Create a tree using data files from source path.

    For each (library_id, result_dir) pair, create result_dir (relative
    to the current directory) when needed and symlink in every file
    found under source_path/result_dir.

    Raises IOError when the source directory or a source file is missing.
    """
    for lib_id, lib_path in library_result_map:
        if not os.path.exists(lib_path):
            logging.info("Making dir {0}".format(lib_path))
            os.mkdir(lib_path)
        source_lib_dir = os.path.join(source_path, lib_path)
        # Fail with a clear message instead of letting os.listdir below
        # raise on a missing directory (the old check was a no-op 'pass').
        if not os.path.exists(source_lib_dir):
            raise IOError("{0} does not exist".format(source_lib_dir))
        for filename in os.listdir(source_lib_dir):
            source_pathname = os.path.join(source_lib_dir, filename)
            target_pathname = os.path.join(lib_path, filename)
            if not os.path.exists(source_pathname):
                raise IOError("{0} does not exist".format(source_pathname))
            if not os.path.exists(target_pathname):
                os.symlink(source_pathname, target_pathname)
                logging.info(
                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
147     
148 def build_fastqs(host, apidata, sequences_path, library_result_map, 
149                  force=False ):
150     """
151     Generate condor scripts to build any needed fastq files
152     
153     Args:
154       host (str): root of the htsworkflow api server
155       apidata (dict): id & key to post to the server
156       sequences_path (str): root of the directory tree to scan for files
157       library_result_map (list):  [(library_id, destination directory), ...]
158     """
159     qseq_condor_header = """
160 Universe=vanilla
161 executable=%(exe)s
162 error=log/qseq2fastq.err.$(process).log
163 output=log/qseq2fastq.out.$(process).log
164 log=log/qseq2fastq.log
165
166 """ % {'exe': sys.executable }
167     qseq_condor_entries = []
168     srf_condor_header = """
169 Universe=vanilla
170 executable=%(exe)s
171 output=log/srf_pair_fastq.out.$(process).log
172 error=log/srf_pair_fastq.err.$(process).log
173 log=log/srf_pair_fastq.log
174 environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"
175
176 """ % {'exe': sys.executable }
177     srf_condor_entries = []
178     lib_db = find_archive_sequence_files(host, 
179                                          apidata, 
180                                          sequences_path, 
181                                          library_result_map)
182
183     needed_targets = find_missing_targets(library_result_map, lib_db, force)
184
185     for target_pathname, available_sources in needed_targets.items():
186         logging.debug(' target : %s' % (target_pathname,))
187         logging.debug(' candidate sources: %s' % (available_sources,))
188         if available_sources.has_key('qseq'):
189             source = available_sources['qseq']
190             qseq_condor_entries.append(
191                 condor_qseq_to_fastq(source.path, 
192                                      target_pathname, 
193                                      source.flowcell,
194                                      force=force)
195             )
196         elif available_sources.has_key('srf'):
197             source = available_sources['srf']
198             mid = getattr(source, 'mid_point', None)
199             srf_condor_entries.append(
200                 condor_srf_to_fastq(source.path, 
201                                     target_pathname,
202                                     source.paired,
203                                     source.flowcell,
204                                     mid,
205                                     force=force)
206             )
207         else:
208             print " need file", target_pathname
209
210     if len(srf_condor_entries) > 0:
211         make_submit_script('srf.fastq.condor', 
212                            srf_condor_header,
213                            srf_condor_entries)
214
215     if len(qseq_condor_entries) > 0:
216         make_submit_script('qseq.fastq.condor', 
217                            qseq_condor_header,
218                            qseq_condor_entries)
219
220
def find_missing_targets(library_result_map, lib_db, force=False):
    """
    Check if the sequence file exists.
    This requires computing what the sequence name is and checking
    to see if it can be found in the sequence location.

    Adds seq.paired flag to sequences listed in lib_db[*]['lanes']

    Returns a dict mapping target fastq pathname ->
    {filetype: sequence object} for every target that does not exist
    (or for every target when force is True).
    """
    fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
    fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
    # find what targets we're missing
    needed_targets = {}
    for lib_id, result_dir in library_result_map:
        lib = lib_db[lib_id]
        lane_dict = make_lane_dict(lib_db, lib_id)
        
        for lane_key, sequences in lib['lanes'].items():
            for seq in sequences:
                # Side effect: record paired-end status on the sequence
                # object itself; later stages rely on it.
                seq.paired = lane_dict[seq.flowcell]['paired_end']
                lane_status = lane_dict[seq.flowcell]['status']

                # Paired runs default to read 1 when the read is unknown.
                if seq.paired and seq.read is None:
                    seq.read = 1
                filename_attributes = { 
                    'flowcell': seq.flowcell,
                    'lib_id': lib_id,
                    'lane': seq.lane,
                    'read': seq.read,
                    'cycle': seq.cycle
                    }
                # skip bad runs
                if lane_status == 'Failed':
                    continue
                if seq.flowcell == '30DY0AAXX':
                    # 30DY0 only ran for 151 bases instead of 152
                    # it is actually 76 1st read, 75 2nd read
                    seq.mid_point = 76

                # end filters
                if seq.paired:
                    target_name = fastq_paired_template % filename_attributes
                else:
                    target_name = fastq_single_template % filename_attributes

                target_pathname = os.path.join(result_dir, target_name)
                if force or not os.path.exists(target_pathname):
                    # Index candidate sources by filetype (e.g. 'qseq',
                    # 'srf') so the caller can pick its preferred source.
                    t = needed_targets.setdefault(target_pathname, {})
                    t[seq.filetype] = seq

    return needed_targets
271
272
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into each library result directory.

    daf_path: path to the daf file to link.
    library_result_map: [(library_id, result_dir), ...]

    Raises RuntimeError when the daf or a result directory is missing.
    """
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)
    
    for lib_id, result_dir in library_result_map:
        if not os.path.exists(result_dir):
            raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
        submission_daf = os.path.join(result_dir, base_daf)
        # daf_path was already verified above; only skip libraries that
        # already have the link in place.
        if not os.path.exists(submission_daf):
            os.link(daf_path, submission_daf)
287
288
def make_submission_ini(host, apidata, library_result_map, paired=True):
    """Write a <result_dir>.ini submission description for each library.

    Scans each result directory, classifies every file through
    NameToViewMap, and writes one ini section per submitted file.
    Fastqs sharing an extension pattern are grouped into one section.

    Raises ValueError for files no pattern recognizes.
    """
    view_map = NameToViewMap(host, apidata)

    for lib_id, result_dir in library_result_map:
        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
                    'readType', 'mapAlgorithm', 'insertLength', 'md5sum' ]
        inifile =  ['[config]']
        inifile += [" ".join(order_by)]
        inifile += ['']
        line_counter = 1
        result_ini = os.path.join(result_dir, result_dir+'.ini')

        # write other lines
        submission_files = os.listdir(result_dir)
        fastqs = {}
        fastq_attributes = {}
        for f in submission_files:
            attributes = view_map.find_attributes(f, lib_id)
            if attributes is None:
                raise ValueError("Unrecognized file: %s" % (f,))
            attributes['md5sum'] = "None"
            
            ext = attributes["extension"]
            if attributes['view'] is None:                   
                continue               
            elif attributes.get("type", None) == 'fastq':
                # Collect fastqs per extension pattern; they are emitted
                # as grouped sections after this loop.
                fastqs.setdefault(ext, set()).add(f)
                fastq_attributes[ext] = attributes
            else:
                md5sum = make_md5sum(os.path.join(result_dir,f))
                if md5sum is not None:
                    attributes['md5sum']=md5sum
                inifile.extend(
                    make_submission_section(line_counter,
                                            [f],
                                            attributes
                                            )
                    )
                inifile += ['']
                line_counter += 1

        # add in fastqs on a single line.
        for extension, fastq_files in fastqs.items():
            # sort for deterministic output; fastq_files is a set.
            inifile.extend(
                make_submission_section(line_counter, 
                                        sorted(fastq_files),
                                        fastq_attributes[extension])
            )
            inifile += ['']
            line_counter += 1
            
        # Close the stream explicitly; the original leaked the handle.
        stream = open(result_ini,'w')
        try:
            stream.write(os.linesep.join(inifile))
        finally:
            stream.close()
345
346         
def make_lane_dict(lib_db, lib_id):
    """
    Convert the lane_set in a lib_db to a dictionary
    indexed by flowcell ID
    """
    lane_set = lib_db[lib_id]['lane_set']
    return dict((lane['flowcell'], lane) for lane in lane_set)
356
357
def make_all_ddfs(library_result_map, daf_name, make_condor=True, force=False):
    """Generate a ddf for each library, plus a dagman driving the uploads.

    Only libraries whose result directory contains a previously generated
    <result_dir>.ini are processed. When make_condor is set, the collected
    dag fragments are written to 'submission.dagman' in the current
    directory, unless that file already exists (force overwrites it).
    """
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        ininame = result_dir+'.ini'
        inipathname = os.path.join(result_dir, ininame)
        if os.path.exists(inipathname):
            dag_fragment.extend(
                make_ddf(ininame, daf_name, True, make_condor, result_dir)
            )

    if make_condor and len(dag_fragment) > 0:
        dag_filename = 'submission.dagman'
        # Refuse to clobber an existing dagman unless forced.
        if not force and os.path.exists(dag_filename):
            logging.warn("%s exists, please delete" % (dag_filename,))
        else:
            f = open(dag_filename,'w')
            f.write( os.linesep.join(dag_fragment))
            f.write( os.linesep )
            f.close()
377             
378
379 def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
380     """
381     Make ddf files, and bonus condor file
382     """
383     dag_fragments = []
384     curdir = os.getcwd()
385     if outdir is not None:
386         os.chdir(outdir)
387     output = sys.stdout
388     ddf_name = None
389     if guess_ddf:
390         ddf_name = make_ddf_name(ininame)
391         print ddf_name
392         output = open(ddf_name,'w')
393
394     file_list = read_ddf_ini(ininame, output)
395     logging.info(
396         "Read config {0}, found files: {1}".format(
397             ininame, ", ".join(file_list)))
398
399     file_list.append(daf_name)
400     if ddf_name is not None:
401         file_list.append(ddf_name)
402
403     if make_condor:
404         archive_condor = make_condor_archive_script(ininame, file_list)
405         upload_condor = make_condor_upload_script(ininame)
406         
407         dag_fragments.extend( 
408             make_dag_fragment(ininame, archive_condor, upload_condor)
409         ) 
410         
411     os.chdir(curdir)
412     
413     return dag_fragments
414
415
def read_ddf_ini(filename, output=None):
    """
    Read a ini file and dump out a tab delimited text file

    filename: submission ini file to read.
    output: writable stream; defaults to sys.stdout, resolved at call
      time (the old default bound sys.stdout at import, so later
      redirection was ignored).

    Returns the list of filenames collected from 'files' options.
    """
    if output is None:
        output = sys.stdout
    file_list = []
    config = SafeConfigParser()
    config.read(filename)

    # The [config] section's order_by option lists the ddf columns in order.
    order_by = shlex.split(config.get("config", "order_by"))

    output.write("\t".join(order_by))
    output.write(os.linesep)
    sections = config.sections()
    sections.sort()
    for section in sections:
        if section == "config":
            # skip the config block
            continue
        values = []
        for key in order_by:
            v = config.get(section, key)
            values.append(v)
            if key == 'files':
                file_list.extend(parse_filelist(v))
                
        output.write("\t".join(values))
        output.write(os.linesep)
    return file_list
444
445
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames. 
    
    For example:
      10000 result/foo/bar

    Blank lines and lines starting with '#' are ignored.
    Returns a list of (library_id, result_dir) tuples.
    """
    results = []
    stream = open(filename,'r')
    # try/finally so the handle is released even on malformed lines
    # (the original never closed the file).
    try:
        for line in stream:
            line = line.rstrip()
            if not line.startswith('#') and len(line) > 0 :
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    finally:
        stream.close()
    return results
463
464
def make_condor_archive_script(ininame, files):
    """Write a condor submit file that tars up one submission's files.

    ininame: submission ini name, used to derive the archive and script
      names.
    files: filenames (relative to the current directory) to archive.

    Raises RuntimeError if any listed file is missing.
    Returns the name of the generated condor submit file.
    """
    script = """Universe = vanilla

Executable = /bin/tar
arguments = czvhf ../%(archivename)s %(filelist)s

Error = compress.err.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s
environment="GZIP=-3"
request_memory = 20

queue 
"""
    for f in files:
        if not os.path.exists(f):
            raise RuntimeError("Missing %s" % (f,))

    # getpass.getuser() works without a controlling terminal;
    # os.getlogin() raises OSError under cron/condor.
    context = {'archivename': make_submission_name(ininame),
               'filelist': " ".join(files),
               'initialdir': os.getcwd(),
               'user': getpass.getuser()}

    condor_script = make_condor_name(ininame, 'archive')
    condor_stream = open(condor_script,'w')
    try:
        condor_stream.write(script % context)
    finally:
        condor_stream.close()
    return condor_script
494
495
def make_condor_upload_script(ininame):
    """Write a condor submit file that ftps one submission archive to UCSC.

    ininame: submission ini name, used to derive the archive and script
      names.

    Credentials for encodeftp.cse.ucsc.edu are read from ~diane/.netrc.
    Returns the name of the generated condor submit file.
    """
    script = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s

Error = upload.err.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue 
"""
    # NOTE(review): the netrc path is hard-coded to user diane's home
    # directory; confirm whether this should be ~/.netrc instead.
    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
    
    encodeftp = 'encodeftp.cse.ucsc.edu'
    ftpuser = auth.hosts[encodeftp][0]
    ftppassword = auth.hosts[encodeftp][2]
    # getpass.getuser() works without a controlling terminal;
    # os.getlogin() raises OSError under cron/condor.
    context = {'archivename': make_submission_name(ininame),
               'initialdir': os.getcwd(),
               'user': getpass.getuser(),
               'ftpuser': ftpuser,
               'ftppassword': ftppassword,
               'ftphost': encodeftp}

    condor_script = make_condor_name(ininame, 'upload')
    condor_stream = open(condor_script,'w')
    try:
        condor_stream.write(script % context)
    finally:
        condor_stream.close()
    # The generated file embeds the ftp password; make it owner r/w only.
    os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)

    return condor_script
528
529
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """
    Make the couple of fragments compress and then upload the data.
    """
    base_dir = os.getcwd()
    job_basename = make_base_name(ininame)
    archive_path = os.path.join(base_dir, archive_condor)
    upload_path = os.path.join(base_dir, upload_condor)

    # archive must finish before the upload job may start.
    return [
        'JOB %s_archive %s' % (job_basename, archive_path),
        'JOB %s_upload %s' % (job_basename, upload_path),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
545
546
def get_library_info(host, apidata, library_id):
    """Fetch one library's metadata from the htsworkflow API server.

    Returns whatever api.retrieve_info yields for the library endpoint;
    callers treat it as a dict with keys like 'lane_set' and 'cell_line'
    -- confirm against htsworkflow.util.api.
    """
    url = api.library_url(host, library_id)
    contents = api.retrieve_info(url, apidata)
    return contents
551
552
def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
                        mid=None, force=False):
    """Build one condor queue entry that runs srf2fastq on srf_file."""
    args = [srf2fastq.__file__, srf_file]
    if paired:
        args += ['--left', target_pathname]
        # this is ugly. I did it because I was pregenerating the target
        # names before I tried to figure out what sources could generate
        # those targets, and everything up to this point had been
        # one-to-one. So I couldn't figure out how to pair the 
        # target names. 
        # With this at least the command will run correctly.
        # however if we rename the default targets, this'll break
        # also I think it'll generate it twice.
        args += ['--right', target_pathname.replace('_r1.fastq', '_r2.fastq')]
    else:
        args += ['--single', target_pathname]
    if flowcell is not None:
        args += ['--flowcell', flowcell]
    if mid is not None:
        args += ['-m', str(mid)]
    if force:
        args += ['--force']

    return """
arguments="%s"
queue
""" % (" ".join(args),)
586
587
def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None, force=False):
    """Build one condor queue entry that runs qseq2fastq on qseq_file."""
    args = [qseq2fastq.__file__, '-i', qseq_file, '-o', target_pathname]
    if flowcell is not None:
        args += ['-f', flowcell]

    return """
arguments="%s"
queue
""" % (" ".join(args))
599
def find_archive_sequence_files(host, apidata, sequences_path, 
                                library_result_map):
    """
    Find all the archive sequence files possibly associated with our results.

    Queries the server for each library's lane_set, scans the matching
    flowcell directories under sequences_path, and attaches the found
    sequence objects to lib_db[lib_id]['lanes'].

    Returns lib_db: library_id -> library info dict whose 'lanes' key
    maps (flowcell, lane) -> set of sequence objects.
    """
    logging.debug("Searching for sequence files in: %s" %(sequences_path,))

    lib_db = {}
    seq_dirs = set()
    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
    candidate_lanes = {}
    for lib_id, result_dir in library_result_map:
        lib_info = get_library_info(host, apidata, lib_id)
        lib_info['lanes'] = {}
        lib_db[lib_id] = lib_info

        for lane in lib_info['lane_set']:
            lane_key = (lane['flowcell'], lane['lane_number'])
            candidate_lanes[lane_key] = lib_id
            seq_dirs.add(os.path.join(sequences_path, 
                                         'flowcells', 
                                         lane['flowcell']))
    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
    candidate_seq_list = scan_for_sequences(seq_dirs)

    # at this point we have too many sequences as scan_for_sequences
    # returns all the sequences in a flowcell directory
    # so lets filter out the extras
    
    for seq in candidate_seq_list:
        lane_key = (seq.flowcell, seq.lane)
        # only keep sequences from lanes our libraries actually ran on
        lib_id = candidate_lanes.get(lane_key, None)
        if lib_id is not None:
            lib_info = lib_db[lib_id]
            lib_info['lanes'].setdefault(lane_key, set()).add(seq)
    
    return lib_db
638
639
class NameToViewMap(object):
    """Determine view attributes for a given submission file name
    """
    def __init__(self, root_url, apidata):
        """root_url: htsworkflow API server root; apidata: credential dict."""
        self.root_url = root_url
        self.apidata = apidata
        
        # per-library caches of API metadata and paired-end decisions
        self.lib_cache = {}
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # Ordered (glob pattern, view) pairs; the first match wins.
        # A view of None means "recognized but not submitted".
        self.patterns = [
            ('*.bai',                   None),
            ('*.splices.bam',           'Splices'),
            ('*.bam',                   self._guess_bam_view),
            ('junctions.bed',           'Junctions'),
            ('*.jnct',                  'Junctions'),
            ('*unique.bigwig',         None),
            ('*plus.bigwig',           'PlusSignal'),
            ('*minus.bigwig',          'MinusSignal'),
            ('*.bigwig',                'Signal'),
            ('*.tar.bz2',               None),
            ('*.condor',                None),
            ('*.daf',                   None),
            ('*.ddf',                   None),
            ('*.?ufflinks-0.9.0?genes.expr',       'GeneDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.expr', 'TranscriptDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.gtf',  'GeneModel'),
            ('*.GENCODE-v3c?genes.expr',       'GeneGCV3c'),
            ('*.GENCODE-v3c?transcript*.expr', 'TranscriptGCV3c'),
            ('*.GENCODE-v3c?transcript*.gtf',  'TranscriptGencV3c'),
            ('*.GENCODE-v4?genes.expr',        None), #'GeneGCV4'),
            ('*.GENCODE-v4?transcript*.expr',  None), #'TranscriptGCV4'),
            ('*.GENCODE-v4?transcript*.gtf',   None), #'TranscriptGencV4'),
            ('*_1.75mers.fastq',              'FastqRd1'),
            ('*_2.75mers.fastq',              'FastqRd2'),
            ('*_r1.fastq',              'FastqRd1'),
            ('*_r2.fastq',              'FastqRd2'),
            ('*.fastq',                 'Fastq'),
            ('*.gtf',                   'GeneModel'),
            ('*.ini',                   None),
            ('*.log',                   None),
            ('*.md5',                   None),
            ('paired-end-distribution*', 'InsLength'),
            ('*.stats.txt',              'InsLength'),
            ('*.srf',                   None),
            ('*.wig',                   None),
            ('*.zip',                   None),
            ]

        # Attributes contributed by each view.
        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Aligns": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "GeneGCV3c": {"MapAlgorithm": ma},
            "TranscriptGCV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGCV4": {"MapAlgorithm": ma},
            "TranscriptGCV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq" },
            "InsLength": {"MapAlgorithm": ma},
            }
        # view name is one of the attributes
        for v in self.views.keys():
            self.views[v]['view'] = v
            
    def find_attributes(self, pathname, lib_id):
        """Looking for the best extension
        The 'best' is the longest match
        
        :Args:
        pathname (str): the filename to classify against self.patterns
        lib_id (str): library id used to look up cell/replicate info

        Returns an attribute dict for recognized files, or None when
        no pattern matches (callers treat None as "unrecognized file").
        """
        if lib_id not in self.lib_cache:
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warn("Library %s missing cell_line" % (lib_id,))
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
            }
        is_paired = self._is_paired(lib_id, lib_info)
        
        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))
            
        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                # A view may be a callable that chooses the name from
                # library attributes (e.g. *.bam -> Paired or Aligns).
                if callable(view):
                    view = view(is_paired=is_paired)
                    
                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes
        return None


    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes
        """
        if is_paired:
            return "Paired"
        else:
            return "Aligns"


    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end"""
        # treat an empty lane set as single ended
        if len(lib_info["lane_set"]) == 0:
            return False

        if lib_id not in self.lib_paired:
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison 
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1
                    
            logging.debug("Library %s: %d paired, %d single, %d failed" % \
                     (lib_info["library_id"], is_paired, isnot_paired, failed))

            # majority vote of the non-failed lanes decides paired status
            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError("Equal number of paired & unpaired lanes."\
                                   "Can't guess library paired status")
            
        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Attributes for paired-end libraries; insert_size defaults to 200."""
        if lib_info['insert_size'] is None:
            errmsg = "Library %s is missing insert_size, assuming 200"
            logging.warn(errmsg % (lib_info["library_id"],))
            insert_size = 200
        else:
            insert_size = lib_info['insert_size']
        # NOTE(review): read length 2x75 is hard-coded; confirm for new runs.
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Attributes for single-ended libraries."""
        return {'insertLength':'ilNA',
                'readType': '1x75D'
                }
813
def make_submission_section(line_counter, files, attributes):
    """
    Create a section in the submission ini file
    """
    section = ["[line%s]" % (line_counter,),
               "files=%s" % (",".join(files))]
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
824
825
def make_base_name(pathname):
    """Return the filename of pathname with its last extension removed."""
    filename = os.path.basename(pathname)
    return os.path.splitext(filename)[0]


def make_submission_name(ininame):
    """Name of the tar archive derived from an ini file name."""
    return make_base_name(ininame) + ".tgz"


def make_ddf_name(pathname):
    """Name of the ddf file derived from an ini file name."""
    return make_base_name(pathname) + ".ddf"


def make_condor_name(pathname, run_type=None):
    """Name of a condor submit file, optionally tagged with a run type."""
    parts = [make_base_name(pathname)]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
849
850
def make_submit_script(target, header, body_list):
    """
    write out a text file

    this was intended for condor submit scripts

    Args:
      target (str or stream): 
        if target is a string, we will open and close the file
        if target is a stream, the caller is responsible.

      header (str):
        header to write at the beginning of the file
      body_list (list of strs):
        a list of blocks to add to the file.
    """
    # isinstance also accepts str/unicode subclasses, which the old
    # 'type(target) in types.StringTypes' test rejected.
    opened_here = isinstance(target, types.StringTypes)
    if opened_here:
        f = open(target,"w")
    else:
        f = target
    try:
        f.write(header)
        for entry in body_list:
            f.write(entry)
    finally:
        # only close streams we opened; caller-owned streams stay open
        if opened_here:
            f.close()
876
def parse_filelist(file_string):
    """Split a comma separated 'files' option value into a list."""
    return file_string.split(",")
879
880
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    for pathname in files:
        if os.path.exists(pathname):
            continue
        raise RuntimeError("%s does not exist" % (pathname,))
888
def make_md5sum(filename):
    """Quickly find the md5sum of a file

    Reuses the cached sum stored in filename + '.md5' when present;
    otherwise runs md5sum and caches its output there.
    Returns the checksum string, or None if it could not be computed.
    """
    # filename + '.md5' directly; the old single-argument os.path.join
    # was a no-op, and a leftover debug print spammed stdout.
    md5_cache = filename + ".md5"
    if os.path.exists(md5_cache):
        logging.debug("Found md5sum in {0}".format(md5_cache))
        stream = open(md5_cache,'r')
        try:
            lines = stream.readlines()
        finally:
            stream.close()
        md5sum = parse_md5sum_line(lines, filename)
    else:
        md5sum = make_md5sum_unix(filename, md5_cache)
    return md5sum
902     
def make_md5sum_unix(filename, md5_cache):
    """Run the md5sum utility on filename and cache its raw output.

    Writes md5sum's output to md5_cache on success.
    Returns the checksum string, or None when md5sum fails or its
    output names a different file.
    """
    cmd = ["md5sum", filename]
    logging.debug("Running {0}".format(" ".join(cmd)))
    p = Popen(cmd, stdout=PIPE)
    # communicate() returns (stdout, stderr); the old names
    # 'stdin, stdout' were misleading.
    stdout_data, _ = p.communicate()
    retcode = p.wait()
    logging.debug("Finished {0} retcode {1}".format(" ".join(cmd), retcode))
    if retcode != 0:
        logging.error("Trouble with md5sum for {0}".format(filename))
        return None
    lines = stdout_data.split(os.linesep)
    md5sum = parse_md5sum_line(lines, filename)
    if md5sum is not None:
        logging.debug("Caching sum in {0}".format(md5_cache))
        stream = open(md5_cache, "w")
        try:
            stream.write(stdout_data)
        finally:
            stream.close()
    return md5sum
921
def parse_md5sum_line(lines, filename):
    """Extract the checksum from md5sum-style output for filename.

    lines: list of output lines from md5sum (or a cached .md5 file).
    Returns the checksum string, or None when the output is empty,
    malformed, or names a different file.
    """
    # Guard against empty or truncated cache files instead of crashing
    # with IndexError/ValueError.
    if not lines:
        logging.error("Empty md5sum output for {0}".format(filename))
        return None
    fields = lines[0].split()
    if len(fields) != 2:
        logging.error("Unparseable md5sum line: {0!r}".format(lines[0]))
        return None
    md5sum, md5sum_filename = fields
    if md5sum_filename != filename:
        errmsg = "MD5sum and I disagree about filename. {0} != {1}"
        logging.error(errmsg.format(filename, md5sum_filename))
        return None
    return md5sum
929
# Script entry point: run main() against sys.argv when executed directly.
if __name__ == "__main__":
    main()