Save changes needed to submit to UCSC in Jun 2010.
author Diane Trout <diane@caltech.edu>
Tue, 22 Jun 2010 19:07:49 +0000 (19:07 +0000)
committer Diane Trout <diane@caltech.edu>
Tue, 22 Jun 2010 19:07:49 +0000 (19:07 +0000)
Some of those changes include:
  * modifying the list of variables to include in the ddf
  * making it easier to set the MapAlgorithm
  * returning information about the condor scripts so that I can
    build a condor dagman script to run all the compression jobs
    (see the sketch below)

Perhaps some parts of this should be moved into the main
htsworkflow. I can see the code for batch-converting srf/qseq
files to fastq being more generally useful.
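
For reference, this is roughly the dagman file that make_all_ddfs
now assembles from the make_dag_fragment() pieces; the library id
and submission directory shown here are hypothetical:

    JOB 12345_archive /home/diane/sub/12345.archive.condor
    JOB 12345_upload /home/diane/sub/12345.upload.condor
    PARENT 12345_archive CHILD 12345_upload

Running condor_submit_dag submission.dagman then executes the tar
job and, once it succeeds, the lftp upload job.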

extra/ucsc_encode_submission/ucsc_gather.py

index 1ede70b4d534697d6032854c6af1297e41c76bec..e230e5c186d66cd06e74b74dcc2a7ca945bc31ba 100755 (executable)
@@ -21,212 +21,95 @@ from htsworkflow.pipelines.sequences import \
     create_sequence_table, \
     scan_for_sequences
 
-def make_submission_name(ininame):
-    base = os.path.basename(ininame)
-    name, ext = os.path.splitext(base)
-    return name + '.tgz'
-
-def make_ddf_name(pathname):
-    base = os.path.basename(pathname)
-    name, ext = os.path.splitext(base)
-    return name + '.ddf'
 
-def make_condor_name(pathname):
-    base = os.path.basename(pathname)
-    name, ext = os.path.splitext(base)
-    return name + '.condor'
-    
-def make_submit_script(target, header, body_list):
-    """
-    write out a text file
-
-    this was intended for condor submit scripts
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
     
-    Args:
-      target (str or stream): 
-        if target is a string, we will open and close the file
-        if target is a stream, the caller is responsible.
-
-      header (str):
-        header to write at the beginning of the file
-      body_list (list of strs):
-        a list of blocks to add to the file.
-    """
-    if type(target) in types.StringTypes:
-        f = open(target,'w')
+    if opts.debug:
+        logging.basicConfig(level = logging.DEBUG )
+    elif opts.verbose:
+        logging.basicConfig(level = logging.INFO )
     else:
-        f = target
-    f.write(header)
-    for entry in body_list:
-        f.write(entry)
-    if type(target) in types.StringTypes:
-        f.close()
-
-def parse_filelist(file_string):
-    return file_string.split(',')
-
-def validate_filelist(files):
-    """
-    Die if a file doesn't exist in a file list
-    """
-    for f in files:
-        if not os.path.exists(f):
-            raise RuntimeError("%s does not exist" % (f,))
-
-def read_ddf_ini(filename, output=sys.stdout):
-    """
-    Read an ini file and dump out a tab-delimited text file
-    """
-    file_list = []
-    config = SafeConfigParser()
-    config.read(filename)
-
-    order_by = shlex.split(config.get("config", "order_by"))
-
-    output.write("\t".join(order_by))
-    output.write(os.linesep)
-    sections = config.sections()
-    sections.sort()
-    for section in sections:
-        if section == "config":
-            # skip the config block
-            continue
-        values = []
-        for key in order_by:
-            v = config.get(section, key)
-            values.append(v)
-            if key == 'files':
-                file_list.extend(parse_filelist(v))
-                
-        output.write("\t".join(values))
-        output.write(os.linesep)
-    return file_list
-            
-def make_condor_archive_script(ininame, files):
-    script = """Universe = vanilla
-
-Executable = /bin/tar
-arguments = czvf ../%(archivename)s %(filelist)s
-
-Error = compress.err.$(Process).log
-Output = compress.out.$(Process).log
-Log = compress.log
-initialdir = %(initialdir)s
-
-queue 
-"""
-    for f in files:
-        if not os.path.exists(f):
-            raise RuntimeError("Missing %s" % (f,))
-
-    context = {'archivename': make_submission_name(ininame),
-               'filelist': " ".join(files),
-               'initialdir': os.getcwd()}
+        logging.basicConfig(level = logging.WARNING )
+        
+    
+    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey }
 
-    condor_script = make_condor_name(ininame)
-    condor_stream = open(condor_script,'w')
-    condor_stream.write(script % context)
-    condor_stream.close()
+    if opts.host is None or opts.apiid is None or opts.apikey is None:
+        parser.error("Please specify host url, apiid, apikey")
 
-def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
-    """
-    Make ddf files, and bonus condor file
-    """
-    curdir = os.getcwd()
-    if outdir is not None:
-        os.chdir(outdir)
-    output = sys.stdout
-    ddf_name = None
-    if guess_ddf:
-        ddf_name = make_ddf_name(ininame)
-        print ddf_name
-        output = open(ddf_name,'w')
+    if len(args) == 0:
+        parser.error("I need at least one library submission-dir input file")
+        
+    library_result_map = []
+    for a in args:
+        library_result_map.extend(read_library_result_map(a))
 
-    file_list = read_ddf_ini(ininame, output)
+    if opts.daf is not None:
+        link_daf(opts.daf, library_result_map)
 
-    file_list.append(daf_name)
-    if ddf_name is not None:
-        file_list.append(ddf_name)
+    if opts.fastq:
+        build_fastqs(opts.host, 
+                     apidata, 
+                     opts.sequence, 
+                     library_result_map,
+                     not opts.single)
 
-    if make_condor:
-        make_condor_archive_script(ininame, file_list)
-        
-    os.chdir(curdir)
+    if opts.ini:
+        make_submission_ini(opts.host, apidata, library_result_map, not opts.single)
 
+    if opts.makeddf:
+        make_all_ddfs(library_result_map, opts.daf)
 
-def get_library_info(host, apidata, library_id):
-    url = api.library_url(host, library_id)
-    contents = api.retrieve_info(url, apidata)
-    return contents
+def make_parser():
+    # Load defaults from the config files
+    config = SafeConfigParser()
+    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])
     
-def read_library_result_map(filename):
-    stream = open(filename,'r')
-
-    results = []
-    for line in stream:
-        if not line.startswith('#'):
-            library_id, result_dir = line.strip().split()
-            results.append((library_id, result_dir))
-    return results
+    sequence_archive = None
+    apiid = None
+    apikey = None
+    apihost = None
+    SECTION = 'sequence_archive'
+    if config.has_section(SECTION):
+        sequence_archive = config.get(SECTION, 'sequence_archive',sequence_archive)
+        sequence_archive = os.path.expanduser(sequence_archive)
+        apiid = config.get(SECTION, 'apiid', apiid)
+        apikey = config.get(SECTION, 'apikey', apikey)
+        apihost = config.get(SECTION, 'host', apihost)
 
-def condor_srf_to_fastq(srf_file, target_pathname):
-    script = """output=%(target_pathname)s
-arguments="-c %(srf_file)s"
-queue
-"""
-    params = {'srf_file': srf_file,
-              'target_pathname': target_pathname}
-    
-    return  script % params
+    parser = OptionParser()
 
-def condor_qseq_to_fastq(qseq_file, target_pathname):
-    script = """
-arguments="-i %(qseq_file)s -o %(target_pathname)s"
-queue
-"""
-    params = {'qseq_file': qseq_file,
-              'target_pathname': target_pathname}
-    
-    return script % params
+    # commands
+    parser.add_option('--fastq', help="generate scripts for making fastq files",
+                      default=False, action="store_true")
 
-def find_archive_sequence_files(host, apidata, sequences_path, 
-                                library_result_map):
-    """
-    Find all the archive sequence files possibly associated with our results.
+    parser.add_option('--ini', help="generate submission ini file", default=False,
+                      action="store_true")
 
-    """
-    logging.debug("Searching for sequence files in: %s" %(sequences_path,))
+    parser.add_option('--makeddf', help='make the ddfs', default=False,
+                      action="store_true")
+    
+    parser.add_option('--daf', default=None, help='specify daf name')
 
-    lib_db = {}
-    seq_dirs = set()
-    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
-    candidate_lanes = {}
-    for lib_id, result_dir in library_result_map:
-        lib_info = get_library_info(host, apidata, lib_id)
-        lib_db[lib_id] = lib_info
+    # configuration options
+    parser.add_option('--apiid', default=apiid, help="Specify API ID")
+    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
+    parser.add_option('--host',  default=apihost,
+                      help="specify HTSWorkflow host",)
+    parser.add_option('--sequence', default=sequence_archive,
+                      help="sequence repository")
+    parser.add_option('--single', default=False, action="store_true", 
+                      help="treat the sequences as single ended runs")
 
-        for lane in lib_info['lane_set']:
-            lane_key = (lane['flowcell'], lane['lane_number'])
-            candidate_lanes[lane_key] = lib_id
-            seq_dirs.add(os.path.join(sequences_path, 
-                                         'flowcells', 
-                                         lane['flowcell']))
-    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
-    candidate_seq_list = scan_for_sequences(seq_dirs)
+    # debugging
+    parser.add_option('--verbose', default=False, action="store_true",
+                      help='verbose logging')
+    parser.add_option('--debug', default=False, action="store_true",
+                      help='debug logging')
 
-    # at this point we have too many sequences as scan_for_sequences
-    # returns all the sequences in a flowcell directory
-    # so lets filter out the extras
-    
-    for seq in candidate_seq_list:
-        lane_key = (seq.flowcell, seq.lane)
-        lib_id = candidate_lanes.get(lane_key, None)
-        if lib_id is not None:
-            lib_info = lib_db[lib_id]
-            lanes = lib_info.setdefault('lanes', {})
-            lanes.setdefault(lane_key, set()).add(seq)
-    
-    return lib_db
+    return parser
 
 def build_fastqs(host, apidata, sequences_path, library_result_map, 
                  paired=True ):
@@ -243,19 +126,20 @@ def build_fastqs(host, apidata, sequences_path, library_result_map,
     """
     qseq_condor_header = """
 Universe=vanilla
-executable=/woldlab/rattus/lvol0/mus/home/diane/proj/gaworkflow/scripts/qseq2fastq
-error=qseqfastq.err.$(process).log
-output=qseqfastq.out.$(process).log
-log=qseqfastq.log
+executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/qseq2fastq
+error=log/qseq2fastq.err.$(process).log
+output=log/qseq2fastq.out.$(process).log
+log=log/qseq2fastq.log
 
 """
     qseq_condor_entries = []
     srf_condor_header = """
 Universe=vanilla
-executable=/woldlab/rattus/lvol0/mus/home/diane/bin/srf2fastq
-output=srf2fastq.out.$(process).log
-error=srf2fastq.err.$(process).log
-log=srffastq.log
+executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2named_fastq.py
+output=log/srf_pair_fastq.out.$(process).log
+error=log/srf_pair_fastq.err.$(process).log
+log=log/srf_pair_fastq.log
+environment="PYTHONPATH=/home/diane/lib/python2.6/site-packages:/home/diane/proj/solexa/gaworkflow PATH=/woldlab/rattus/lvol0/mus/home/diane/bin:/usr/bin:/bin"
 
 """
     srf_condor_entries = []
@@ -272,6 +156,8 @@ log=srffastq.log
         lib = lib_db[lib_id]
         for lane_key, sequences in lib['lanes'].items():
             for seq in sequences:
+                if paired and seq.read is None:
+                    seq.read = 1
                 filename_attributes = { 
                     'flowcell': seq.flowcell,
                     'lib_id': lib_id,
@@ -286,6 +172,10 @@ log=srffastq.log
                 if seq.flowcell == '30CUUAAXX':
                     # 30CUUAAXX run sucked
                     continue
+                if seq.flowcell == '30DY0AAXX':
+                    # 30DY0 only ran for 151 bases instead of 152
+                    # it is actually 76 1st read, 75 2nd read
+                    seq.mid_point = 76
 
                 # end filters
                 if paired:
@@ -304,18 +194,20 @@ log=srffastq.log
         if available_sources.has_key('qseq'):
             source = available_sources['qseq']
             qseq_condor_entries.append(
-                condor_qseq_to_fastq(source.path, target_pathname)
+                condor_qseq_to_fastq(source.path, 
+                                     target_pathname, 
+                                     source.flowcell)
             )
         elif available_sources.has_key('srf'):
             source = available_sources['srf']
-            if source.read is not None:
-                logging.warn(
-                    "srf -> fastq paired end doesn't work yet: %s" % (source,)
-                )
-            else:
-                srf_condor_entries.append(
-                    condor_srf_to_fastq(source.path, target_pathname)
-                )
+            mid = getattr(source, 'mid_point', None)
+            srf_condor_entries.append(
+                condor_srf_to_fastq(source.path, 
+                                    target_pathname,
+                                    paired,
+                                    source.flowcell,
+                                    mid)
+            )
         else:
             print " need file", target_pathname
 
@@ -329,71 +221,62 @@ log=srffastq.log
                            qseq_condor_header,
                            qseq_condor_entries)
 
-def find_best_extension(extension_map, filename):
-    """
-    Search through extension_map looking for the best extension
-    The 'best' is the longest match
+def link_daf(daf_path, library_result_map):
+    if not os.path.exists(daf_path):
+        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))
 
-    :Args:
-      extension_map (dict): '.ext' -> { 'view': 'name' or None }
-      filename (str): the filename whose extension we are about to examine
-    """
-    best_ext = None
-    path, last_ext = os.path.splitext(filename)
+    base_daf = os.path.basename(daf_path)
+    
+    for lib_id, result_dir in library_result_map:
+        submission_daf = os.path.join(result_dir, base_daf)
+        if not os.path.exists(submission_daf):
+            os.link(daf_path, submission_daf)
 
-    for ext in extension_map.keys():
-        if filename.endswith(ext):
-            if best_ext is None:
-                best_ext = ext
-            elif len(ext) > len(best_ext):
-                best_ext = ext
-    return best_ext
+def make_submission_ini(host, apidata, library_result_map, paired=True):
+    # ma is "map algorithm"
+    ma = 'TH1014'
+
+    if paired:
+        aligns = "Paired"
+    else:
+        aligns = "Aligns"
 
-def make_submission_section(line_counter, files, standard_attributes, file_attributes):
-    """
-    Create a section in the submission ini file
-    """
-    inifile = [ '[line%s]' % (line_counter,) ]
-    inifile += ["files=%s" % (",".join(files))]
-    cur_attributes = {}
-    cur_attributes.update(standard_attributes)
-    cur_attributes.update(file_attributes)
-    
-    for k,v in cur_attributes.items():
-        inifile += ["%s=%s" % (k,v)]
-    return inifile
-    
-def make_submission_ini(host, apidata, library_result_map):
     attributes = {
-        '.bai':                   {'view': None}, # bam index
-        '.bam':                   {'view': 'Signal'},
-        '.splices.bam':           {'view': 'Splices'},
-        '.bed':                   {'view': 'TopHatJunctions'},
-        '.bigwig':                {'view': 'RawSigna'},
-        '.tar.bz2':               {'view': None},
-        '.condor':                {'view': None},
-        '.daf':                   {'view': None},
-        '.ddf':                   {'view': None},
-        'novel.genes.expr':       {'view': 'GeneDeNovoFPKM'},
-        'novel.transcripts.expr': {'view': 'TranscriptDeNovoFPKM'},
-        '.genes.expr':            {'view': 'GeneFPKM'},
-        '.transcripts.expr':      {'view': 'TranscriptFPKM'},
-        '.fastq':                 {'view': 'Fastq' },
-        '_r1.fastq':              {'view': 'FastqRd1'},
-        '_r2.fastq':              {'view': 'FastqRd2'},
-        '.gtf':                   {'view': 'CufflinksGeneModel'},
-        '.ini':                   {'view': None},
-        '.log':                   {'view': None},
-        '.stats.txt':             {'view': 'InsLength'},
-        '.srf':                   {'view': None},
-        '.wig':                   {'view': 'RawSignal'},
+      # bam index
+     '.bai':                   {'view': None, 'MapAlgorithm': 'NA'},
+     '.bam':                   {'view': aligns, 'MapAlgorithm': ma},
+     '.splices.bam':           {'view': 'Splices', 'MapAlgorithm': ma},
+     '.jnct':                  {'view': 'Junctions', 'MapAlgorithm': ma},
+     '.plus.bigwig':           {'view': 'PlusSignal', 'MapAlgorithm': ma},
+     '.minus.bigwig':          {'view': 'MinusSignal', 'MapAlgorithm': ma},
+     '.bigwig':                {'view': 'Signal', 'MapAlgorithm': ma},
+     '.tar.bz2':               {'view': None},
+     '.condor':                {'view': None},
+     '.daf':                   {'view': None},
+     '.ddf':                   {'view': None},
+     'novel.genes.expr':       {'view': 'GeneDeNovo', 'MapAlgorithm': ma},
+     'novel.transcripts.expr': {'view': 'TranscriptDeNovo', 'MapAlgorithm': ma},
+     '.genes.expr':            {'view': 'GeneFPKM', 'MapAlgorithm': ma},
+     '.transcripts.expr':      {'view': 'TranscriptFPKM', 'MapAlgorithm': ma},
+     '.fastq':                 {'view': 'Fastq', 'MapAlgorithm': 'NA' },
+     '_r1.fastq':              {'view': 'FastqRd1', 'MapAlgorithm': 'NA'},
+     '_r2.fastq':              {'view': 'FastqRd2', 'MapAlgorithm': 'NA'},
+     '.gtf':                   {'view': 'GeneModel', 'MapAlgorithm': ma},
+     '.ini':                   {'view': None},
+     '.log':                   {'view': None},
+     '.stats.txt':             {'view': 'InsLength', 'MapAlgorithm': ma},
+     '.srf':                   {'view': None},
+     '.wig':                   {'view': None},
+     '.zip':                   {'view': None},
     }
    
     candidate_fastq_src = {}
 
     for lib_id, result_dir in library_result_map:
+        order_by = ['order_by=files', 'view', 'replicate', 'cell', 
+                    'readType', 'mapAlgorithm', 'insertLength' ]
         inifile =  ['[config]']
-        inifile += ['order_by=files view cell localization rnaExtract mapAlgorithm readType replicate labVersion']
+        inifile += [" ".join(order_by)]
         inifile += ['']
         line_counter = 1
         lib_info = get_library_info(host, apidata, lib_id)
@@ -403,14 +286,20 @@ def make_submission_ini(host, apidata, library_result_map):
             logging.warn("Library %s missing cell_line" % (lib_id,))
 
         standard_attributes = {'cell': lib_info['cell_line'],
-                               'insertLength': '200', 
-                               'labVersion': 'TopHat',
-                               'localization': 'cell',
-                               'mapAlgorithm': 'TopHat',
-                               'readType': '2x75', 
                                'replicate': lib_info['replicate'],
-                               'rnaExtract': 'longPolyA',
                                }
+        if paired:
+            if lib_info['insert_size'] is None:
+                errmsg = "Library %s is missing insert_size, assuming 200"
+                logging.warn(errmsg % (lib_id,))
+                insert_size = 200
+            else:
+                insert_size = lib_info['insert_size']
+            standard_attributes['insertLength'] = insert_size
+            standard_attributes['readType'] = '2x75'
+        else:
+            standard_attributes['insertLength'] = 'ilNA'
+            standard_attributes['readType'] = '1x75D'
 
         # write other lines
         submission_files = os.listdir(result_dir)
@@ -451,112 +340,348 @@ def make_submission_ini(host, apidata, library_result_map):
         f = open(result_ini,'w')
         f.write(os.linesep.join(inifile))
 
-def link_daf(daf_path, library_result_map):
-    if not os.path.exists(daf_path):
-        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))
-
-    base_daf = os.path.basename(daf_path)
-    
-    for lib_id, result_dir in library_result_map:
-        submission_daf = os.path.join(result_dir, base_daf)
-        if not os.path.exists(submission_daf):
-            os.link(daf_path, submission_daf)
 
-def make_all_ddfs(library_result_map, daf_name):
+def make_all_ddfs(library_result_map, daf_name, make_condor=True):
+    dag_fragment = []
     for lib_id, result_dir in library_result_map:
         ininame = result_dir+'.ini'
         inipathname = os.path.join(result_dir, ininame)
         if os.path.exists(inipathname):
-            make_ddf(ininame, daf_name, True, True, result_dir)
+            dag_fragment.extend(
+                make_ddf(ininame, daf_name, True, make_condor, result_dir)
+            )
+
+    if make_condor and len(dag_fragment) > 0:
+        dag_filename = 'submission.dagman'
+        if os.path.exists(dag_filename):
+            logging.warn("%s exists, please delete" % (dag_filename,))
+        else:
+            f = open(dag_filename,'w')
+            f.write( os.linesep.join(dag_fragment))
+            f.write( os.linesep )
+            f.close()
             
-def make_parser():
-    # Load defaults from the config files
+
+def make_ddf(ininame,  daf_name, guess_ddf=False, make_condor=False, outdir=None):
+    """
+    Make ddf files, and bonus condor file
+    """
+    dag_fragments = []
+    curdir = os.getcwd()
+    if outdir is not None:
+        os.chdir(outdir)
+    output = sys.stdout
+    ddf_name = None
+    if guess_ddf:
+        ddf_name = make_ddf_name(ininame)
+        print ddf_name
+        output = open(ddf_name,'w')
+
+    file_list = read_ddf_ini(ininame, output)
+
+    file_list.append(daf_name)
+    if ddf_name is not None:
+        file_list.append(ddf_name)
+
+    if make_condor:
+        archive_condor = make_condor_archive_script(ininame, file_list)
+        upload_condor = make_condor_upload_script(ininame)
+        
+        dag_fragments.extend( 
+            make_dag_fragment(ininame, archive_condor, upload_condor)
+        ) 
+        
+    os.chdir(curdir)
+    
+    return dag_fragments
+
+def read_ddf_ini(filename, output=sys.stdout):
+    """
+    Read an ini file and dump out a tab-delimited text file
+    """
+    file_list = []
     config = SafeConfigParser()
-    config.read([os.path.expanduser('~/.htsworkflow.ini'), '/etc/htsworkflow.ini'])
+    config.read(filename)
+
+    order_by = shlex.split(config.get("config", "order_by"))
+
+    output.write("\t".join(order_by))
+    output.write(os.linesep)
+    sections = config.sections()
+    sections.sort()
+    for section in sections:
+        if section == "config":
+            # skip the config block
+            continue
+        values = []
+        for key in order_by:
+            v = config.get(section, key)
+            values.append(v)
+            if key == 'files':
+                file_list.extend(parse_filelist(v))
+                
+        output.write("\t".join(values))
+        output.write(os.linesep)
+    return file_list
     
-    sequence_archive = None
-    apiid = None
-    apikey = None
-    apihost = None
-    SECTION = 'sequence_archive'
-    if config.has_section(SECTION):
-        sequence_archive = config.get(SECTION, 'sequence_archive',sequence_archive)
-        sequence_archive = os.path.expanduser(sequence_archive)
-        apiid = config.get(SECTION, 'apiid', apiid)
-        apikey = config.get(SECTION, 'apikey', apikey)
-        apihost = config.get(SECTION, 'host', apihost)
+def read_library_result_map(filename):
+    """
+    Read a file that maps library id to result directory.
+    Does not support spaces in filenames. 
+    
+    For example:
+      10000 result/foo/bar
+    """
+    stream = open(filename,'r')
 
-    parser = OptionParser()
+    results = []
+    for line in stream:
+        if not line.startswith('#'):
+            library_id, result_dir = line.strip().split()
+            results.append((library_id, result_dir))
+    return results
+            
+def make_condor_archive_script(ininame, files):
+    script = """Universe = vanilla
 
-    # commands
-    parser.add_option('--fastq', help="generate scripts for making fastq files",
-                      default=False, action="store_true")
+Executable = /bin/tar
+arguments = czvf ../%(archivename)s %(filelist)s
 
-    parser.add_option('--ini', help="generate submission ini file", default=False,
-                      action="store_true")
+Error = compress.err.$(Process).log
+Output = compress.out.$(Process).log
+Log = /tmp/submission-compress.log
+initialdir = %(initialdir)s
 
-    parser.add_option('--makeddf', help='make the ddfs', default=False,
-                      action="store_true")
+queue 
+"""
+    for f in files:
+        if not os.path.exists(f):
+            raise RuntimeError("Missing %s" % (f,))
+
+    context = {'archivename': make_submission_name(ininame),
+               'filelist': " ".join(files),
+               'initialdir': os.getcwd()}
+
+    condor_script = make_condor_name(ininame, 'archive')
+    condor_stream = open(condor_script,'w')
+    condor_stream.write(script % context)
+    condor_stream.close()
+    return condor_script
+
+def make_condor_upload_script(ininame):
+    script = """Universe = vanilla
+
+Executable = /usr/bin/lftp
+arguments = -c put ../%(archivename)s -o ftp://detrout@encodeftp.cse.ucsc.edu/
+
+Error = upload.err.$(Process).log
+Output = upload.out.$(Process).log
+Log = /tmp/submission-upload.log
+initialdir = %(initialdir)s
+
+queue 
+"""
+    context = {'archivename': make_submission_name(ininame),
+               'initialdir': os.getcwd()}
+
+    condor_script = make_condor_name(ininame, 'upload')
+    condor_stream = open(condor_script,'w')
+    condor_stream.write(script % context)
+    condor_stream.close()
+    return condor_script
+
+def make_dag_fragment(ininame, archive_condor, upload_condor):
+    """
+    Make the pair of dag fragments that compress and then upload the data.
+    """
+    cur_dir = os.getcwd()
+    archive_condor = os.path.join(cur_dir, archive_condor)
+    upload_condor = os.path.join(cur_dir, upload_condor)
+    job_basename = make_base_name(ininame)
+
+    fragments = []
+    fragments.append('JOB %s_archive %s' % (job_basename, archive_condor))
+    fragments.append('JOB %s_upload %s' % (job_basename,  upload_condor))
+    fragments.append('PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename))
+
+    return fragments
+
+def get_library_info(host, apidata, library_id):
+    url = api.library_url(host, library_id)
+    contents = api.retrieve_info(url, apidata)
+    return contents
+
+def condor_srf_to_fastq(srf_file, target_pathname, paired, flowcell=None,
+                        mid=None):
+    args = ['-c', srf_file, ]
+    if paired:
+        args.extend(['--left', target_pathname])
+        # this is ugly. I did it because I was pregenerating the target
+        # names before I tried to figure out what sources could generate
+        # those targets, and everything up to this point had been
+        # one-to-one. So I couldn't figure out how to pair the 
+        # target names. 
+        # With this at least the command will run correctly.
+        # however if we rename the default targets, this'll break
+        # also I think it'll generate it twice.
+        args.extend(['--right', 
+                     target_pathname.replace('_r1.fastq', '_r2.fastq')])
+    else:
+        args.extend(['--single', target_pathname ])
+    if flowcell is not None:
+        args.extend(['--flowcell', flowcell])
+
+    if mid is not None:
+        args.extend(['-m', str(mid)])
+
+    script = """
+arguments="%s"
+queue
+""" % (" ".join(args),)
     
-    parser.add_option('--daf', default=None, help='specify daf name')
+    return  script 
 
-    # configuration options
-    parser.add_option('--apiid', default=apiid, help="Specify API ID")
-    parser.add_option('--apikey', default=apikey, help="Specify API KEY")
-    parser.add_option('--host',  default=apihost,
-                      help="specify HTSWorkflow host",)
-    parser.add_option('--sequence', default=sequence_archive,
-                      help="sequence repository")
-    parser.add_option('--single', default=False, action="store_true", 
-                      help="treat the sequences as single ended runs")
+def condor_qseq_to_fastq(qseq_file, target_pathname, flowcell=None):
+    args = ['-i', qseq_file, '-o', target_pathname ]
+    if flowcell is not None:
+        args.extend(['-f', flowcell])
+    script = """
+arguments="%s"
+queue
+""" % (" ".join(args))
 
-    # debugging
-    parser.add_option('--verbose', default=False, action="store_true",
-                      help='verbose logging')
-    parser.add_option('--debug', default=False, action="store_true",
-                      help='debug logging')
+    return script 
 
-    return parser
+def find_archive_sequence_files(host, apidata, sequences_path, 
+                                library_result_map):
+    """
+    Find all the archive sequence files possibly associated with our results.
 
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
+    """
+    logging.debug("Searching for sequence files in: %s" %(sequences_path,))
+
+    lib_db = {}
+    seq_dirs = set()
+    #seq_dirs = set(os.path.join(sequences_path, 'srfs'))
+    candidate_lanes = {}
+    for lib_id, result_dir in library_result_map:
+        lib_info = get_library_info(host, apidata, lib_id)
+        lib_db[lib_id] = lib_info
+
+        for lane in lib_info['lane_set']:
+            lane_key = (lane['flowcell'], lane['lane_number'])
+            candidate_lanes[lane_key] = lib_id
+            seq_dirs.add(os.path.join(sequences_path, 
+                                         'flowcells', 
+                                         lane['flowcell']))
+    logging.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
+    candidate_seq_list = scan_for_sequences(seq_dirs)
+
+    # at this point we have too many sequences as scan_for_sequences
+    # returns all the sequences in a flowcell directory
+    # so lets filter out the extras
     
-    if opts.debug:
-        logging.basicConfig(level = logging.DEBUG )
-    elif opts.verbose:
-        logging.basicConfig(level = logging.INFO )
-    else:
-        logging.basicConfig(level = logging.WARNING )
-        
+    for seq in candidate_seq_list:
+        lane_key = (seq.flowcell, seq.lane)
+        lib_id = candidate_lanes.get(lane_key, None)
+        if lib_id is not None:
+            lib_info = lib_db[lib_id]
+            lanes = lib_info.setdefault('lanes', {})
+            lanes.setdefault(lane_key, set()).add(seq)
     
-    apidata = {'apiid': opts.apiid, 'apikey': opts.apikey }
+    return lib_db
 
-    if opts.host is None or opts.apiid is None or opts.apikey is None:
-        parser.error("Please specify host url, apiid, apikey")
+def find_best_extension(extension_map, filename):
+    """
+    Search through extension_map looking for the best extension
+    The 'best' is the longest match
 
-    if len(args) == 0:
-        parser.error("I need at least one library submission-dir input file")
-        
-    library_result_map = []
-    for a in args:
-        library_result_map.extend(read_library_result_map(a))
+    :Args:
+      extension_map (dict): '.ext' -> { 'view': 'name' or None }
+      filename (str): the filename whose extension we are about to examine
+    """
+    best_ext = None
+    path, last_ext = os.path.splitext(filename)
 
-    if opts.daf is not None:
-        link_daf(opts.daf, library_result_map)
+    for ext in extension_map.keys():
+        if filename.endswith(ext):
+            if best_ext is None:
+                best_ext = ext
+            elif len(ext) > len(best_ext):
+                best_ext = ext
+    return best_ext
+    
+def make_submission_section(line_counter, files, standard_attributes, file_attributes):
+    """
+    Create a section in the submission ini file
+    """
+    inifile = [ '[line%s]' % (line_counter,) ]
+    inifile += ["files=%s" % (",".join(files))]
+    cur_attributes = {}
+    cur_attributes.update(standard_attributes)
+    cur_attributes.update(file_attributes)
+    
+    for k,v in cur_attributes.items():
+        inifile += ["%s=%s" % (k,v)]
+    return inifile
 
-    if opts.fastq:
-        build_fastqs(opts.host, 
-                     apidata, 
-                     opts.sequence, 
-                     library_result_map,
-                     not opts.single)
+def make_base_name(pathname):
+    base = os.path.basename(pathname)
+    name, ext = os.path.splitext(base)
+    return name
 
-    if opts.ini:
-        make_submission_ini(opts.host, apidata, library_result_map)
+def make_submission_name(ininame):
+    name = make_base_name(ininame)
+    return name + '.tgz'
 
-    if opts.makeddf:
-        make_all_ddfs(library_result_map, opts.daf)
+def make_ddf_name(pathname):
+    name = make_base_name(pathname)
+    return name + '.ddf'
+
+def make_condor_name(pathname, run_type=None):
+    name = make_base_name(pathname)
+    elements = [name]
+    if run_type is not None:
+        elements.append(run_type)
+    elements.append('condor')
+    return ".".join(elements)
+    
+def make_submit_script(target, header, body_list):
+    """
+    write out a text file
+
+    this was intended for condor submit scripts
+    
+    Args:
+      target (str or stream): 
+        if target is a string, we will open and close the file
+        if target is a stream, the caller is responsible.
+
+      header (str):
+        header to write at the beginning of the file
+      body_list (list of strs):
+        a list of blocks to add to the file.
+    """
+    if type(target) in types.StringTypes:
+        f = open(target,'w')
+    else:
+        f = target
+    f.write(header)
+    for entry in body_list:
+        f.write(entry)
+    if type(target) in types.StringTypes:
+        f.close()
+
+def parse_filelist(file_string):
+    return file_string.split(',')
+
+def validate_filelist(files):
+    """
+    Die if a file doesn't exist in a file list
+    """
+    for f in files:
+        if not os.path.exists(f):
+            raise RuntimeError("%s does not exist" % (f,))
         
 if __name__ == "__main__":
     main()
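
With this version a full submission pass looks something like the
following; the host url, credentials, daf name, and library map
file are all placeholders:

    ucsc_gather.py --host http://htsw.example.edu \
                   --apiid myid --apikey mykey \
                   --daf mylab.daf --fastq --ini --makeddf \
                   libraries.txt

where libraries.txt maps library ids to result directories, one
"10000 result/foo/bar" style line per library.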