add rerun_eland.py which extracts sub-sequences from eland files and runs
authorDiane Trout <diane@caltech.edu>
Thu, 15 May 2008 00:32:47 +0000 (00:32 +0000)
committerDiane Trout <diane@caltech.edu>
Thu, 15 May 2008 00:32:47 +0000 (00:32 +0000)
eland on them with a new sequence length.

The script also helpfully uses the gerald config file to figure out the
correct genome path.

gaworkflow/pipeline/gerald.py
scripts/elandseq
scripts/rerun_eland.py [new file with mode: 0644]

index e7e09de2f13a79b1b9c342a0342466bae973ca69..83646075355b1d9e565c514ecb4acceb18aae5e6 100644 (file)
@@ -537,6 +537,18 @@ class ElandLane(object):
             else:
                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
 
+def extract_eland_sequence(instream, outstream, start, end):
+    """
+    Extract a chunk of sequence out of an eland file
+    """
+    for line in instream:
+        record = line.split()
+        if len(record) > 1:
+            result = [record[0], record[1][start:end]]
+        else:
+            result = [record[0][start:end]]
+        outstream.write("\t".join(result))
+        outstream.write(os.linesep)
 
 class ELAND(object):
     """
index 1dedc382703e2e5498d34ee86a1c65b1ba488489..ece4bf3acbd8b71f62a3135bff602b48c888b2eb 100755 (executable)
@@ -3,16 +3,7 @@ import optparse
 import os
 import sys
 
-def extract_sequence(instream, outstream, start, end):
-  for line in instream:
-    record = line.split()
-    if len(record) > 1:
-      result = [record[0], record[1][start:end]]
-    else:
-      result = [record[0][start:end]]
-    outstream.write("\t".join(result))
-    outstream.write(os.linesep)
-      
+from gaworkflow.pipeline.gerald import extract_eland_sequence
 
 def make_parser():
   usage = "usage: %prog [options] infile [outfile]"
@@ -53,7 +44,7 @@ def main(argv):
   else:
     outstream = sys.stdout
 
-  extract_sequence(instream, outstream, start, end)
+  extract_eland_sequence(instream, outstream, start, end)
 
 if __name__ == "__main__":
     sys.exit(main(sys.argv[1:]))
diff --git a/scripts/rerun_eland.py b/scripts/rerun_eland.py
new file mode 100644 (file)
index 0000000..25aecc4
--- /dev/null
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+
+import logging
+from optparse import OptionParser
+import os
+import subprocess
+import sys
+
+from gaworkflow.pipeline import gerald
+
+def make_query_filename(eland_obj, output_dir):
+    query_name = '%s_%s_eland_query.txt' 
+    query_name %= (eland_obj.sample_name, eland_obj.lane_id)
+
+    query_pathname = os.path.join(output_dir, query_name)
+    
+    if os.path.exists(query_pathname):
+        logging.warn("overwriting %s" % (query_pathname,))
+
+    return query_pathname
+
+def make_result_filename(eland_obj, output_dir):
+    result_name = '%s_%s_eland_result.txt' 
+    result_name %= (eland_obj.sample_name, eland_obj.lane_id)
+
+    result_pathname = os.path.join(output_dir, result_name)
+    
+    if os.path.exists(result_pathname):
+        logging.warn("overwriting %s" % (result_pathname,))
+
+    return result_pathname
+
+def extract_sequence(inpathname, query_pathname, length, dry_run=False):
+    logging.info('extracting %d bases' %(length,))
+    logging.info('extracting from %s' %(inpathname,))
+    logging.info('extracting to %s' %(query_pathname,))
+    
+    if not dry_run: 
+        try:
+            instream = open(inpathname, 'r')
+            outstream = open(query_pathname, 'w')
+            gerald.extract_eland_sequence(instream, outstream, 0, length)
+        finally:
+            outstream.close()
+            instream.close()
+
+def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False):
+    cmdline = ['eland_%d' % (length,), query_name, genome, result_name]
+    if multi:
+        cmdline += ['--multi']
+
+    logging.info('running eland: ' + " ".join(cmdline))
+    if not dry_run:
+        return subprocess.Popen(cmdline)
+    else:
+        return None
+
+
+def rerun(gerald_dir, output_dir, length=25, dry_run=False):
+    """
+    look for eland files in gerald_dir and write a subset to output_dir
+    """
+    logging.info("Extracting %d bp from files in %s" % (length, gerald_dir))
+    g = gerald.gerald(gerald_dir)
+
+    processes = []
+    for lane_id, lane_param in g.lanes.items():
+        eland = g.eland_results[lane_id]
+
+        inpathname = eland.pathname
+        query_pathname = make_query_filename(eland, output_dir)
+        result_pathname = make_result_filename(eland, output_dir)
+
+        extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
+
+        p = run_eland(length, 
+                      query_pathname, 
+                      lane_param.eland_genome, 
+                      result_pathname, 
+                      dry_run=dry_run)
+        if p is not None:
+            processes.append(p)
+
+    for p in processes:
+        p.wait()
+        
+def make_parser():
+    usage = '%prog: --gerald <gerald dir> -o <new dir>'
+
+    parser = OptionParser(usage)
+    
+    parser.add_option('--gerald', 
+                      help='specify location of GERALD directory',
+                      default=None)
+    parser.add_option('-o', '--output',
+                      help='specify output location of files',
+                      default=None)
+    parser.add_option('-l', '--read-length', type='int',
+                      help='specify new eland length',
+                      dest='length',
+                      default=25)
+    parser.add_option('--dry-run', action='store_true',
+                      help='only pretend to run',
+                      default=False)
+    parser.add_option('-v', '--verbose', action='store_true',
+                      help='increase verbosity',
+                      default=False)
+
+    return parser
+
+
+def main(cmdline=None):
+    logging.basicConfig(level=logging.WARNING)
+
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.gerald is None:
+        parser.error("need gerald directory")
+    
+    if opts.output is None:
+        parser.error("specify location for the new eland files")
+
+    if opts.length < 16 or opts.length > 32:
+        parser.error("eland can only process reads in the range 16-32")
+
+    if opts.verbose:
+        root_logger = logging.getLogger()
+        root_logger.setLevel(logging.INFO)
+
+    rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)
+
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))