From: Diane Trout <diane@caltech.edu>
Date: Fri, 23 May 2008 21:33:07 +0000 (+0000)
Subject: Update pipeline.gerald to handle eland_result files that have been bzipped.
X-Git-Tag: stanford.caltech-merged-database-2009-jan-15~59
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=5ff9e196b8377adb2034f77566ce608a385290fc

Update pipeline.gerald to handle eland_result files that have been bzipped.
Also add my opener module, which tries to guess the right
compression utility for a file.
---

diff --git a/gaworkflow/pipeline/gerald.py b/gaworkflow/pipeline/gerald.py
index 8364607..1a6bce1 100644
--- a/gaworkflow/pipeline/gerald.py
+++ b/gaworkflow/pipeline/gerald.py
@@ -14,6 +14,7 @@ from gaworkflow.pipeline.runfolder import \
    LANES_PER_FLOWCELL, \
    VERSION_RE
 from gaworkflow.util.ethelp import indent, flatten
+from gaworkflow.util.opener import autoopen
 
 class Gerald(object):
     """
@@ -419,7 +420,7 @@ class ElandLane(object):
                        'U0':0, 'U1':0, 'U2':0,
                        'R0':0, 'R1':0, 'R2':0,
                       }
-        for line in open(self.pathname):
+        for line in autoopen(self.pathname,'r'):
             reads += 1
             fields = line.split()
             # code = fields[2]
@@ -601,7 +602,13 @@ class ELAND(object):
 
 def eland(basedir, gerald=None, genome_maps=None):
     e = ELAND()
-    for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
+
+    file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
+    if len(file_list) == 0:
+        # let's handle bzip2-compressed eland files too
+        file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
+
+    for pathname in file_list:
         # yes the lane_id is also being computed in ElandLane._update
         # I didn't want to clutter up my constructor
         # but I needed to persist the sample_name/lane_id for
diff --git a/gaworkflow/util/opener.py b/gaworkflow/util/opener.py
new file mode 100644
index 0000000..035bb24
--- /dev/null
+++ b/gaworkflow/util/opener.py
@@ -0,0 +1,57 @@
+"""
+Helpful utilities for turning random names/objects into streams.
+"""
+import os
+import gzip
+import bz2
+import types
+import urllib2
+
+def isfilelike(file_ref, mode):
+    """Does file_ref have the core file operations?
+    """
+    # if mode is w/a check to make sure we have writeable ops
+    # but always check to see if we can read
+    read_operations = ['read', 'readline', 'readlines']
+    write_operations = [ 'write', 'writelines' ]
+    #random_operations = [ 'seek', 'tell' ]
+    if mode[0] in ('w', 'a'):
+        for o in write_operations:
+            if not hasattr(file_ref, o):
+                return False
+    for o in read_operations:
+        if not hasattr(file_ref, o):
+            return False
+          
+    return True
+
+def isurllike(file_ref, mode):
+    """
+    does file_ref look like a url?
+    (AKA does urlparse find a non-empty scheme, e.g. protocol:// ?)
+    """
+    # TODO: mode is currently ignored here; what should happen if mode is 'w'?
+    parsed = urllib2.urlparse.urlparse(file_ref)
+    schema, netloc, path, params, query, fragment = parsed
+    
+    return len(schema) > 0
+
+def autoopen(file_ref, mode='r'):
+    """
+    Attempt to intelligently turn file_ref into a readable stream
+    """
+    # catch being passed a file
+    if type(file_ref) is types.FileType:
+        return file_ref
+    # does it look like a file?
+    elif isfilelike(file_ref, mode):
+        return file_ref
+    elif isurllike(file_ref, mode):
+        return urllib2.urlopen(file_ref)
+    elif os.path.splitext(file_ref)[1] == ".gz":
+        return gzip.open(file_ref, mode)
+    elif os.path.splitext(file_ref)[1] == '.bz2':
+        return bz2.BZ2File(file_ref, mode)
+    else:
+        return open(file_ref,mode)
+