Update pipeline.gerald to handle eland_result files that have been bzipped.
author    Diane Trout <diane@caltech.edu>
Fri, 23 May 2008 21:33:07 +0000 (21:33 +0000)
committer    Diane Trout <diane@caltech.edu>
Fri, 23 May 2008 21:33:07 +0000 (21:33 +0000)
Also, I added my opener module, which tries to guess the right
compression utility for a file.
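(A minimal usage sketch, not part of this commit: the eland result parser
can now iterate over a possibly compressed result file the same way it did
over a plain text file.  The path below is hypothetical.)

    from gaworkflow.util.opener import autoopen

    # autoopen picks bz2, gzip, or plain open() based on the file extension,
    # so the same loop handles *_eland_result.txt and *_eland_result.txt.bz2
    reads = 0
    for line in autoopen('s_1_eland_result.txt.bz2', 'r'):
        reads += 1
    print reads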

gaworkflow/pipeline/gerald.py
gaworkflow/util/opener.py [new file with mode: 0644]

index 83646075355b1d9e565c514ecb4acceb18aae5e6..1a6bce18cff7eb45f217bbb8133faaa39a8250d3 100644 (file)
@@ -14,6 +14,7 @@ from gaworkflow.pipeline.runfolder import \
    LANES_PER_FLOWCELL, \
    VERSION_RE
 from gaworkflow.util.ethelp import indent, flatten
+from gaworkflow.util.opener import autoopen
 
 class Gerald(object):
     """
@@ -419,7 +420,7 @@ class ElandLane(object):
                        'U0':0, 'U1':0, 'U2':0,
                        'R0':0, 'R1':0, 'R2':0,
                       }
-        for line in open(self.pathname):
+        for line in autoopen(self.pathname,'r'):
             reads += 1
             fields = line.split()
             # code = fields[2]
@@ -601,7 +602,13 @@ class ELAND(object):
 
 def eland(basedir, gerald=None, genome_maps=None):
     e = ELAND()
-    for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
+
+    file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
+    if len(file_list) == 0:
+        # let's handle compressed eland files too
+        file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
+
+    for pathname in file_list:
         # yes the lane_id is also being computed in ElandLane._update
         # I didn't want to clutter up my constructor
         # but I needed to persist the sample_name/lane_id for
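
(For reference, the fallback above is equivalent to this small helper; a
sketch only, this commit does not add a find_eland_results function.)

    import os
    from glob import glob

    def find_eland_results(basedir):
        # prefer uncompressed eland results, fall back to bzip2-compressed ones
        file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
        if len(file_list) == 0:
            file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
        return file_list
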
diff --git a/gaworkflow/util/opener.py b/gaworkflow/util/opener.py
new file mode 100644 (file)
index 0000000..035bb24
--- /dev/null
+++ b/gaworkflow/util/opener.py
@@ -0,0 +1,57 @@
+"""
+Helpful utilities for turning random names/objects into streams.
+"""
+import os
+import gzip
+import bz2
+import types
+import urllib2
+
+def isfilelike(file_ref, mode):
+    """Does file_ref have the core file operations?
+    """
+    # if mode is 'w' or 'a', check that the writable operations exist,
+    # but always check that we can read
+    read_operations = ['read', 'readline', 'readlines']
+    write_operations = [ 'write', 'writelines' ]
+    #random_operations = [ 'seek', 'tell' ]
+    if mode[0] in ('w', 'a'):
+        for o in write_operations:
+            if not hasattr(file_ref, o):
+                return False
+    for o in read_operations:
+        if not hasattr(file_ref, o):
+            return False
+
+    return True
+
+def isurllike(file_ref, mode):
+    """
+    does file_ref look like a url?
+    (AKA does it start with protocol:// ?)
+    """
+    #what if mode is 'w'?
+    parsed = urllib2.urlparse.urlparse(file_ref)
+    schema, netloc, path, params, query, fragment = parsed
+    
+    return len(schema) > 0
+
+def autoopen(file_ref, mode='r'):
+    """
+    Attempt to intelligently turn file_ref into a readable stream
+    """
+    # catch being passed a file
+    if type(file_ref) is types.FileType:
+        return file_ref
+    # does it look like a file?
+    elif isfilelike(file_ref, mode):
+        return file_ref
+    elif isurllike(file_ref, mode):
+        return urllib2.urlopen(file_ref)
+    elif os.path.splitext(file_ref)[1] == ".gz":
+        return gzip.open(file_ref, mode)
+    elif os.path.splitext(file_ref)[1] == '.bz2':
+        return bz2.BZ2File(file_ref, mode)
+    else:
+        return open(file_ref,mode)
+
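
(A quick sketch of how autoopen dispatches on its argument; the filenames
and URL below are hypothetical.)

    from gaworkflow.util.opener import autoopen

    plain = autoopen('results.txt')           # falls through to open()
    gz    = autoopen('results.txt.gz')        # gzip.open()
    bz    = autoopen('results.txt.bz2')       # bz2.BZ2File()
    url   = autoopen('http://example.com/results.txt')  # urllib2.urlopen()
    f     = autoopen(open('results.txt'))     # file objects are returned as-is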