From: Diane Trout Date: Fri, 23 May 2008 21:33:07 +0000 (+0000) Subject: Update pipeline.gerald to handle eland_result files that have been bzipped. X-Git-Tag: stanford.caltech-merged-database-2009-jan-15~59 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=5ff9e196b8377adb2034f77566ce608a385290fc Update pipeline.gerald to handle eland_result files that have been bzipped. Also I added my opener module which will try to guess the right compression utility for a file. --- diff --git a/gaworkflow/pipeline/gerald.py b/gaworkflow/pipeline/gerald.py index 8364607..1a6bce1 100644 --- a/gaworkflow/pipeline/gerald.py +++ b/gaworkflow/pipeline/gerald.py @@ -14,6 +14,7 @@ from gaworkflow.pipeline.runfolder import \ LANES_PER_FLOWCELL, \ VERSION_RE from gaworkflow.util.ethelp import indent, flatten +from gaworkflow.util.opener import autoopen class Gerald(object): """ @@ -419,7 +420,7 @@ class ElandLane(object): 'U0':0, 'U1':0, 'U2':0, 'R0':0, 'R1':0, 'R2':0, } - for line in open(self.pathname): + for line in autoopen(self.pathname,'r'): reads += 1 fields = line.split() # code = fields[2] @@ -601,7 +602,13 @@ class ELAND(object): def eland(basedir, gerald=None, genome_maps=None): e = ELAND() - for pathname in glob(os.path.join(basedir, "*_eland_result.txt")): + + file_list = glob(os.path.join(basedir, "*_eland_result.txt")) + if len(file_list) == 0: + # lets handle compressed eland files too + file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2")) + + for pathname in file_list: # yes the lane_id is also being computed in ElandLane._update # I didn't want to clutter up my constructor # but I needed to persist the sample_name/lane_id for diff --git a/gaworkflow/util/opener.py b/gaworkflow/util/opener.py new file mode 100644 index 0000000..035bb24 --- /dev/null +++ b/gaworkflow/util/opener.py @@ -0,0 +1,57 @@ +""" +Helpful utilities for turning random names/objects into streams. 
import os
import gzip
import bz2
import types

try:
    # Python 2 layout
    import urllib2
    from urlparse import urlparse as _urlparse
except ImportError:
    # Python 3 split urllib2 into urllib.request / urllib.parse; keep the
    # ``urllib2`` name bound so the rest of the module reads the same.
    import urllib.request as urllib2
    from urllib.parse import urlparse as _urlparse


def isfilelike(file_ref, mode):
    """Return True if file_ref has the core file methods needed for mode.

    The read methods (read, readline, readlines) are always required.
    The write methods (write, writelines) are additionally required when
    mode starts with 'w' or 'a'.  seek/tell are deliberately not required.
    """
    required = ['read', 'readline', 'readlines']
    # mode[:1] rather than mode[0]: an empty mode string is treated as a
    # read-only request instead of raising IndexError.
    if mode[:1] in ('w', 'a'):
        required.extend(['write', 'writelines'])

    for operation in required:
        if not hasattr(file_ref, operation):
            return False
    return True


def isurllike(file_ref, mode):
    """Return True if file_ref is a string that starts with ``scheme://``.

    mode is accepted so the signature matches isfilelike, but it is not
    consulted.
    """
    try:
        scheme = _urlparse(file_ref)[0]
    except (AttributeError, TypeError):
        # not something urlparse can handle, so it cannot be a URL
        return False

    # A non-empty scheme alone is not enough: "C:\\dir\\file" and
    # "name:with:colon" both parse with a scheme.  Insist on the "://"
    # separator directly after it.
    end = len(scheme)
    return end > 0 and file_ref[end:end + 3] == '://'


def autoopen(file_ref, mode='r'):
    """Turn file_ref into a stream, guessing how it needs to be opened.

    The checks are applied in this order:

    * an object that already passes isfilelike for mode is returned as is
      (this covers real file objects as well as file-like ones);
    * a ``scheme://`` string is opened with urllib2.urlopen -- note that
      mode is not applied to URLs;
    * a name ending in ".gz" is opened with gzip.open(file_ref, mode);
    * a name ending in ".bz2" is opened with bz2.BZ2File(file_ref, mode);
    * anything else is handed to the builtin open(file_ref, mode).
    """
    if isfilelike(file_ref, mode):
        return file_ref
    if isurllike(file_ref, mode):
        return urllib2.urlopen(file_ref)

    extension = os.path.splitext(file_ref)[1]
    if extension == '.gz':
        return gzip.open(file_ref, mode)
    if extension == '.bz2':
        return bz2.BZ2File(file_ref, mode)
    return open(file_ref, mode)