-import sys
-
-from htsworkflow.util import fctracker
-
def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Glob for the eland result files belonging to one lane of a flowcell.

    The search pattern is
    <flowcell_dir>/<flowcell_id>/*/s_<lane>_eland_*

    Returns the list of matching path names as produced by glob(); the
    list is empty when nothing matches.
    """
    # Wrap lane in a 1-tuple so the format works even if a tuple is passed.
    lane_name = "s_%s_eland_*" % (lane,)
    pattern = os.path.join(flowcell_dir, flowcell_id, "*", lane_name)
    return glob(pattern)
-
def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    Make a name from the eland result file name.

    When lane_pathname begins with flowcell_dir, that prefix is removed and
    every path separator in the remainder is replaced with "_".

    Returns the flattened name, or None when lane_pathname does not start
    with flowcell_dir.
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None
    subpath = lane_pathname[len(flowcell_dir):]
    # A leading separator in subpath becomes a leading "_" in the result.
    return subpath.replace(os.path.sep, "_")
+import re
+import shelve
+import urllib
+import urllib2
+import urlparse
+
# Raw strings keep the regex escapes (\d, \.) out of Python's own string
# escape processing.

# Eland result file prefix: s_<lane digit>[_<read digit>]_eland_
eland_re = re.compile(r's_(?P<lane>\d)(?P<read>_\d)?_eland_')
# Common archive prefix: woldlab_<6 digits>_<field>_<digits>_<digits/capitals>
raw_seq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
# Qseq tarball: the prefix above followed by _l<digit>_r<digit>.tar.bz2
# (dots escaped so they only match a literal ".").
qseq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d]\.tar\.bz2')
+
class SequenceFile(object):
    """
    Record describing one sequence-related file found on disk.

    Attributes are stored exactly as passed to the constructor:
    filetype, path, flowcell, lane, read, pf and cycle.

    NOTE(review): __hash__ is derived from key() (flowcell, lane) but no
    __eq__ is defined here, so equality is still object identity; two
    distinct instances with the same key hash alike yet compare unequal.
    """
    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
        self.filetype = filetype
        self.path = path
        self.flowcell = flowcell
        self.lane = lane
        self.read = read
        self.pf = pf
        self.cycle = cycle

    def __hash__(self):
        return hash(self.key())

    def key(self):
        """Return the (flowcell, lane) tuple used for hashing."""
        return (self.flowcell, self.lane)

    def __unicode__(self):
        """Return the file path as a unicode string (Python 2 protocol hook)."""
        return unicode(self.path)

    def unicode(self):
        """Backward-compatible alias for __unicode__()."""
        return self.__unicode__()

    def __repr__(self):
        return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)

    def make_target_name(self, root):
        """
        Create the target name for where we need to link this sequence to.

        Returns root joined with the file's base name; for 'eland' files the
        base name is first prefixed with "<flowcell>_<cycle>_".
        """
        basename = os.path.basename(self.path)
        # Because the names aren't unique we include the flowcell name;
        # because there were different eland files for different length
        # analyses, we include the cycle length in the name.
        if self.filetype == 'eland':
            template = "%(flowcell)s_%(cycle)s_%(eland)s"
            basename = template % {
                'flowcell': self.flowcell,
                'cycle': self.cycle,
                'eland': basename,
            }
        # else:
        # all the other file types have names that include flowcell/lane
        # information and thus are unique so we don't have to do anything
        return os.path.join(root, basename)
+
def parse_srf(path, filename):
    """
    Build an 'srf' SequenceFile from a directory and a file name.

    The file name (minus its last extension) is split on "_"; field 4 is
    taken as the flowcell and the first character of field 5 as the lane
    number.

    Raises IndexError when the name has fewer than six "_"-separated
    fields and ValueError when the lane character is not a digit.
    """
    basename = os.path.splitext(filename)[0]
    records = basename.split('_')
    flowcell = records[4]
    # Only the first character is used, so lanes are single digit here.
    lane = int(records[5][0])
    fullpath = os.path.join(path, filename)
    return SequenceFile('srf', fullpath, flowcell, lane)
+
def parse_qseq(path, filename):
    """
    Build a 'qseq' SequenceFile from a directory and a file name.

    The file name (minus its last extension) is split on "_"; field 4 is
    taken as the flowcell, the second character of field 5 as the lane and
    the second character of field 6 as the read number (i.e. fields shaped
    like "l7" and "r1").

    Raises IndexError when the name has too few fields or a field is
    shorter than two characters, and ValueError when the selected
    character is not a digit.
    """
    # splitext strips only the final extension: for a name ending in
    # "_r1.tar.bz2" the last field is "r1.tar", and index 1 is still the digit.
    basename = os.path.splitext(filename)[0]
    records = basename.split('_')
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])
    fullpath = os.path.join(path, filename)
    return SequenceFile('qseq', fullpath, flowcell, lane, read)
+
+def parse_fastq(path, filename):
+ basename, ext = os.path.splitext(filename)
+ records = basename.split('_')
+ fullpath = os.path.join(path, filename)
+ flowcell = records[4]
+ lane = int(records[5][1])
+ read = int(records[6][1])
+ if records[-1] == 'pass':
+ pf = True