update_submission_detail(model, subUrn, status, last_mod,
cookie=cookie)
- logging.info("Processed {0}".format(subUrn))
+ LOGGER.info("Processed {0}".format(subUrn))
if len(status_nodes) == 0:
# has no status node, add one
- logging.info("Adding status node to {0}".format(subUrn))
+ LOGGER.info("Adding status node to {0}".format(subUrn))
status_node = create_status_node(subUrn, recent_update)
add_stmt(model, subUrn, HasStatusN, status_node)
add_stmt(model, status_node, rdfsNS['type'], StatusN)
update_ddf(model, subUrn, status_node, cookie=cookie)
update_daf(model, subUrn, status_node, cookie=cookie)
else:
- logging.info("Found {0} status blanks".format(len(status_nodes)))
+ LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
for status_statement in status_nodes:
status_node = status_statement.object
last_modified_query = RDF.Statement(status_node,
status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
if not model.contains_statement(status_is_daf):
- logging.info('Adding daf to {0}, {1}'.format(submission_url,
+ LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
status_node))
daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
daf.fromstring_into_model(model, status_node, daf_text)
status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
if not model.contains_statement(status_is_ddf):
- logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
+ LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
add_ddf_statements(model, statusNode, ddf_text)
model.add_statement(status_is_ddf)
elif len(results) == 1:
pass # Assuming that a loaded dataset has one record
else:
- logging.warning("Many dates for {0}".format(libraryUrn))
+ LOGGER.warning("Many dates for {0}".format(libraryUrn))
def get_library_id(name):
'POST',
headers=headers,
body=urllib.urlencode(credentials))
- logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
+ LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
response['status']))
cookie = response.get('set-cookie', None)
"""
for lib_id, lib_path in library_result_map:
if not os.path.exists(lib_path):
- logging.info("Making dir {0}".format(lib_path))
+ logger.info("Making dir {0}".format(lib_path))
os.mkdir(lib_path)
source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
if os.path.exists(source_lib_dir):
raise IOError("{0} does not exist".format(source_pathname))
if not os.path.exists(target_pathname):
os.symlink(source_pathname, target_pathname)
- logging.info(
+ LOGGER.info(
'LINK {0} to {1}'.format(source_pathname, target_pathname))
"""Look through our submission directories and collect needed information
"""
for lib_id, result_dir in library_result_map:
- logging.info("Importing %s from %s" % (lib_id, result_dir))
+ logger.info("Importing %s from %s" % (lib_id, result_dir))
try:
view_map.import_submission_dir(result_dir, lib_id)
except MetadataLookupException, e:
- logging.error("Skipping %s: %s" % (lib_id, str(e)))
+ logger.error("Skipping %s: %s" % (lib_id, str(e)))
def make_all_ddfs(view_map, library_result_map, daf_name, make_condor=True, force=False):
dag_fragment = []
if make_condor and len(dag_fragment) > 0:
dag_filename = 'submission.dagman'
if not force and os.path.exists(dag_filename):
- logging.warn("%s exists, please delete" % (dag_filename,))
+ logger.warn("%s exists, please delete" % (dag_filename,))
else:
f = open(dag_filename,'w')
f.write( os.linesep.join(dag_fragment))
name = fromTypedNode(view_map.model.get_target(submissionNode, submissionOntology['name']))
if name is None:
- logging.error("Need name for %s" % (str(submissionNode)))
+ logger.error("Need name for %s" % (str(submissionNode)))
return []
ddf_name = name + '.ddf'
for variable_name in variables:
value = str(fromTypedNode(row[variable_name]))
if value is None or value == 'None':
- logging.warn("{0}: {1} was None".format(outfile, variable_name))
+ logger.warn("{0}: {1} was None".format(outfile, variable_name))
if variable_name in ('files', 'md5sum'):
current.setdefault(variable_name,[]).append(value)
else:
output.write(os.linesep)
all_files.extend(all_views[view]['files'])
- logging.info(
+ LOGGER.info(
"Examined {0}, found files: {1}".format(
str(submissionNode), ", ".join(all_files)))
from benderjab import rpc
from htsworkflow.automation.solexa import is_runfolder
-
+
+LOGGER = logging.getLogger(__name__)
+
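+# A minimal sketch of the module-logger pattern this patch applies
+# throughout (illustrative, not part of the original change): naming the
+# logger after the module lets the application entry point configure the
+# root handler once and then tune verbosity per module.
+#
+#   import logging
+#   LOGGER = logging.getLogger(__name__)
+#
+#   def main():
+#       logging.basicConfig(level=logging.INFO)  # configure once, at entry
+#       LOGGER.info("using logger %s", __name__)
+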
class rsync(object):
def __init__(self, sources, dest, pwfile):
self.cmd = ['/usr/bin/rsync', ]
"""
Get a directory listing for all our sources
"""
- logging.debug("searching for entries in: %s" % (self.source_base_list,))
+ LOGGER.debug("searching for entries in: %s" % (self.source_base_list,))
entries = []
for source in self.source_base_list:
- logging.debug("Scanning %s" % (source,))
+ LOGGER.debug("Scanning %s" % (source,))
args = copy.copy(self.cmd)
args.append(source)
- logging.debug("Rsync cmd:" + " ".join(args))
+ LOGGER.debug("Rsync cmd:" + " ".join(args))
short_process = subprocess.Popen(args, stdout=subprocess.PIPE)
exit_code = short_process.wait()
stdout = short_process.stdout
# We made sure source ends in a / earlier
cur_list = [ source+subdir for subdir in self.list_filter(stdout)]
entries.extend(cur_list)
- logging.debug(u"Found the following: %s" % (unicode(entries)))
+ LOGGER.debug(u"Found the following: %s" % (unicode(entries)))
return entries
def list_filter(self, lines):
"""
dirs_to_copy = []
direntries = [ x[0:42].split() + [x[43:-1]] for x in lines ]
- logging.debug(u'direntries: %s' % (unicode(direntries),))
+ LOGGER.debug(u'direntries: %s' % (unicode(direntries),))
for permissions, size, filedate, filetime, filename in direntries:
if permissions[0] == 'd':
- # hey its a directory, the first step to being something we want to
+ # hey, it's a directory, the first step to being something we want to
# copy
if re.match("[0-9]{6}", filename):
# it starts with something that looks like a 6 digit date
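+                # Note on the parsing above (rsync --list-only format is an
+                # assumption): a listing line such as
+                #   drwxrwxr-x        4096 2009/02/13 15:22:31 090213_run
+                # is split so x[0:42].split() yields permissions, size, date
+                # and time, and x[43:-1] keeps the trailing directory name.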
args = copy.copy(self.cmd)
# args.append('--dry-run') # Makes testing easier
# we want to copy everything
- args.append('-rlt')
+ args.append('-rlt')
# from here
args.append(urlname)
# to here
args.append(self.dest_base)
- logging.debug("Rsync cmd:" + " ".join(args))
+ LOGGER.debug("Rsync cmd:" + " ".join(args))
return subprocess.Popen(args)
-
+
def copy(self, url_list=None):
"""
copy any interesting looking directories over
# clean up any lingering non-running processes
self.poll()
- if url_list is None or len(url_list) == 0:
+ if url_list is None or len(url_list) == 0:
# what's available to copy?
dirs_to_copy = self.list()
else:
dirs_to_copy = url_list
-
- logging.info("dirs to copy %s" % (dirs_to_copy,))
+
+ LOGGER.info("dirs to copy %s" % (dirs_to_copy,))
# lets start copying
started = []
for d in dirs_to_copy:
process = self.processes.get(d, None)
-
+
if process is None:
# we don't have a process, so make one
- logging.info("rsyncing %s" % (d))
+ LOGGER.info("rsyncing %s" % (d))
self.processes[d] = self.create_copy_process(d)
- started.append(d)
+ started.append(d)
return started
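+    # Usage sketch (argument values are assumptions; the constructor takes
+    # rsync-style source URLs, a destination path, and a password file):
+    #
+    #   copier = rsync(['rsync://seq0.example.org/runs/'],
+    #                  '/var/spool/runs', '/etc/rsync.passwd')
+    #   started = copier.copy()   # start copying anything new
+    #   copier.poll()             # reap any finished rsync processes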
def _normalize_rsync_source(self, source):
def poll(self):
"""
check currently running processes to see if they're done
-
+
return path roots that have finished.
"""
for dir_key, proc_value in self.processes.items():
# process hasn't finished yet
pass
elif retcode == 0:
- logging.info("finished rsyncing %s, exitcode %d" %( dir_key, retcode))
+ LOGGER.info("finished rsyncing %s, exitcode %d" %( dir_key, retcode))
del self.processes[dir_key]
else:
- logging.error("rsync failed for %s, exit code %d" % (dir_key, retcode))
-
+ LOGGER.error("rsync failed for %s, exit code %d" % (dir_key, retcode))
+
def __len__(self):
"""
Return how many active rsync processes we currently have
-
+
Call poll first to close finished processes.
"""
return len(self.processes)
-
+
def keys(self):
"""
Return list of current run folder names
def __init__(self, section=None, configfile=None):
#if configfile is None:
# configfile = '~/.htsworkflow'
-
+
super(CopierBot, self).__init__(section, configfile)
-
+
# options for rsync command
self.cfg['rsync_password_file'] = None
self.cfg['rsync_sources'] = None
- self.cfg['rsync_destination'] = None
-
- # options for reporting we're done
+ self.cfg['rsync_destination'] = None
+
+ # options for reporting we're done
self.cfg['notify_users'] = None
self.cfg['notify_runner'] = None
-
+
self.pending = []
self.rsync = None
self.notify_users = None
self.notify_runner = None
-
+
self.register_function(self.startCopy)
self.register_function(self.sequencingFinished)
self.eventTasks.append(self.update)
-
+
def _init_rsync(self):
"""
Initalize rsync class
This is only accessible for test purposes.
"""
- # we can't call any logging function until after start finishes.
+ # we can't call any logging functions until after start finishes.
# this got moved to a seperate function from run to help with test code
if self.rsync is None:
self.rsync = rsync(self.sources, self.destination, self.password)
read the config file
"""
super(CopierBot, self).read_config(section, configfile)
-
+
self.sources = shlex.split(self._check_required_option('rsync_sources'))
self.password = self._check_required_option('rsync_password_file')
self.destination = self._check_required_option('rsync_destination')
-
+
self.notify_users = self._parse_user_list(self.cfg['notify_users'])
try:
self.notify_runner = \
start our copy
"""
# Note, args comes in over the network, so don't trust it.
- logging.debug("Arguments to startCopy %s" % (unicode(args),))
+ LOGGER.debug("Arguments to startCopy %s" % (unicode(args),))
copy_urls = []
for a in args:
clean_url = self.validate_url(a)
if clean_url is not None:
copy_urls.append(clean_url)
- logging.info("Validated urls = %s" % (copy_urls,))
+ LOGGER.info("Validated urls = %s" % (copy_urls,))
started = self.rsync.copy(copy_urls)
- logging.info("copying:" + " ".join(started)+".")
+ LOGGER.info("copying:" + " ".join(started)+".")
return started
-
+
def sequencingFinished(self, runDir, *args):
"""
- The run was finished, if we're done copying, pass the message on
+ The run was finished; if we're done copying, pass the message on
"""
# close any open processes
self.rsync.poll()
-
+
# see if we're still copying
if is_runfolder(runDir):
- logging.info("recevied sequencing finshed for %s" % (runDir))
+ LOGGER.info("recevied sequencing finshed for %s" % (runDir))
self.pending.append(runDir)
self.startCopy()
return "PENDING"
else:
errmsg = "received bad runfolder name (%s)" % (runDir)
- logging.warning(errmsg)
+ LOGGER.warning(errmsg)
# maybe I should use a different error message
raise RuntimeError(errmsg)
-
+
def reportSequencingFinished(self, runDir):
"""
Send the sequencingFinished message to the interested parties
if self.notify_runner is not None:
for r in self.notify_runner:
self.rpc_send(r, (runDir,), 'sequencingFinished')
- logging.info("forwarding sequencingFinshed message for %s" % (runDir))
-
+ LOGGER.info("forwarding sequencingFinshed message for %s" % (runDir))
+
def update(self, *args):
"""
Update our current status.
if p not in self.rsync.keys():
self.reportSequencingFinished(p)
self.pending.remove(p)
-
+
def _parser(self, msg, who):
"""
Parse xmpp chat messages
help = u"I can [copy], or report current [status]"
if re.match(u"help", msg):
reply = help
- elif re.match("copy", msg):
+ elif re.match("copy", msg):
started = self.startCopy()
reply = u"started copying " + ", ".join(started)
elif re.match(u"status", msg):
def main(args=None):
bot = CopierBot()
bot.main(args)
-
+
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
from htsworkflow.pipelines.configure_run import *
+LOGGER = logging.getLogger(__name__)
+
#s_fc = re.compile('FC[0-9]+')
s_fc = re.compile('_[0-9a-zA-Z]*$')
junk, dirname = os.path.split(run_dir)
mo = s_fc.search(dirname)
if not mo:
- logging.error('RunDir 2 FlowCell error: %s' % (run_dir))
+ LOGGER.error('RunDir 2 FlowCell error: %s' % (run_dir))
return None
return dirname[mo.start()+1:]
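+# Worked example of the extraction above (run folder name assumed):
+# _get_flowcell_from_rundir('/srv/080514_HWI-EAS229_0029_20768AAXX')
+# matches the trailing '_20768AAXX' and returns '20768AAXX'.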
-
+
class Runner(rpc.XmlRpcBot):
"""
Manage running pipeline jobs.
- """
+ """
def __init__(self, section=None, configfile=None):
#if configfile is None:
# self.configfile = "~/.htsworkflow"
super(Runner, self).__init__(section, configfile)
-
+
self.cfg['notify_users'] = None
self.cfg['genome_dir'] = None
self.cfg['base_analysis_dir'] = None
self.cfg['notify_postanalysis'] = None
self.conf_info_dict = {}
-
+
self.register_function(self.sequencingFinished)
#self.eventTasks.append(self.update)
-
+
def read_config(self, section=None, configfile=None):
super(Runner, self).read_config(section, configfile)
self.notify_users = self._parse_user_list(self.cfg['notify_users'])
#FIXME: process notify_postpipeline cfg
-
-
+
+
def _parser(self, msg, who):
"""
Parse xmpp chat messages
else:
reply = u"I didn't understand '%s'" %(msg)
- logging.debug("reply: " + str(reply))
+ LOGGER.debug("reply: " + str(reply))
return reply
def getStatusReport(self, fc_num):
"""
- Returns text status report for flow cell number
+ Returns text status report for flow cell number
"""
if fc_num not in self.conf_info_dict:
return "No record of a %s run." % (fc_num)
output = status.statusReport()
return '\n'.join(output)
-
-
+
+
def sequencingFinished(self, run_dir):
"""
Sequenceing (and copying) is finished, time to start pipeline
"""
- logging.debug("received sequencing finished message")
+ LOGGER.debug("received sequencing finished message")
# Setup config info object
ci = ConfigInfo()
ci.base_analysis_dir = self.base_analysis_dir
- ci.analysis_dir = os.path.join(self.base_analysis_dir, run_dir)
+ ci.analysis_dir = os.path.join(self.base_analysis_dir, run_dir)
# get flowcell from run_dir name
flowcell = _get_flowcell_from_rundir(run_dir)
# Launch the job in it's own thread and turn.
self.launchJob(run_dir, flowcell, ci)
return "started"
-
-
+
+
def pipelineFinished(self, run_dir):
# need to strip off self.watch_dir from rundir I suspect.
- logging.info("pipeline finished in" + str(run_dir))
+ LOGGER.info("pipeline finished in" + str(run_dir))
#pattern = self.watch_dir
#if pattern[-1] != os.path.sep:
# pattern += os.path.sep
#stripped_run_dir = re.sub(pattern, "", run_dir)
- #logging.debug("stripped to " + stripped_run_dir)
+ #LOGGER.debug("stripped to " + stripped_run_dir)
# Notify each user that the run has finished.
if self.notify_users is not None:
for u in self.notify_users:
self.send(u, 'Pipeline run %s finished' % (run_dir))
-
+
#if self.notify_runner is not None:
# for r in self.notify_runner:
# self.rpc_send(r, (stripped_run_dir,), 'sequencingFinished')
cfg_filepath,
self.genome_dir)
if status_retrieve_cfg:
- logging.info("Runner: Retrieve config: success")
+ LOGGER.info("Runner: Retrieve config: success")
self.reportMsg("Retrieve config (%s): success" % (run_dir))
else:
- logging.error("Runner: Retrieve config: failed")
+ LOGGER.error("Runner: Retrieve config: failed")
self.reportMsg("Retrieve config (%s): FAILED" % (run_dir))
-
+
# configure step
if status_retrieve_cfg:
status = configure(conf_info)
if status:
- logging.info("Runner: Configure: success")
+ LOGGER.info("Runner: Configure: success")
self.reportMsg("Configure (%s): success" % (run_dir))
self.reportMsg(
os.linesep.join(glob(os.path.join(run_dir,'Data','C*')))
)
else:
- logging.error("Runner: Configure: failed")
+ LOGGER.error("Runner: Configure: failed")
self.reportMsg("Configure (%s): FAILED" % (run_dir))
#if successful, continue
if status:
# Setup status cmdline status monitor
#startCmdLineStatusMonitor(ci)
-
+
# running step
print 'Running pipeline now!'
run_status = run_pipeline(conf_info)
if run_status is True:
- logging.info('Runner: Pipeline: success')
+ LOGGER.info('Runner: Pipeline: success')
self.reportMsg("Pipeline run (%s): Finished" % (run_dir,))
else:
- logging.info('Runner: Pipeline: failed')
+ LOGGER.info('Runner: Pipeline: failed')
self.reportMsg("Pipeline run (%s): FAILED" % (run_dir))
args=[run_dir, flowcell, conf_info])
t.setDaemon(True)
t.start()
-
-
+
+
def main(args=None):
bot = Runner()
return bot.main(args)
-
+
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
-
+
from benderjab import rpc
+LOGGER = logging.getLogger(__name__)
+
class WatcherEvent(object):
"""
Track information about a file event
self.time = time.time()
self.event_root = event_root
self.complete = False
-
+
def __unicode__(self):
if self.complete:
complete = "(completed)"
if not is_runfolder(target):
self.log.debug("Skipping %s, not a runfolder" % (target,))
continue
-
+
# grab the previous events for this watch path
watch_path_events = self.last_event.setdefault(watch_path, {})
class SpoolWatcher(rpc.XmlRpcBot):
"""
Watch a directory and send a message when another process is done writing.
-
+
This monitors a directory tree using inotify (linux specific) and
after some files having been written will send a message after <timeout>
seconds of no file writing.
-
+
(Basically when the solexa machine finishes dumping a round of data
this'll hopefully send out a message saying hey look theres data available
-
+
"""
# these params need to be in the config file
# I wonder where I should put the documentation
# `notify_timeout` - how often to timeout from notify
# `completion_files` - what files indicates we've finished sequencing
# defaults to: netcopy_complete.txt
-
+
def __init__(self, section=None, configfile=None):
#if configfile is None:
# self.configfile = "~/.htsworkflow"
super(SpoolWatcher, self).__init__(section, configfile)
-
+
self.cfg['watchdirs'] = None
self.cfg['write_timeout'] = 10
self.cfg['notify_users'] = None
self.cfg['notify_runner'] = None
self.cfg['completion_files'] = 'ImageAnalysis_Netcopy_complete_READ2.txt ImageAnalysis_Netcopy_complete_SINGLEREAD.txt'
-
+
self.watchdirs = []
self.watchdir_url_map = {}
self.notify_timeout = 0.001
- self.wm = None
+ self.wm = None
self.notify_users = None
self.notify_runner = None
self.wdds = []
# keep track of which mount points tie to which watch directories
# so maybe we can remount them.
self.mounts_to_watches = {}
-
+
self.eventTasks.append(self.process_notify)
def read_config(self, section=None, configfile=None):
- # Don't give in to the temptation to use logging functions here,
+ # Don't give in to the temptation to use logging functions here,
# need to wait until after we detach in start
super(SpoolWatcher, self).read_config(section, configfile)
-
+
self.watchdirs = shlex.split(self._check_required_option('watchdirs'))
# see if there's an alternate url that should be used for the watchdir
for watchdir in self.watchdirs:
self.write_timeout = int(self.cfg['write_timeout'])
self.completion_files = shlex.split(self.cfg['completion_files'])
-
+
self.notify_users = self._parse_user_list(self.cfg['notify_users'])
try:
self.notify_runner = \
self.wdds.append(self.wm.add_watch(w, mask, rec=True, auto_add=True))
def unmount_watch(self, event_path):
- # remove backwards so we don't get weirdness from
+ # remove backwards so we don't get weirdness from
# the list getting shorter
for i in range(len(self.wdds),0, -1):
wdd = self.wdds[i]
copy_url = root_copy_url + list_event_dir
self.log.debug('Copy url: %s' % (copy_url,))
return copy_url
-
+
def process_notify(self, *args):
if self.notifier is None:
# nothing to do yet
for last_event_dir, last_event_detail in last_events.items():
time_delta = time.time() - last_event_detail.time
if time_delta > self.write_timeout:
- print "timeout", unicode(last_event_detail)
+ LOGGER.info("timeout: %s" % (unicode(last_event_detail),))
copy_url = self.make_copy_url(watchdir, last_event_dir)
self.startCopy(copy_url)
if last_event_detail.complete:
help = u"I can send [copy] message, or squencer [finished]"
if re.match(u"help", msg):
reply = help
- elif re.match("copy", msg):
+ elif re.match("copy", msg):
self.startCopy(msg)
reply = u"sent copy message"
elif re.match(u"finished", msg):
else:
reply = u"need runfolder name"
else:
- reply = u"I didn't understand '%s'" %(msg)
+ reply = u"I didn't understand '%s'" %(msg)
return reply
-
+
def run(self):
"""
Start application
# after it's initialized.
self.add_watch()
super(SpoolWatcher, self).run()
-
+
def stop(self):
"""
shutdown application
if self.notifier is not None:
self.notifier.stop()
super(SpoolWatcher, self).stop()
-
+
def startCopy(self, copy_url=None):
self.log.debug("writes seem to have stopped")
if self.notify_runner is not None:
if self.notify_users is not None:
for u in self.notify_users:
self.send(u, 'startCopy %s.' % (copy_url,))
-
+
def sequencingFinished(self, run_dir):
# need to strip off self.watchdirs from rundir I suspect.
self.log.info("run.completed in " + str(run_dir))
for watch in self.watchdirs:
if not run_dir.startswith(watch):
- print "%s didn't start with %s" % (run_dir, watch)
+ LOGGER.info("%s didn't start with %s" % (run_dir, watch))
continue
if watch[-1] != os.path.sep:
watch += os.path.sep
def main(args=None):
bot = SpoolWatcher()
return bot.main(args)
-
+
if __name__ == "__main__":
ret = main(sys.argv[1:])
#sys.exit(ret)
from htsworkflow.frontend.experiments.models import FlowCell
from htsworkflow.frontend.bcmagic.models import Printer
+LOGGER = logging.getLogger(__name__)
try:
import uuid
except ImportError, e:
# Some systems are using python 2.4, which doesn't have uuid
# this is a stub
- logging.warning('Real uuid is not available, initializing fake uuid module')
+ LOGGER.warning('Real uuid is not available, initializing fake uuid module')
class uuid:
def uuid1(self):
self.hex = None
"""
if instance.default:
other_defaults = PrinterTemplate.objects.filter(default=True)
-
+
for other in other_defaults:
other.default = False
other.save()
-
+
class Vendor(models.Model):
name = models.CharField(max_length=256)
class Location(models.Model):
-
+
name = models.CharField(max_length=256, unique=True)
location_description = models.TextField()
-
+
uuid = models.CharField(max_length=32, blank=True, help_text="Leave blank for automatic UUID generation", editable=False)
-
+
notes = models.TextField(blank=True, null=True)
-
+
def __unicode__(self):
if len(self.location_description) > 16:
return u"%s: %s" % (self.name, self.location_description[0:16]+u"...")
model_id = models.CharField(max_length=256, blank=True, null=True)
part_number = models.CharField(max_length=256, blank=True, null=True)
lot_number = models.CharField(max_length=256, blank=True, null=True)
-
+
url = models.URLField(blank=True, null=True)
-
+
qty_purchased = models.IntegerField(default=1)
-
+
vendor = models.ForeignKey(Vendor)
purchase_date = models.DateField(blank=True, null=True)
warranty_months = models.IntegerField(blank=True, null=True)
-
+
notes = models.TextField(blank=True, null=True)
-
+
def __unicode__(self):
name = u''
if self.model_id:
name += u"part:%s " % (self.part_number)
if self.lot_number:
name += u"lot:%s " % (self.lot_number)
-
+
return u"%s: %s" % (name, self.purchase_date)
-
+
class Meta:
verbose_name_plural = "Item Info"
class ItemType(models.Model):
-
+
name = models.CharField(max_length=64, unique=True)
description = models.TextField(blank=True, null=True)
-
+
def __unicode__(self):
return u"%s" % (self.name)
class ItemStatus(models.Model):
name = models.CharField(max_length=64, unique=True)
notes = models.TextField(blank=True, null=True)
-
+
def __unicode__(self):
return self.name
-
+
class Meta:
verbose_name_plural = "Item Status"
class Item(models.Model):
-
+
item_type = models.ForeignKey(ItemType)
-
+
#Automatically assigned uuid; used for barcode if one is not provided in
# barcode_id
uuid = models.CharField(max_length=32, blank=True, help_text="Leave blank for automatic UUID generation", unique=True, editable=False)
-
+
# field for existing barcodes; used instead of uuid if provided
barcode_id = models.CharField(max_length=256, blank=True, null=True)
force_use_uuid = models.BooleanField(default=False)
-
+
item_info = models.ForeignKey(ItemInfo)
-
+
location = models.ForeignKey(Location)
-
+
status = models.ForeignKey(ItemStatus, blank=True, null=True)
-
+
creation_date = models.DateTimeField(auto_now_add=True)
modified_date = models.DateTimeField(auto_now=True)
-
+
notes = models.TextField(blank=True, null=True)
-
+
def __unicode__(self):
if self.barcode_id is None or len(self.barcode_id) == 0:
return u"invu|%s" % (self.uuid)
else:
return u"invb|%s" % (self.barcode_id)
-
+
def get_absolute_url(self):
return '/inventory/%s/' % (self.uuid)
-
+
pre_save.connect(_assign_uuid, sender=Item)
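+# _assign_uuid is defined elsewhere in this module; a hypothetical sketch
+# of the shape such a pre_save receiver takes:
+#
+#   def _assign_uuid(sender, instance, **kwargs):
+#       # fill in the auto-generated uuid when none was supplied
+#       if not instance.uuid:
+#           instance.uuid = uuid.uuid1().hex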
"""
item_type = models.ForeignKey(ItemType)
printer = models.ForeignKey(Printer)
-
+
default = models.BooleanField()
-
+
template = models.TextField()
-
+
def __unicode__(self):
if self.default:
return u'%s %s' % (self.item_type.name, self.printer.name)
class LongTermStorage(models.Model):
-
+
flowcell = models.ForeignKey(FlowCell)
libraries = models.ManyToManyField(Library)
storage_devices = models.ManyToManyField(Item)
-
+
creation_date = models.DateTimeField(auto_now_add=True)
modified_date = models.DateTimeField(auto_now=True)
-
+
def __unicode__(self):
return u"%s: %s" % (str(self.flowcell), ', '.join([ str(s) for s in self.storage_devices.iterator() ]))
-
+
class Meta:
verbose_name_plural = "Long Term Storage"
-
+
class ReagentBase(models.Model):
-
+
reagent = models.ManyToManyField(Item)
-
+
creation_date = models.DateTimeField(auto_now_add=True)
modified_date = models.DateTimeField(auto_now=True)
-
+
class Meta:
abstract = True
Links reagents and flowcells
"""
flowcell = models.ForeignKey(FlowCell)
-
+
def __unicode__(self):
return u"%s: %s" % (str(self.flowcell), ', '.join([ str(s) for s in self.reagent.iterator() ]))
-
+
class ReagentLibrary(ReagentBase):
"""
Links libraries and flowcells
"""
library = models.ForeignKey(Library)
-
+
def __unicode__(self):
return u"%s: %s" % (str(self.library), ', '.join([ str(s) for s in self.reagent.iterator() ]))
'bcmagic': BarcodeMagicForm()
}
+LOGGER = logging.getLogger(__name__)
+
def count_lanes(lane_set):
single = 0
paired = 1
#except Exception, e:
# summary_list.append("Summary report needs to be updated.")
- # logging.error("Exception: " + str(e))
+ # LOGGER.error("Exception: " + str(e))
return (summary_list, err_list)
"""
Extract configuration from Illumina Bustard Directory.
-This includes the version number, run date, bustard executable parameters, and
-phasing estimates.
+This includes the version number, run date, bustard executable parameters, and
+phasing estimates.
"""
from copy import copy
from datetime import date
VERSION_RE, \
EUROPEAN_STRPTIME
+LOGGER = logging.getLogger(__name__)
+
# make epydoc happy
__docformat__ = "restructuredtext en"
self.base['A'] = [ float(v) for v in data[:4] ]
self.base['C'] = [ float(v) for v in data[4:8] ]
self.base['G'] = [ float(v) for v in data[8:12] ]
- self.base['T'] = [ float(v) for v in data[12:16] ]
+ self.base['T'] = [ float(v) for v in data[12:16] ]
else:
raise RuntimeError("matrix file %s is unusual" % (pathname,))
def get_elements(self):
auto_lane_fragment = ""
else:
auto_lane_fragment = "_%d" % ( matrix_auto_lane,)
-
+
for matrix_name in ['s%s_02_matrix.txt' % (auto_lane_fragment,),
's%s_1_matrix.txt' % (auto_lane_fragment,),
]:
ElementTree.dump(self.get_elements())
def get_elements(self):
- root = ElementTree.Element('Bustard',
+ root = ElementTree.Element('Bustard',
{'version': str(Bustard.XML_VERSION)})
version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION)
version.text = self.version
# add crosstalk matrix if it exists
if self.crosstalk is not None:
root.append(self.crosstalk.get_elements())
-
+
# add bustard config if it exists
if self.bustard_config is not None:
root.append(self.bustard_config)
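+        # These classes round-trip through ElementTree: get_elements()
+        # serializes state under an XML_VERSION attribute and
+        # set_elements() checks it on the way back in, as the test at the
+        # bottom of this module exercises:
+        #
+        #   bustard_object2 = Bustard(xml=bustard_object.get_elements())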
raise ValueError('Expected "Bustard" SubElements')
xml_version = int(tree.attrib.get('version', 0))
if xml_version > Bustard.XML_VERSION:
- logging.warn('Bustard XML tree is a higher version than this class')
+ LOGGER.warn('Bustard XML tree is a higher version than this class')
for element in list(tree):
if element.tag == Bustard.SOFTWARE_VERSION:
self.version = element.text
self.bustard_config = element
else:
raise ValueError("Unrecognized tag: %s" % (element.tag,))
-
+
def bustard(pathname):
"""
Construct a Bustard object by analyzing an Illumina Bustard directory.
for bustard_dir in args:
print u'analyzing bustard directory: ' + unicode(bustard_dir)
bustard_object = bustard(bustard_dir)
- bustard_object.dump()
+ bustard_object.dump()
bustard_object2 = Bustard(xml=bustard_object.get_elements())
print ('-------------------------------------')
from pyinotify import WatchManager, ThreadedNotifier
from pyinotify import EventsCodes, ProcessEvent
+LOGGER = logging.getLogger(__name__)
+
class ConfigInfo:
-
+
def __init__(self):
#run_path = firecrest analysis directory to run analysis from
self.run_path = None
self._ci = conf_info
ProcessEvent.__init__(self)
-
+
def process_IN_CREATE(self, event):
fullpath = os.path.join(event.path, event.name)
if s_finished.search(fullpath):
- logging.info("File Found: %s" % (fullpath))
+ LOGGER.info("File Found: %s" % (fullpath))
if s_firecrest_finished.search(fullpath):
self.run_status_dict['firecrest'] = True
self._ci.status.updateBustard(event.name)
elif s_firecrest_all.search(fullpath):
self._ci.status.updateFirecrest(event.name)
-
+
#print "Create: %s" % (os.path.join(event.path, event.name))
def process_IN_DELETE(self, event):
# Detect invalid command-line arguments
elif s_invalid_cmdline.search(line):
- logging.error("Invalid commandline options!")
+ LOGGER.error("Invalid commandline options!")
# Detect starting of configuration
elif s_start.search(line):
- logging.info('START: Configuring pipeline')
+ LOGGER.info('START: Configuring pipeline')
# Detect it made it past invalid arguments
elif s_gerald.search(line):
- logging.info('Running make now')
+ LOGGER.info('Running make now')
# Detect that make files have been generated (based on output)
elif s_generating.search(line):
- logging.info('Make files generted')
+ LOGGER.info('Make files generated')
return True
# Capture run directory
conf_info.bustard_path = firecrest_bustard
conf_info.run_path = firecrest
-
+
#Standard output handling
else:
print 'Sequence line:', line
# Log all other output for debugging purposes
else:
- logging.warning('CONF:?: %s' % (line))
+ LOGGER.warning('CONF:?: %s' % (line))
return False
# Detect invalid species directory error
if s_species_dir_err.search(line):
- logging.error(line)
+ LOGGER.error(line)
return RUN_ABORT
# Detect goat_pipeline.py traceback
elif s_goat_traceb.search(line):
- logging.error("Goat config script died, traceback in debug output")
+ LOGGER.error("Goat config script died, traceback in debug output")
return RUN_ABORT
# Detect indication of successful configuration (from stderr; odd, but ok)
elif s_stderr_taskcomplete.search(line):
- logging.info('Configure step successful (from: stderr)')
+ LOGGER.info('Configure step successful (from: stderr)')
return True
# Detect missing cycles
elif s_missing_cycles.search(line):
# Only display error once
if not SUPPRESS_MISSING_CYCLES:
- logging.error("Missing cycles detected; Not all cycles copied?")
- logging.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line))
+ LOGGER.error("Missing cycles detected; Not all cycles copied?")
+ LOGGER.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line))
SUPPRESS_MISSING_CYCLES = True
return RUN_ABORT
-
+
# Log all other output as debug output
else:
- logging.debug('CONF:STDERR:?: %s' % (line))
+ LOGGER.debug('CONF:STDERR:?: %s' % (line))
# Neutral (not failure; nor success)
return False
if pl_stderr_ignore(line):
pass
elif s_make_error.search(line):
- logging.error("make error detected; run failed")
+ LOGGER.error("make error detected; run failed")
return RUN_FAILED
elif s_no_gnuplot.search(line):
- logging.error("gnuplot not found")
+ LOGGER.error("gnuplot not found")
return RUN_FAILED
elif s_no_convert.search(line):
- logging.error("imagemagick's convert command not found")
+ LOGGER.error("imagemagick's convert command not found")
return RUN_FAILED
elif s_no_ghostscript.search(line):
- logging.error("ghostscript not found")
+ LOGGER.error("ghostscript not found")
return RUN_FAILED
else:
- logging.debug('PIPE:STDERR:?: %s' % (line))
+ LOGGER.debug('PIPE:STDERR:?: %s' % (line))
return False
options = getCombinedOptions()
if options.url is None:
- logging.error("%s or %s missing base_host_url option" % \
+ LOGGER.error("%s or %s missing base_host_url option" % \
(CONFIG_USER, CONFIG_SYSTEM))
return False
saveConfigFile(flowcell, options.url, cfg_filepath)
conf_info.config_filepath = cfg_filepath
except FlowCellNotFound, e:
- logging.error(e)
+ LOGGER.error(e)
return False
except WebError404, e:
- logging.error(e)
+ LOGGER.error(e)
return False
except IOError, e:
- logging.error(e)
+ LOGGER.error(e)
return False
except Exception, e:
- logging.error(e)
+ LOGGER.error(e)
return False
f = open(cfg_filepath, 'r')
genome_dict = getAvailableGenomes(genome_dir)
mapper_dict = constructMapperDict(genome_dict)
- logging.debug(data)
+ LOGGER.debug(data)
f = open(cfg_filepath, 'w')
f.write(data % (mapper_dict))
f.close()
-
+
return True
-
+
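+# Note, inferred from the write above: constructMapperDict emulates a
+# dictionary so that `data % (mapper_dict)` can resolve
+# %(species|build)s style placeholders in the fetched config text via
+# __getitem__, e.g. (genome name assumed):
+#
+#   'ELAND_GENOME %(Mus musculus|mm9)s' % mapper_dict
+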
def configure(conf_info):
fout = open(stdout_filepath, 'w')
ferr = open(stderr_filepath, 'w')
-
+
pipe = subprocess.Popen(['goat_pipeline.py',
'--GERALD=%s' % (conf_info.config_filepath),
'--make',
# Clean up
fout.close()
ferr.close()
-
-
+
+
##################
# Process stdout
fout = open(stdout_filepath, 'r')
-
+
stdout_line = fout.readline()
complete = False
#error_code = pipe.wait()
if error_code:
- logging.error('Recieved error_code: %s' % (error_code))
+ LOGGER.error('Received error_code: %s' % (error_code))
else:
- logging.info('We are go for launch!')
+ LOGGER.info('We are go for launch!')
#Process stderr
ferr = open(stderr_filepath, 'r')
# we didn't retrieve the path info, log it.
if status is True:
if conf_info.bustard_path is None or conf_info.run_path is None:
- logging.error("Failed to retrieve run_path")
+ LOGGER.error("Failed to retrieve run_path")
return False
-
+
return status
"""
# Fail if the run_path doesn't actually exist
if not os.path.exists(conf_info.run_path):
- logging.error('Run path does not exist: %s' \
+ LOGGER.error('Run path does not exist: %s' \
% (conf_info.run_path))
return False
wdd = wm.add_watch(conf_info.run_path, mask, rec=True)
# Log pipeline starting
- logging.info('STARTING PIPELINE @ %s' % (time.ctime()))
-
+ LOGGER.info('STARTING PIPELINE @ %s' % (time.ctime()))
+
# Start the pipeline (and hide!)
#pipe = subprocess.Popen(['make',
# '-j8',
from htsworkflow.util.ethelp import indent, flatten
from htsworkflow.util.opener import autoopen
+LOGGER = logging.getLogger(__name__)
+
SAMPLE_NAME = 'SampleName'
LANE_ID = 'LaneID'
END = 'End'
if os.stat(self.pathname)[stat.ST_SIZE] == 0:
raise RuntimeError("Eland isn't done, try again later.")
- logging.info("summarizing results for %s" % (self.pathname))
+ LOGGER.info("summarizing results for %s" % (self.pathname))
stream = autoopen(self.pathname, 'r')
if self.eland_type == ELAND_SINGLE:
elif tag == READS.lower():
self._reads = int(element.text)
else:
- logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
+ LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
class SequenceLane(ResultLane):
XML_VERSION=1
self._guess_sequence_type(self.pathname)
- logging.info("summarizing results for %s" % (self.pathname))
+ LOGGER.info("summarizing results for %s" % (self.pathname))
lines = 0
f = open(self.pathname)
for l in f.xreadlines():
elif tag == SequenceLane.SEQUENCE_TYPE.lower():
self.sequence_type = lookup_sequence_type.get(element.text, None)
else:
- logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
+ LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
class ELAND(object):
"""
full_lane_id = "%d_%d" % ( lane_id, end )
basename = pattern % (full_lane_id,)
- logging.info("Eland pattern: %s" %(basename,))
+ LOGGER.info("Eland pattern: %s" %(basename,))
pathname = os.path.join(basedir, basename)
if os.path.exists(pathname):
- logging.info('found eland file in %s' % (pathname,))
+ LOGGER.info('found eland file in %s' % (pathname,))
return pathname
else:
return None
# but I needed to persist the sample_name/lane_id for
# runfolder summary_report
path, name = os.path.split(pathname)
- logging.info("Adding eland file %s" %(name,))
+ LOGGER.info("Adding eland file %s" %(name,))
# split_name = name.split('_')
# lane_id = int(split_name[1])
update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
break
else:
- logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
+ LOGGER.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
continue
return e
def build_genome_fasta_map(genome_dir):
# build fasta to fasta file map
- logging.info("Building genome map")
+ LOGGER.info("Building genome map")
genome = genome_dir.split(os.path.sep)[-1]
fasta_map = {}
for vld_file in glob(os.path.join(genome_dir, '*.vld')):
from optparse import OptionParser
parser = OptionParser("%prog: <gerald dir>+")
opts, args = parser.parse_args(cmdline)
logging.basicConfig(level=logging.DEBUG)
for a in args:
- logging.info("Starting scan of %s" % (a,))
+ LOGGER.info("Starting scan of %s" % (a,))
e = eland(a)
print e.get_elements()
"""
Extract information about the Firecrest run
-Firecrest
+Firecrest
class holding the properties we found
-firecrest
+firecrest
Firecrest factory function initalized from a directory name
-fromxml
+fromxml
Firecrest factory function initalized from an xml dump from
the Firecrest object.
"""
from datetime import date
from glob import glob
+import logging
import os
import re
import time
VERSION_RE, \
EUROPEAN_STRPTIME
+LOGGER = logging.getLogger(__name__)
+
__docformat__ = "restructuredtext en"
class Firecrest(object):
if xml is not None:
self.set_elements(xml)
-
+
def _get_time(self):
return time.mktime(self.date.timetuple())
time = property(_get_time, doc='return run time as seconds since epoch')
raise ValueError('Expected "Firecrest" SubElements')
xml_version = int(tree.attrib.get('version', 0))
if xml_version > Firecrest.XML_VERSION:
- logging.warn('Firecrest XML tree is a higher version than this class')
+ LOGGER.warn('Firecrest XML tree is a higher version than this class')
for element in list(tree):
if element.tag == Firecrest.SOFTWARE_VERSION:
self.version = element.text
f.user = groups[3]
bustard_pattern = os.path.join(pathname, 'Bustard*')
- # should I parse this deeper than just stashing the
+ # should I parse this deeper than just stashing the
# contents of the matrix file?
matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
if os.path.exists(matrix_pathname):
from htsworkflow.util.alphanum import alphanum
+LOGGER = logging.getLogger(__name__)
class DuplicateGenome(Exception): pass
"""
raises IOError (on genome_base_dir not found)
raises DuplicateGenome on duplicate genomes found.
-
+
returns a double dictionary (i.e. d[species][build] = path)
"""
try:
species, build = line.split('|')
except:
- logging.warning('Skipping: Invalid metafile (%s) line: %s' \
- % (metafile, line))
+ LOGGER.warning('Skipping: Invalid metafile (%s) line: %s' \
+ % (metafile, line))
continue
build_dict = d.setdefault(species, {})
build_dict[build] = genome_dir
return d
-
+
class constructMapperDict(object):
"""
Emulate a dictionary to map genome|build names to paths.
-
+
It uses the dictionary generated by getAvailableGenomes.
"""
def __init__(self, genome_dict):
Return the best match for key
"""
elements = re.split("\|", key)
-
+
if len(elements) == 1:
# we just the species name
# get the set of builds
builds = self.genome_dict[elements[0]]
-
+
# sort build names the way humans would
keys = builds.keys()
keys.sort(cmp=alphanum)
-
+
# return the path from the 'last' build name
return builds[keys[-1]]
-
+
elif len(elements) == 2:
# we have species, and build name
return self.genome_dict[elements[0]][elements[1]]
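+# Usage sketch (paths are assumptions):
+#   d = constructMapperDict({'mouse': {'mm8': '/g/mm8', 'mm9': '/g/mm9'}})
+#   d['mouse']      -> '/g/mm9'  (newest build wins the alphanum sort)
+#   d['mouse|mm8']  -> '/g/mm8'  (explicit species|build key)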
return self[key]
except KeyError, e:
return default
-
+
def keys(self):
keys = []
for species in self.genome_dict.keys():
for build in self.genome_dict[species]:
keys.append([species+'|'+build])
return keys
-
+
def values(self):
values = []
for species in self.genome_dict.keys():
for build in self.genome_dict[species]:
values.append(self.genome_dict[species][build])
return values
-
+
def items(self):
items = []
for species in self.genome_dict.keys():
value = self.genome_dict[species][build]
items.append((key, value))
return items
-
+
if __name__ == '__main__':
if len(sys.argv) != 2:
for k,v in d2.items():
print '%s: %s' % (k,v)
-
-
+
+
VERSION_RE
from htsworkflow.util.ethelp import indent, flatten
+LOGGER = logging.getLogger(__name__)
+
class Gerald(object):
"""
Capture meaning out of the GERALD directory
raise ValueError('exptected GERALD')
xml_version = int(tree.attrib.get('version', 0))
if xml_version > Gerald.XML_VERSION:
- logging.warn('XML tree is a higher version than this class')
+ LOGGER.warn('XML tree is a higher version than this class')
self.eland_results = ELAND()
for element in list(tree):
tag = element.tag.lower()
elif tag == ELAND.ELAND.lower():
self.eland_results = ELAND(xml=element)
else:
- logging.warn("Unrecognized tag %s" % (element.tag,))
+ LOGGER.warn("Unrecognized tag %s" % (element.tag,))
def gerald(pathname):
g = Gerald()
g.pathname = os.path.expanduser(pathname)
path, name = os.path.split(g.pathname)
- logging.info("Parsing gerald config.xml")
+ LOGGER.info("Parsing gerald config.xml")
config_pathname = os.path.join(g.pathname, 'config.xml')
g.tree = ElementTree.parse(config_pathname).getroot()
# parse Summary.htm file
summary_pathname = os.path.join(g.pathname, 'Summary.xml')
if os.path.exists(summary_pathname):
- logging.info("Parsing Summary.xml")
+ LOGGER.info("Parsing Summary.xml")
else:
summary_pathname = os.path.join(g.pathname, 'Summary.htm')
- logging.info("Parsing Summary.htm")
+ LOGGER.info("Parsing Summary.htm")
g.summary = Summary(summary_pathname)
# parse eland files
g.eland_results = eland(g.pathname, g)
"""
Extract information about the IPAR run
-IPAR
+IPAR
class holding the properties we found
ipar
IPAR factory function initalized from a directory name
VERSION_RE, \
EUROPEAN_STRPTIME
+LOGGER = logging.getLogger(__name__)
SOFTWARE_NAMES = ('IPAR_1.01', 'IPAR_1.3', 'Intensities')
class Tiles(object):
raise ValueError('Expected "IPAR" SubElements')
xml_version = int(tree.attrib.get('version', 0))
if xml_version > IPAR.XML_VERSION:
- logging.warn('IPAR XML tree is a higher version than this class')
+ LOGGER.warn('IPAR XML tree is a higher version than this class')
for element in list(tree):
if element.tag == IPAR.RUN:
self.tree = element
if run.attrib.has_key('Name') and run.attrib['Name'] in SOFTWARE_NAMES:
return run
else:
- logging.info("No run found")
+ LOGGER.info("No run found")
return None
def ipar(pathname):
"""
Examine the directory at pathname and initalize a IPAR object
"""
- logging.info("Searching IPAR directory %s" % (pathname,))
+ LOGGER.info("Searching IPAR directory %s" % (pathname,))
i = IPAR()
i.pathname = pathname
os.path.join(path, '.params')]
for paramfile in paramfiles:
if os.path.exists(paramfile):
- logging.info("Found IPAR Config file at: %s" % ( paramfile, ))
+ LOGGER.info("Found IPAR Config file at: %s" % ( paramfile, ))
i.tree = load_ipar_param_tree(paramfile)
mtime_local = os.stat(paramfile)[stat.ST_MTIME]
i.time = mtime_local
# JSON dictionaries use strings
LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]
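+# JSON object keys are always strings, hence the str() above, e.g.:
+#   json.loads(json.dumps({1: 'a'})) == {'1': 'a'}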
+LOGGER = logging.getLogger(__name__)
+
__docformat__ = "restructredtext en"
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
web = urllib2.urlopen(url, apipayload)
except urllib2.URLError, e:
errmsg = 'URLError: %d %s' % (e.code, e.msg)
- logging.error(errmsg)
- logging.error('opened %s' % (url,))
+ LOGGER.error(errmsg)
+ LOGGER.error('opened %s' % (url,))
raise IOError(errmsg)
contents = web.read()
lane_prefix = u"".join(lane_numbers)
species_path = genome_map.get(species, None)
- logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
+ LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
if not is_sequencing and species_path is None:
no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
- logging.warning(no_genome_msg % (lane_numbers, species))
+ LOGGER.warning(no_genome_msg % (lane_numbers, species))
is_sequencing = True
if is_sequencing:
retrieves the flowcell eland config file, give the base_host_url
(i.e. http://sub.domain.edu:port)
"""
- logging.info('USING OPTIONS:')
- logging.info(u' URL: %s' % (options.url,))
- logging.info(u' OUT: %s' % (options.output_filepath,))
- logging.info(u' FC: %s' % (options.flowcell,))
- #logging.info(': %s' % (options.genome_dir,))
- logging.info(u'post_run: %s' % ( unicode(options.post_run),))
+ LOGGER.info('USING OPTIONS:')
+ LOGGER.info(u' URL: %s' % (options.url,))
+ LOGGER.info(u' OUT: %s' % (options.output_filepath,))
+ LOGGER.info(u' FC: %s' % (options.flowcell,))
+ #LOGGER.info(': %s' % (options.genome_dir,))
+ LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))
flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
- logging.debug('genome_dir: %s' % ( options.genome_dir, ))
+ LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
available_genomes = getAvailableGenomes(options.genome_dir)
genome_map = constructMapperDict(available_genomes)
- logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
+ LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
#config = format_gerald_config(options, flowcell_info, genome_map)
#
#if options.output_filepath is not None:
# outstream = open(options.output_filepath, 'w')
- # logging.info('Writing config file to %s' % (options.output_filepath,))
+ # LOGGER.info('Writing config file to %s' % (options.output_filepath,))
#else:
# outstream = sys.stdout
#
except ImportError, e:
from elementtree import ElementTree
+LOGGER = logging.getLogger(__name__)
+
EUROPEAN_STRPTIME = "%d-%m-%Y"
EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
VERSION_RE = "([0-9\.]+)"
else:
flowcell_id = 'unknown'
- logging.warning(
+ LOGGER.warning(
"Flowcell id was not found, guessing %s" % (
flowcell_id))
self._flowcell_id = flowcell_id
elif tag == gerald.Gerald.GERALD.lower():
self.gerald = gerald.Gerald(xml=element)
else:
- logging.warn('PipelineRun unrecognized tag %s' % (tag,))
+ LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))
def _get_run_name(self):
"""
def save(self, destdir=None):
if destdir is None:
destdir = ''
- logging.info("Saving run report " + self.name)
+ LOGGER.info("Saving run report " + self.name)
xml = self.get_elements()
indent(xml)
dest_pathname = os.path.join(destdir, self.name)
ElementTree.ElementTree(xml).write(dest_pathname)
def load(self, filename):
- logging.info("Loading run report from " + filename)
+ LOGGER.info("Loading run report from " + filename)
tree = ElementTree.parse(filename).getroot()
self.set_elements(tree)
"""
Load and instantiate a Pipeline run from a run xml file
- :Parameters:
+ :Parameters:
- `pathname` : location of an run xml file
:Returns: initialized PipelineRun object
from htsworkflow.pipelines import gerald
def scan_post_image_analysis(runs, runfolder, image_analysis, pathname):
- logging.info("Looking for bustard directories in %s" % (pathname,))
+ LOGGER.info("Looking for bustard directories in %s" % (pathname,))
bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
# RTA BaseCalls looks enough like Bustard.
bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
for bustard_pathname in bustard_dirs:
- logging.info("Found bustard directory %s" % (bustard_pathname,))
+ LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
b = bustard.bustard(bustard_pathname)
gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
- logging.info("Looking for gerald directories in %s" % (pathname,))
+ LOGGER.info("Looking for gerald directories in %s" % (pathname,))
for gerald_pathname in glob(gerald_glob):
- logging.info("Found gerald directory %s" % (gerald_pathname,))
+ LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
try:
g = gerald.gerald(gerald_pathname)
p = PipelineRun(runfolder)
p.gerald = g
runs.append(p)
except IOError, e:
- logging.error("Ignoring " + str(e))
+ LOGGER.error("Ignoring " + str(e))
datadir = os.path.join(runfolder, 'Data')
- logging.info('Searching for runs in ' + datadir)
+ LOGGER.info('Searching for runs in ' + datadir)
runs = []
# scan for firecrest directories
for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
- logging.info('Found firecrest in ' + datadir)
+ LOGGER.info('Found firecrest in ' + datadir)
image_analysis = firecrest.firecrest(firecrest_pathname)
if image_analysis is None:
- logging.warn(
+ LOGGER.warn(
"%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
)
else:
# The Intensities directory from the RTA software looks a lot like IPAR
ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
for ipar_pathname in ipar_dirs:
- logging.info('Found ipar directories in ' + datadir)
+ LOGGER.info('Found ipar directories in ' + datadir)
image_analysis = ipar.ipar(ipar_pathname)
if image_analysis is None:
- logging.warn(
+ LOGGER.warn(
"%s is an empty or invalid IPAR directory" % (ipar_pathname,)
)
else:
runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))
- logging.info('--- use-run detected options ---')
- logging.info('runfolder: %s' % (runfolder_dir,))
- logging.info('image_dir: %s' % (image_dir,))
- logging.info('bustard_dir: %s' % (bustard_dir,))
- logging.info('gerald_dir: %s' % (gerald_dir,))
+ LOGGER.info('--- use-run detected options ---')
+ LOGGER.info('runfolder: %s' % (runfolder_dir,))
+ LOGGER.info('image_dir: %s' % (image_dir,))
+ LOGGER.info('bustard_dir: %s' % (bustard_dir,))
+ LOGGER.info('gerald_dir: %s' % (gerald_dir,))
# find our processed image dir
image_run = None
# split into parent, and leaf directory
# leaf directory should be an IPAR or firecrest directory
data_dir, short_image_dir = os.path.split(image_dir)
- logging.info('data_dir: %s' % (data_dir,))
- logging.info('short_iamge_dir: %s' % (short_image_dir,))
+ LOGGER.info('data_dir: %s' % (data_dir,))
+ LOGGER.info('short_image_dir: %s' % (short_image_dir,))
# guess which type of image processing directory we have by looking
# in the leaf directory name
elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
image_run = ipar.ipar(image_dir)
- # if we din't find a run, report the error and return
+ # if we didn't find a run, report the error and return
if image_run is None:
msg = '%s does not contain an image processing step' % (image_dir,)
- logging.error(msg)
+ LOGGER.error(msg)
return None
# find our base calling
base_calling_run = bustard.bustard(bustard_dir)
if base_calling_run is None:
- logging.error('%s does not contain a bustard run' % (bustard_dir,))
+ LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))
return None
# find alignments
gerald_run = gerald.gerald(gerald_dir)
if gerald_run is None:
- logging.error('%s does not contain a gerald run' % (gerald_dir,))
+ LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))
return None
p = PipelineRun(runfolder_dir)
p.bustard = base_calling_run
p.gerald = gerald_run
- logging.info('Constructed PipelineRun from %s' % (gerald_dir,))
+ LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
return p
def extract_run_parameters(runs):
cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
if os.path.exists(status_file):
cmd_list.extend(['Status.xml', 'Status.xsl'])
- logging.info("Saving reports from " + reports_dir)
+ LOGGER.info("Saving reports from " + reports_dir)
cwd = os.getcwd()
os.chdir(data_dir)
q = QueueCommands([" ".join(cmd_list)])
# Copy Summary.htm
summary_path = os.path.join(gerald_object.pathname, 'Summary.htm')
if os.path.exists(summary_path):
- logging.info('Copying %s to %s' % (summary_path, cycle_dir))
+ LOGGER.info('Copying %s to %s' % (summary_path, cycle_dir))
shutil.copy(summary_path, cycle_dir)
else:
- logging.info('Summary file %s was not found' % (summary_path,))
+ LOGGER.info('Summary file %s was not found' % (summary_path,))
def save_ivc_plot(bustard_object, cycle_dir):
"""
plot_target_path = os.path.join(cycle_dir, 'Plots')
if os.path.exists(plot_html):
- logging.debug("Saving %s" % (plot_html,))
- logging.debug("Saving %s" % (plot_images,))
+ LOGGER.debug("Saving %s" % (plot_html,))
+ LOGGER.debug("Saving %s" % (plot_images,))
shutil.copy(plot_html, cycle_dir)
if not os.path.exists(plot_target_path):
os.mkdir(plot_target_path)
for plot_file in glob(plot_images):
shutil.copy(plot_file, plot_target_path)
else:
- logging.warning('Missing IVC.html file, not archiving')
+ LOGGER.warning('Missing IVC.html file, not archiving')
def compress_score_files(bustard_object, cycle_dir):
bzip_cmd = [ 'bzip2', '-9', '-c' ]
tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
tar_dest = open(tar_dest_name, 'w')
- logging.info("Compressing score files from %s" % (scores_path,))
- logging.info("Running tar: " + " ".join(tar_cmd[:10]))
- logging.info("Running bzip2: " + " ".join(bzip_cmd))
- logging.info("Writing to %s" % (tar_dest_name,))
+ LOGGER.info("Compressing score files from %s" % (scores_path,))
+ LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
+ LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
+ LOGGER.info("Writing to %s" % (tar_dest_name,))
env = {'BZIP': '-9'}
tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
for eland_lane in lanes_dictionary.values():
source_name = eland_lane.pathname
if source_name is None:
- logging.info(
+ LOGGER.info(
"Lane ID %s does not have a filename." % (eland_lane.lane_id,))
else:
path, name = os.path.split(source_name)
dest_name = os.path.join(cycle_dir, name)
- logging.info("Saving eland file %s to %s" % \
+ LOGGER.info("Saving eland file %s to %s" % \
(source_name, dest_name))
if is_compressed(name):
- logging.info('Already compressed, Saving to %s' % (dest_name,))
+ LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
shutil.copy(source_name, dest_name)
else:
# not compressed
dest_name += '.bz2'
args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
bz_commands.append(" ".join(args))
- #logging.info('Running: %s' % ( " ".join(args) ))
+ #LOGGER.info('Running: %s' % ( " ".join(args) ))
#bzip_dest = open(dest_name, 'w')
#bzip = subprocess.Popen(args, stdout=bzip_dest)
- #logging.info('Saving to %s' % (dest_name, ))
+ #LOGGER.info('Saving to %s' % (dest_name, ))
#bzip.wait()
if len(bz_commands) > 0:
for r in runs:
result_dir = os.path.join(output_base_dir, r.flowcell_id)
- logging.info("Using %s as result directory" % (result_dir,))
+ LOGGER.info("Using %s as result directory" % (result_dir,))
if not os.path.exists(result_dir):
os.mkdir(result_dir)
# create cycle_dir
cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
- logging.info("Filling in %s" % (cycle,))
+ LOGGER.info("Filling in %s" % (cycle,))
cycle_dir = os.path.join(result_dir, cycle)
cycle_dir = os.path.abspath(cycle_dir)
if os.path.exists(cycle_dir):
- logging.error("%s already exists, not overwriting" % (cycle_dir,))
+ LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
continue
else:
os.mkdir(cycle_dir)
def rm_list(files, dry_run=True):
for f in files:
if os.path.exists(f):
- logging.info('deleting %s' % (f,))
+ LOGGER.info('deleting %s' % (f,))
if not dry_run:
if os.path.isdir(f):
shutil.rmtree(f)
else:
os.unlink(f)
else:
- logging.warn("%s doesn't exist." % (f,))
+ LOGGER.warn("%s doesn't exist." % (f,))
def clean_runs(runs, dry_run=True):
"""
Clean up run folders to optimize for compression.
"""
if dry_run:
- logging.info('In dry-run mode')
+ LOGGER.info('In dry-run mode')
for run in runs:
- logging.info('Cleaninging %s' % (run.pathname,))
+ LOGGER.info('Cleaning %s' % (run.pathname,))
# rm RunLog*.xml
runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
rm_list(runlogs, dry_run)
calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
rm_list(calibration_dir, dry_run)
# rm Images/L*
- logging.info("Cleaning images")
+ LOGGER.info("Cleaning images")
image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
rm_list(image_dirs, dry_run)
# cd Data/C1-*_Firecrest*
- logging.info("Cleaning intermediate files")
+ LOGGER.info("Cleaning intermediate files")
# make clean_intermediate
if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
clean_process = subprocess.Popen(['make', 'clean_intermediate'],
import os
import re
+LOGGER = logging.getLogger(__name__)
+
eland_re = re.compile('s_(?P<lane>\d)(_(?P<read>\d))?_eland_')
raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+')
qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+_l[\d]_r[\d].tar.bz2')
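+# Examples of names these patterns catch (constructed from the patterns
+# themselves, for illustration):
+#   eland_re: 's_4_1_eland_extended.txt.bz2' -> lane '4', read '1'
+#   qseq_re:  'woldlab_090101_HWI-EAS229_0042_42AAAXX_l1_r1.tar.bz2'
+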
class SequenceFile(object):
"""
Simple container class that holds the path to a sequence archive
- and basic descriptive information.
+ and basic descriptive information.
"""
def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
self.filetype = filetype
Add this entry to a DB2.0 database.
"""
#FIXME: NEEDS SQL ESCAPING
- header_macro = {'table': SEQUENCE_TABLE_NAME }
+ header_macro = {'table': SEQUENCE_TABLE_NAME }
sql_header = "insert into %(table)s (" % header_macro
sql_columns = ['filetype','path','flowcell','lane']
sql_middle = ") values ("
stop = cycle_match.group('stop')
if stop is not None:
stop = int(stop)
-
+
return flowcell, start, stop
-
+
def parse_srf(path, filename):
flowcell_dir, start, stop = get_flowcell_cycle(path)
basename, ext = os.path.splitext(filename)
fullpath = os.path.join(path, filename)
if flowcell_dir != flowcell:
- logging.warn("flowcell %s found in wrong directory %s" % \
+ LOGGER.warn("flowcell %s found in wrong directory %s" % \
(flowcell, path))
return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
read = int(records[6][1])
if flowcell_dir != flowcell:
- logging.warn("flowcell %s found in wrong directory %s" % \
+ LOGGER.warn("flowcell %s found in wrong directory %s" % \
(flowcell, path))
return SequenceFile('qseq', fullpath, flowcell, lane, read, cycle=stop)
lane = int(records[5][1])
read = int(records[6][1])
pf = parse_fastq_pf_flag(records)
-
+
if flowcell_dir != flowcell:
- logging.warn("flowcell %s found in wrong directory %s" % \
+ LOGGER.warn("flowcell %s found in wrong directory %s" % \
(flowcell, path))
return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop)
(records[-1], os.path.join(path,filename)))
return pf
-
+
def parse_eland(path, filename, eland_match=None):
if eland_match is None:
eland_match = eland_re.match(filename)
else:
read = None
return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=stop)
-
+
def scan_for_sequences(dirs):
"""
Scan through a list of directories for sequence like files
"""
sequences = []
for d in dirs:
- logging.info("Scanning %s for sequences" % (d,))
+ LOGGER.info("Scanning %s for sequences" % (d,))
if not os.path.exists(d):
- logging.warn("Flowcell directory %s does not exist" % (d,))
+ LOGGER.warn("Flowcell directory %s does not exist" % (d,))
continue
-
+
for path, dirname, filenames in os.walk(d):
for f in filenames:
seq = None
seq = parse_eland(path, f, eland_match)
if seq:
sequences.append(seq)
- logging.debug("Found sequence at %s" % (f,))
-
+ LOGGER.debug("Found sequence at %s" % (f,))
+
return sequences
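
A usage sketch for the scanner above; the archive path is hypothetical:

    from htsworkflow.pipelines.sequences import scan_for_sequences

    seqs = scan_for_sequences(['/archive/flowcells/30DY0AAXX'])
    for s in seqs:
        # SequenceFile records filetype ('srf', 'qseq', 'fastq' or 'eland'),
        # path, flowcell, lane, and optionally read/pf/cycle
        print s.filetype, s.flowcell, s.lane, s.path
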
from htsworkflow.util import queuecommands
+LOGGER = logging.getLogger(__name__)
+
SOLEXA2SRF = 0
ILLUMINA2SRF10 = 1
ILLUMINA2SRF11 = 2
def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
"""
make a subprocess-friendly list of command line arguments to run solexa2srf
- generates files like:
+ generates files like:
woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
site run name lane
-
+
run_name - most of the file name (run folder name is a good choice)
lanes - list of integers corresponding to which lanes to process
site_name - name of your "sequencing site" or "Individual"
destdir - where to write all the srf files
"""
# clean up pathname
- logging.info("run_name %s" % (run_name,))
+ LOGGER.info("run_name %s" % (run_name,))
cmd_list = []
for lane in lanes:
else:
raise ValueError("Unrecognized run level %d" % (cmdlevel,))
- logging.info("Generated command: " + " ".join(cmd))
+ LOGGER.info("Generated command: " + " ".join(cmd))
cmd_list.append(" ".join(cmd))
return cmd_list
def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
"""
-    make a subprocess-friendly list of command line arguments to run solexa2srf
-    generates files like: 
-    woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
-     site run name lane
-
+    make a subprocess-friendly list of tar commands to archive qseq files,
+    generating files like:
+    woldlab_080514_HWI-EAS229_0029_20768AAXX_l8_r1.tar.bz2
+      site run name lane read
+
     run_name - most of the file name (run folder name is a good choice)
     lanes - list of integers corresponding to which lanes to process
     site_name - name of your "sequencing site" or "Individual"
-    destdir - where to write all the srf files
+    destdir - where to write all the tar files
"""
# clean up pathname
- logging.info("run_name %s" % (run_name,))
+ LOGGER.info("run_name %s" % (run_name,))
cmd_list = []
for lane in lanes:
dest_path = os.path.join(destdir, destname)
cmd = " ".join(['tar', 'cjf', dest_path, pattern % (lane,) ])
- logging.info("Generated command: " + cmd)
+ LOGGER.info("Generated command: " + cmd)
cmd_list.append(cmd)
return cmd_list
def run_commands(new_dir, cmd_list, num_jobs):
- logging.info("chdir to %s" % (new_dir,))
+ LOGGER.info("chdir to %s" % (new_dir,))
curdir = os.getcwd()
os.chdir(new_dir)
q = queuecommands.QueueCommands(cmd_list, num_jobs)
for f in file_list:
cmd = " ".join(['md5sum', f, '>', f + '.md5'])
- logging.info('generated command: ' + cmd)
+ LOGGER.info('generated command: ' + cmd)
cmd_list.append(cmd)
return cmd_list
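
The builders above pair with run_commands; a sketch of the flow, assuming the
module lives at htsworkflow.pipelines.srf (paths and lanes hypothetical):

    from htsworkflow.pipelines import srf

    cmds = srf.make_srf_commands('080514_HWI-EAS229_0029_20768AAXX',
                                 '/runs/0029/Data/C1-37/Bustard',
                                 lanes=[1, 2],
                                 site_name='woldlab',
                                 destdir='/archive/srfs',
                                 cmdlevel=srf.ILLUMINA2SRF11)
    # chdir into the bustard directory and run two jobs at a time
    srf.run_commands('/runs/0029/Data/C1-37/Bustard', cmds, num_jobs=2)
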
from htsworkflow.util.opener import autoopen
from htsworkflow.version import version
+LOGGER = logging.getLogger(__name__)
+
# constants for our fastq finite state machine
FASTQ_HEADER = 0
FASTQ_SEQUENCE = 1
else:
left = open_write(opts.left, opts.force)
right = open_write(opts.right, opts.force)
-
+
# open the srf, fastq, or compressed fastq
if is_srf(args[0]):
source = srf_open(args[0], opts.cnf1)
convert_single_to_fastq(source, left, header)
else:
convert_single_to_two_fastq(source, left, right, opts.mid, header)
-
+
return 0
def make_parser():
help="add flowcell id header to sequence")
parser.add_option('-l','--left', default="r1.fastq",
help='left side filename')
- parser.add_option('-m','--mid', default=None,
+ parser.add_option('-m','--mid', default=None,
help='actual sequence mid point')
parser.add_option('-r','--right', default="r2.fastq",
help='right side filename')
if cnf1 or is_cnf1(filename):
cmd.append('-c')
cmd.append(filename)
-
- logging.info('srf command: %s' % (" ".join(cmd),))
+
+ LOGGER.info('srf command: %s' % (" ".join(cmd),))
p = Popen(cmd, stdout=PIPE)
return p.stdout
-
+
def convert_single_to_fastq(instream, target1, header=''):
state = FASTQ_SEQUENCE_HEADER
# quality header
elif state == FASTQ_SEQUENCE_HEADER:
- # the sequence header isn't really sequence, but
+ # the sequence header isn't really sequence, but
# we're just passing it through
write_sequence(target1, line)
state = FASTQ_QUALITY
raise RuntimeError("Unrecognized STATE in fastq split")
-
+
def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
"""
-    read a fastq file where two paired ends have been run together into 
-    two halves.
+    read a fastq file where the two paired-end reads have been run
+    together, and split each read into two halves.
instream is the source stream
state = FASTQ_SEQUENCE_HEADER
# quality header
elif state == FASTQ_SEQUENCE_HEADER:
- # the sequence header isn't really sequence, but
+ # the sequence header isn't really sequence, but
# we're just passing it through
write_sequence(target1, line)
write_sequence(target2, line)
"""
max_header = 1024 ** 2
PROGRAM_ID = 'PROGRAM_ID\000'
- cnf4_apps = set(("solexa2srf v1.4",
+ cnf4_apps = set(("solexa2srf v1.4",
"illumina2srf v1.11.5.Illumina.1.3"))
if not is_srf(filename):
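
Tying the pieces together, a hypothetical command line for the splitter (the
script name is assumed; the flags come from the option definitions above):

    python srf2fastq.py --left 11011_r1.fastq --right 11011_r2.fastq \
        --mid 76 input.srf
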
from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
+LOGGER = logging.getLogger(__name__)
nan = float('nan')
class Summary(object):
for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
node = element.find(GeraldName)
if node is None:
- logging.info("Couldn't find %s" % (GeraldName))
+ LOGGER.info("Couldn't find %s" % (GeraldName))
setattr(self, LRSName, parse_xml_mean_range(node))
-
+
def get_elements(self):
lane_result = ElementTree.Element(
Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
setattr(self, variable_name,
parse_summary_element(element))
except KeyError, e:
- logging.warn('Unrecognized tag %s' % (element.tag,))
+ LOGGER.warn('Unrecognized tag %s' % (element.tag,))
def __init__(self, filename=None, xml=None):
# lane results is a list of 1 or 2 ends containing
Extract just the lane results.
Currently those are the only ones we care about.
"""
-
+
tables = self._extract_named_tables(pathname)
self._extract_lane_results_for_end(tables, name, end)
if len(self.lane_results[0]) == 0:
- logging.warning("No Lane Results Summary Found in %s" % (pathname,))
+ LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
def _extract_named_tables_from_gerald_xml(self, tree):
"""
self.lane_results[lrs.end][lrs.lane] = lrs
# probably not useful
return tables
-
+
###### START HTML Table Extraction ########
def _extract_named_tables_from_html(self, tree):
body = tree.find('body')
return ValueError("Expected %s" % (Summary.SUMMARY,))
xml_version = int(tree.attrib.get('version', 0))
if xml_version > Summary.XML_VERSION:
- logging.warn('Summary XML tree is a higher version than this class')
+ LOGGER.warn('Summary XML tree is a higher version than this class')
for element in list(tree):
lrs = Summary.LaneResultSummary()
lrs.set_elements(element)
"""
if element is None:
return None
-
+
mean = element.find('mean')
stddev = element.find('stdev')
if mean is None or stddev is None:
raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
- if mean.text is None:
+ if mean.text is None:
mean_value = float('nan')
else:
mean_value = tonumber(mean.text)
- if stddev.text is None:
+ if stddev.text is None:
stddev_value = float('nan')
else:
stddev_value = tonumber(stddev.text)
for fname in args:
s = Summary(fname)
s.dump()
-
+
from htsworkflow.util.api import HtswApi
from htsworkflow.util.conversion import parse_flowcell_id
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
class CondorFastqExtract(object):
def __init__(self, host, apidata, sequences_path,
def build_fastqs(self, library_result_map ):
"""
Generate condor scripts to build any needed fastq files
-
+
Args:
library_result_map (list): [(library_id, destination directory), ...]
"""
srf_condor_header = self.get_srf_condor_header()
srf_condor_entries = []
lib_db = self.find_archive_sequence_files(library_result_map)
-
+
needed_targets = self.find_missing_targets(library_result_map, lib_db)
-
+
for target_pathname, available_sources in needed_targets.items():
- logger.debug(' target : %s' % (target_pathname,))
- logger.debug(' candidate sources: %s' % (available_sources,))
+ LOGGER.debug(' target : %s' % (target_pathname,))
+ LOGGER.debug(' candidate sources: %s' % (available_sources,))
if available_sources.has_key('qseq'):
source = available_sources['qseq']
qseq_condor_entries.append(
- self.condor_qseq_to_fastq(source.path,
- target_pathname,
+ self.condor_qseq_to_fastq(source.path,
+ target_pathname,
source.flowcell)
)
elif available_sources.has_key('srf'):
source = available_sources['srf']
mid = getattr(source, 'mid_point', None)
srf_condor_entries.append(
- self.condor_srf_to_fastq(source.path,
+ self.condor_srf_to_fastq(source.path,
target_pathname,
source.paired,
source.flowcell,
)
else:
print " need file", target_pathname
-
+
if len(srf_condor_entries) > 0:
- make_submit_script('srf.fastq.condor',
+ make_submit_script('srf.fastq.condor',
srf_condor_header,
srf_condor_entries)
-
+
if len(qseq_condor_entries) > 0:
- make_submit_script('qseq.fastq.condor',
+ make_submit_script('qseq.fastq.condor',
qseq_condor_header,
qseq_condor_entries)
-
+
def get_qseq_condor_header(self):
return """Universe=vanilla
error=%(log)s/srf_pair_fastq.$(process).out
log=%(log)s/srf_pair_fastq.log
environment="PYTHONPATH=%(env)s"
-
+
""" % {'exe': sys.executable,
'log': self.log_path,
'env': os.environ.get('PYTHONPATH', '')
}
-
+
def find_archive_sequence_files(self, library_result_map):
"""
Find archived sequence files associated with our results.
"""
- logger.debug("Searching for sequence files in: %s" %(self.sequences_path,))
-
+ LOGGER.debug("Searching for sequence files in: %s" %(self.sequences_path,))
+
lib_db = {}
seq_dirs = set()
candidate_lanes = {}
lib_info = self.api.get_library(lib_id)
lib_info['lanes'] = {}
lib_db[lib_id] = lib_info
-
+
for lane in lib_info['lane_set']:
lane_key = (lane['flowcell'], lane['lane_number'])
candidate_lanes[lane_key] = lib_id
- seq_dirs.add(os.path.join(self.sequences_path,
- 'flowcells',
+ seq_dirs.add(os.path.join(self.sequences_path,
+ 'flowcells',
lane['flowcell']))
- logger.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
+ LOGGER.debug("Seq_dirs = %s" %(unicode(seq_dirs)))
candidate_seq_list = scan_for_sequences(seq_dirs)
-
+
# at this point we have too many sequences as scan_for_sequences
# returns all the sequences in a flowcell directory
# so lets filter out the extras
-
+
for seq in candidate_seq_list:
lane_key = (seq.flowcell, seq.lane)
lib_id = candidate_lanes.get(lane_key, None)
if lib_id is not None:
lib_info = lib_db[lib_id]
lib_info['lanes'].setdefault(lane_key, set()).add(seq)
-
+
return lib_db
-
+
def find_missing_targets(self, library_result_map, lib_db):
"""
Check if the sequence file exists.
This requires computing what the sequence name is and checking
to see if it can be found in the sequence location.
-
+
Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
"""
fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
for lib_id, result_dir in library_result_map:
lib = lib_db[lib_id]
lane_dict = make_lane_dict(lib_db, lib_id)
-
+
for lane_key, sequences in lib['lanes'].items():
for seq in sequences:
seq.paired = lane_dict[seq.flowcell]['paired_end']
lane_status = lane_dict[seq.flowcell]['status']
-
+
if seq.paired and seq.read is None:
seq.read = 1
- filename_attributes = {
+ filename_attributes = {
'flowcell': seq.flowcell,
'lib_id': lib_id,
'lane': seq.lane,
# 30DY0 only ran for 151 bases instead of 152
# it is actually 76 1st read, 75 2nd read
seq.mid_point = 76
-
+
# end filters
if seq.paired:
target_name = fastq_paired_template % filename_attributes
else:
target_name = fastq_single_template % filename_attributes
-
+
target_pathname = os.path.join(result_dir, target_name)
if self.force or not os.path.exists(target_pathname):
t = needed_targets.setdefault(target_pathname, {})
t[seq.filetype] = seq
-
+
return needed_targets
-
+
def condor_srf_to_fastq(self,
srf_file,
target_pathname,
# this is ugly. I did it because I was pregenerating the target
# names before I tried to figure out what sources could generate
# those targets, and everything up to this point had been
- # one-to-one. So I couldn't figure out how to pair the
- # target names.
+ # one-to-one. So I couldn't figure out how to pair the
+ # target names.
# With this at least the command will run correctly.
# however if we rename the default targets, this'll break
# also I think it'll generate it twice.
- args.extend(['--right',
+ args.extend(['--right',
target_pathname.replace('_r1.fastq', '_r2.fastq')])
else:
args.extend(['--single', target_pathname ])
if flowcell is not None:
args.extend(['--flowcell', flowcell])
-
+
if mid is not None:
args.extend(['-m', str(mid)])
-
+
if self.force:
args.extend(['--force'])
-
+
script = """arguments="%s"
queue
""" % (" ".join(args),)
-
- return script
-
-
+
+ return script
+
+
def condor_qseq_to_fastq(self, qseq_file, target_pathname, flowcell=None):
py = qseq2fastq.__file__
args = [py, '-i', qseq_file, '-o', target_pathname ]
script = """arguments="%s"
queue
""" % (" ".join(args))
-
- return script
-
+
+ return script
+
def make_submit_script(target, header, body_list):
"""
write out a text file
this was intended for condor submit scripts
Args:
- target (str or stream):
+ target (str or stream):
if target is a string, we will open and close the file
if target is a stream, the caller is responsible.
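
For orientation, the header and per-target entries assemble into a condor
submit file shaped roughly like this (paths and the executable line are
illustrative, not taken from the code above):

    Universe=vanilla
    executable=/usr/bin/python
    error=log/srf_pair_fastq.$(process).out
    log=log/srf_pair_fastq.log
    environment="PYTHONPATH=/opt/htsworkflow"

    arguments="srf2fastq.py --single 11011_30DY0AAXX_c152_l4.fastq input.srf"
    queue
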
dafTermOntology['name']))
if view_name is None:
errmsg = 'Could not find view name for {0}'
- logging.warning(errmsg.format(str(view)))
+ logger.warning(errmsg.format(str(view)))
return
view_name = str(view_name)
md5 = make_md5sum(submission_pathname)
if md5 is None:
errmsg = "Unable to produce md5sum for {0}"
- logging.warning(errmsg.format(submission_pathname))
+ logger.warning(errmsg.format(submission_pathname))
else:
self.model.add_statement(
RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
import urllib2
import urlparse
+LOGGER = logging.getLogger(__name__)
def add_auth_options(parser):
"""Add options OptParser configure authentication options
config.read([os.path.expanduser('~/.htsworkflow.ini'),
'/etc/htsworkflow.ini'
])
-
+
sequence_archive = None
apiid = None
apikey = None
"""
if opts.host is None or opts.apiid is None or opts.apikey is None:
parser.error("Please specify host url, apiid, apikey")
-
+
return {'apiid': opts.apiid, 'apikey': opts.apikey }
def lanes_for_user_url(root_url, username):
"""
Return the url for returning all the lanes associated with a username
-
+
Args:
username (str): a username in your target filesystem
root_url (str): the root portion of the url, e.g. http://localhost
web = urllib2.urlopen(url, apipayload)
except urllib2.URLError, e:
if hasattr(e, 'code') and e.code == 404:
- logging.info("%s was not found" % (url,))
+ LOGGER.info("%s was not found" % (url,))
return None
else:
errmsg = 'URLError: %s' % (str(e))
raise IOError(errmsg)
-
+
contents = web.read()
headers = web.info()
def get_url(self, url):
return retrieve_info(url, self.authdata)
-
+
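
A sketch of driving the wrapper; host and credentials are hypothetical, and
the HtswApi constructor signature is inferred from how CondorFastqExtract
uses it above:

    from htsworkflow.util.api import HtswApi

    apidata = {'apiid': '42', 'apikey': 'not-a-real-key'}
    api = HtswApi('http://localhost', apidata)
    lib_info = api.get_library('11011')
    if lib_info is not None:
        print lib_info['lane_set']
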
"""
-Run up to N simultanous jobs from provided of commands
+Run up to N simultaneous jobs from a provided list of commands
"""
import logging
import sys
import time
+LOGGER = logging.getLogger(__name__)
+
class QueueCommands(object):
"""
Queue up N commands from cmd_list, launching more jobs as the first
def __init__(self, cmd_list, N=0, cwd=None, env=None):
"""
cmd_list is a list of elements suitable for subprocess
- N is the number of simultanious processes to run.
+        N is the number of simultaneous processes to run.
0 is all of them.
-
+
WARNING: this will not work on windows
-        (It depends on being able to pass local file descriptors to the
-        select call with isn't supported by the Win32 API)
+        (It depends on being able to pass local file descriptors to the
+        select call, which isn't supported by the Win32 API)
"""
self.to_run = cmd_list[:]
Launch jobs until we have the maximum allowable running
(or have run out of jobs)
"""
- queue_log = logging.getLogger('queue')
-
while (len(self.to_run) > 0) and self.under_process_limit():
- queue_log.info('%d left to run', len(self.to_run))
+ LOGGER.info('%d left to run', len(self.to_run))
cmd = self.to_run.pop(0)
- p = subprocess.Popen(cmd,
- stdout=PIPE,
- shell=True,
- cwd=self.cwd,
+ p = subprocess.Popen(cmd,
+ stdout=PIPE,
+ shell=True,
+ cwd=self.cwd,
env=self.env)
self.running[p.stdout] = p
- queue_log.info("Created process %d from %s" % (p.pid, str(cmd)))
+ LOGGER.info("Created process %d from %s" % (p.pid, str(cmd)))
def run(self):
"""
run up to N jobs until we run out of jobs
"""
- queue_log = logging.getLogger('queue')
- queue_log.debug('using %s as cwd' % (self.cwd,))
+ LOGGER.debug('using %s as cwd' % (self.cwd,))
# to_run slowly gets consumed by start_jobs
while len(self.to_run) > 0 or len(self.running) > 0:
pending = self.running[pending_fd]
# if it really did finish, remove it from running jobs
if pending.poll() is not None:
- queue_log.info("Process %d finished [%d]",
- pending.pid, pending.returncode)
+ LOGGER.info("Process %d finished [%d]",
+ pending.pid, pending.returncode)
del self.running[pending_fd]
else:
# It's still running, but there's some output
buffer = pending_fd.readline()
buffer = buffer.strip()
msg = "%d:(%d) %s" %(pending.pid, len(buffer), buffer)
- logging.debug(msg)
+ LOGGER.debug(msg)
time.sleep(1)
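
A minimal usage sketch of the queue (commands hypothetical; they are run
through a shell):

    from htsworkflow.util.queuecommands import QueueCommands

    cmds = ['sleep 2', 'sleep 1', 'sleep 2', 'sleep 1']
    q = QueueCommands(cmds, N=2)  # at most two subprocesses at once
    q.run()                       # blocks until every command has finished
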
import time
import unittest
-
from htsworkflow.util.queuecommands import QueueCommands
class testQueueCommands(unittest.TestCase):
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(name)-8s %(message)s')
-
+
def test_unlimited_run_slow(self):
"""
end = time.time()-start
# pity I had to add a 1 second sleep
self.failUnless( end > 5.9 and end < 6.1,
- "took %s seconds, expected ~6" % (end,))
+ "took %s seconds, expected ~6" % (end,))
def suite():
return unittest.makeSuite(testQueueCommands, 'test')
import logging
+LOGGER = logging.getLogger(__name__)
+
def version():
- """Return version number
+ """Return version number
"""
version = None
try:
import pkg_resources
except ImportError, e:
- logging.error("Can't find version number, please install setuptools")
+ LOGGER.error("Can't find version number, please install setuptools")
raise e
try:
version = pkg_resources.get_distribution("htsworkflow")
except pkg_resources.DistributionNotFound, e:
- logging.error("Package not installed")
+ LOGGER.error("Package not installed")
return version
-
+
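
version() hands back the pkg_resources Distribution object rather than a bare
string; a usage sketch (the printed number is illustrative):

    from htsworkflow.version import version

    dist = version()
    if dist is not None:
        print dist.version  # e.g. '0.5.1'
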
from htsworkflow.util.makebed import make_bed_from_eland_stream, make_description
+LOGGER = logging.getLogger(__name__)
+
def make_bed_for_gerald(eland_dir, output_dir, prefix, database, flowcell):
"""
convert s_[1-8]_eland_result.txt to corresponding bed files
if len(out_files) > 0:
raise RuntimeError("please move old bedfiles")
- logging.info('Processing %s using flowcell id %s' % (eland_dir, flowcell))
+ LOGGER.info('Processing %s using flowcell id %s' % (eland_dir, flowcell))
for pathname in eland_files:
path, name = os.path.split(pathname)
lane = int(name[2])
outname = 's_%d_eland_result.bed' %(lane,)
- logging.info('Converting lane %d to %s' % (lane, outname))
+ LOGGER.info('Converting lane %d to %s' % (lane, outname))
outpathname = os.path.join(eland_dir, outname)
# look up descriptions
return parser
def main(command_line=None):
    logging.basicConfig(level=logging.WARNING)
if command_line is None:
command_line = sys.argv[1:]
* start/stop cycle numbers
* Firecrest, bustard, gerald version numbers
* Eland analysis types, and everything in the eland configuration file.
- * cluster numbers and other values from the Summary.htm
- LaneSpecificParameters table.
+ * cluster numbers and other values from the Summary.htm
+ LaneSpecificParameters table.
* How many reads mapped to a genome from an eland file
The ELAND "mapped reads" counter will also check for eland squashed file
-that were symlinked from another directory. This is so I can track how
-many reads landed on the genome of interest and on the spike ins.
+that were symlinked from another directory. This is so I can track how
+many reads landed on the genome of interest and on the spike ins.
Basically my subdirectories something like:
genomes/hg18
genomes/hg18/chr*.2bpb <- files for hg18 genome
-genomes/hg18/chr*.vld
+genomes/hg18/chr*.vld
genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
-genomes/spikein
+genomes/spikein
-runfolder.py can also spit out a simple summary report (-s option)
-that contains the per lane post filter cluster numbers and the mapped
+runfolder.py can also spit out a simple summary report (-s option)
+that contains the per lane post filter cluster numbers and the mapped
read counts. (The report isn't currently very pretty)
"""
from glob import glob
from htsworkflow.pipelines import runfolder
from htsworkflow.pipelines.runfolder import ElementTree
+LOGGER = logging.getLogger(__name__)
+
def make_parser():
usage = 'usage: %prog [options] runfolder_root_dir'
parser = optparse.OptionParser(usage)
logging.basicConfig()
if opt.verbose:
        root_log = logging.getLogger()
        root_log.setLevel(logging.INFO)
- logging.info('Starting htsworkflow illumina runfolder processing tool.')
+ LOGGER.info('Starting htsworkflow illumina runfolder processing tool.')
runs = []
if opt.run_xml:
# handle ~ shortcut
if specific_run is not None:
runs.append(specific_run)
else:
- logging.warn("Couldn't find a run in %s" % (opt.use_run,))
+ LOGGER.warn("Couldn't find a run in %s" % (opt.use_run,))
# scan runfolders for runs
for run_pattern in args:
from htsworkflow.util import api
from htsworkflow.pipelines.sequences import scan_for_sequences
+LOGGER = logging.getLogger(__name__)
+
def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
"""
compare our flowcell database with our list of sequences and return
if flowcell_info is not None:
seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
libdb.setdefault(seq_library_id, []).append(seq)
-
+
fcdb.sync()
return fcdb, libdb
If we didn't update anything return 0, if we did update
return 1.
"""
- logging.debug("CHECKING: %s -> %s", source, destination)
+ LOGGER.debug("CHECKING: %s -> %s", source, destination)
if not os.path.exists(source):
- logging.warning("%s doesn't exist", source)
+ LOGGER.warning("%s doesn't exist", source)
return 0
if os.path.exists(destination):
if os.path.samefile(source, destination):
- logging.debug('SAME: %s -> %s' % (source, destination))
+ LOGGER.debug('SAME: %s -> %s' % (source, destination))
return 0
else:
- logging.error('%s and %s are different files, skipping' % \
- (source, destination))
+ LOGGER.error('%s and %s are different files, skipping' % \
+ (source, destination))
return 0
- logging.debug('Linking: %s -> %s' % (source, destination))
+ LOGGER.debug('Linking: %s -> %s' % (source, destination))
-    # we would do something by this part
+    # we would have done something by this point
if dry_run: return 1
os.chmod(destination,
stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
return 1
-
+
def make_library_links(root, library_db, dry_run=False):
"""
Make a tree of sequencer roots organized by library id
for lib_id, sequences in library_db.items():
target_dir = os.path.join(root, lib_id)
if not os.path.exists(target_dir):
- logging.info("mkdir %s" % (target_dir,))
+ LOGGER.info("mkdir %s" % (target_dir,))
if not dry_run:
os.mkdir(target_dir)
-
+
for s in sequences:
count += carefully_make_hardlink(s.path,
s.make_target_name(target_dir),
if opts.debug:
level = logging.DEBUG
logging.basicConfig(level=level)
-
+
def configure_opts(opts):
"""
if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)
-
+
return opts
def make_parser():
'sequence archive section')
parser.add_option('--cache', default=None,
help="default flowcell cache")
-
+
parser.add_option('--host', default=None,
help="specify http://host for quering flowcell information")
parser.add_option('--apiid', default=None,
help="API ID to use when retriving information")
parser.add_option("--apikey", default=None,
help="API Key for when retriving information")
-
+
parser.add_option('-a', '--sequence-archive', default=None,
help='path to where the sequence archive lives')
help='be more verbose')
parser.add_option('-d', '--debug', action='store_true', default=False,
help='report everything')
-
+
parser.add_option("--dry-run", dest="dry_run", action="store_true",
default=False,
help="Don't modify the filesystem")
configure_logging(opts)
opts = configure_opts(opts)
-
+
# complain if critical things are missing
if opts.cache is None:
parser.error('Need location of htsworkflow frontend database')
seq_dirs = [ opts.flowcells, opts.srfs ]
if len(args) > 0:
seq_dirs = [os.path.abspath(f) for f in args]
-
+
seqs = scan_for_sequences(seq_dirs)
fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)
-
- logging.warn("%s flowcells in database" % (len(fcdb),))
- logging.warn("found %s sequence files" % (len(seqs),))
- logging.warn("%s libraries being checked" % (len(libdb),))
- logging.warn("%s sequence files were linked" % (updates,))
-
+
+ LOGGER.warn("%s flowcells in database" % (len(fcdb),))
+ LOGGER.warn("found %s sequence files" % (len(seqs),))
+ LOGGER.warn("%s libraries being checked" % (len(libdb),))
+ LOGGER.warn("%s sequence files were linked" % (updates,))
+
return 0
-
+
if __name__ == "__main__":
main()
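
The carefully_make_hardlink logic above reduces to a guarded link plus a
read-only chmod; a condensed sketch of the idiom:

    import os
    import stat

    def hardlink_readonly(source, destination):
        # skip when the destination already exists (same inode or not)
        if os.path.exists(destination):
            return os.path.samefile(source, destination)
        os.link(source, destination)
        # archived sequences should not be writable through the link
        os.chmod(destination, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        return True
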
from htsworkflow.pipelines.eland import extract_eland_sequence
from htsworkflow.pipelines import runfolder
+LOGGER = logging.getLogger(__name__)
+
def make_query_filename(eland_obj, output_dir):
- query_name = '%s_%s_eland_query.txt'
+ query_name = '%s_%s_eland_query.txt'
query_name %= (eland_obj.sample_name, eland_obj.lane_id)
query_pathname = os.path.join(output_dir, query_name)
-
+
if os.path.exists(query_pathname):
- logging.warn("overwriting %s" % (query_pathname,))
+ LOGGER.warn("overwriting %s" % (query_pathname,))
return query_pathname
def make_result_filename(eland_obj, output_dir):
- result_name = '%s_%s_eland_result.txt'
+ result_name = '%s_%s_eland_result.txt'
result_name %= (eland_obj.sample_name, eland_obj.lane_id)
result_pathname = os.path.join(output_dir, result_name)
-
+
if os.path.exists(result_pathname):
- logging.warn("overwriting %s" % (result_pathname,))
+ LOGGER.warn("overwriting %s" % (result_pathname,))
return result_pathname
def extract_sequence(inpathname, query_pathname, length, dry_run=False):
- logging.info('extracting %d bases' %(length,))
- logging.info('extracting from %s' %(inpathname,))
- logging.info('extracting to %s' %(query_pathname,))
-
- if not dry_run:
+ LOGGER.info('extracting %d bases' %(length,))
+ LOGGER.info('extracting from %s' %(inpathname,))
+ LOGGER.info('extracting to %s' %(query_pathname,))
+
+ if not dry_run:
try:
instream = open(inpathname, 'r')
outstream = open(query_pathname, 'w')
finally:
outstream.close()
instream.close()
-
+
def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False):
cmdline = ['eland_%d' % (length,), query_name, genome, result_name]
if multi:
cmdline += ['--multi']
- logging.info('running eland: ' + " ".join(cmdline))
+ LOGGER.info('running eland: ' + " ".join(cmdline))
if not dry_run:
return subprocess.Popen(cmdline)
else:
"""
look for eland files in gerald_dir and write a subset to output_dir
"""
- logging.info("Extracting %d bp from files in %s" % (length, gerald_dir))
+ LOGGER.info("Extracting %d bp from files in %s" % (length, gerald_dir))
g = gerald.gerald(gerald_dir)
# this will only work if we're only missing the last dir in output_dir
if not os.path.exists(output_dir):
- logging.info("Making %s" %(output_dir,))
+ LOGGER.info("Making %s" %(output_dir,))
if not dry_run: os.mkdir(output_dir)
processes = []
extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
- p = run_eland(length,
- query_pathname,
- lane_param.eland_genome,
- result_pathname,
+ p = run_eland(length,
+ query_pathname,
+ lane_param.eland_genome,
+ result_pathname,
dry_run=dry_run)
if p is not None:
processes.append(p)
for p in processes:
p.wait()
-
+
def make_parser():
usage = '%prog: [options] runfolder'
parser = OptionParser(usage)
-
- parser.add_option('--gerald',
+
+ parser.add_option('--gerald',
help='specify location of GERALD directory',
default=None)
parser.add_option('-o', '--output',
opts.gerald = runs[0].gerald.pathname
if opts.output is None:
opts.output = os.path.join(
- runs[0].pathname,
- 'Data',
+ runs[0].pathname,
+ 'Data',
# pythons 0..n ==> elands 1..n+1
- 'C1-%d' % (opts.length+1,)
+ 'C1-%d' % (opts.length+1,)
)
elif opts.gerald is None:
parser.error("need gerald directory")
-
+
if opts.output is None:
parser.error("specify location for the new eland files")
if opts.verbose:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)
rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)