Gather information about our submissions into a single RDF store
"""
-from BeautifulSoup import BeautifulSoup
+from lxml.html import fromstring
from datetime import datetime
import httplib2
from operator import attrgetter
if cookie is None:
cookie = login()
- soup = get_url_as_soup(USER_URL, 'GET', cookie)
- projects = soup.find('table', attrs={'id': 'projects'})
- table_row = projects.findNext('tr')
+ tree = get_url_as_tree(USER_URL, 'GET', cookie)
+ table_rows = tree.xpath('//table[@id="projects"]/tr')
# first record is header
- table_row = table_row.findNext()
name_n = submissionOntology['name']
species_n = submissionOntology['species']
library_urn = submissionOntology['library_urn']
- while table_row is not None:
- cell = table_row.findAll('td')
+ # skip header
+ for row in table_rows[1:]:
+ cell = row.xpath('td')
if cell is not None and len(cell) > 1:
- submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
+ submission_id = str(cell[0].text_content())
if limit is None or submission_id in limit:
subUrn = RDF.Uri(submission_view_url(submission_id))
add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
- name = get_contents(cell[4])
+ name = str(cell[4].text_content())
add_stmt(model, subUrn, name_n, name)
- species = get_contents(cell[2])
+ species = str(cell[2].text_content())
if species is not None:
add_stmt(model, subUrn, species_n, species)
add_submission_creation_date(model, subUrn, cookie)
# grab changing atttributes
- status = get_contents(cell[6]).strip()
+ status = str(cell[6].text_content()).strip()
last_mod_datetime = get_date_contents(cell[8])
last_mod = last_mod_datetime.isoformat()
logging.info("Processed {0}".format(subUrn))
- table_row = table_row.findNext('tr')
+
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
name = row['name']
print "# {0}".format(name)
print "<{0}>".format(subid.uri)
- print " encodeSubmit:library_urn"\
+ print " encodeSubmit:library_urn "\
"<http://jumpgate.caltech.edu/library/> ."
print ""
creation_dates = list(model.find_statements(query))
if len(creation_dates) == 0:
LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
- soup = get_url_as_soup(str(subUrn), 'GET', cookie)
- created_label = soup.find(text="Created: ")
- if created_label:
- created_date = get_date_contents(created_label.next)
+ tree = get_url_as_tree(str(subUrn), 'GET', cookie)
+ cells = tree.xpath('//div[@id="content"]/table/tr/td')
+ created_label = [x for x in cells
+ if x.text_content().startswith('Created')]
+ if len(created_label) == 1:
+ created_date = get_date_contents(created_label[0].getnext())
created_date_node = RDF.Node(literal=created_date.isoformat(),
datatype=dateTimeType.uri)
add_stmt(model, subUrn, creationDateN, created_date_node)
status_uri = urlparse.urljoin(submission_uri, timestamp)
return RDF.Node(RDF.Uri(status_uri))
+
def get_date_contents(element):
- data = get_contents(element)
+ data = element.text_content()
if data:
return datetime.strptime(data, "%Y-%m-%d %H:%M")
else:
return cookie
-def get_url_as_soup(url, method, cookie=None):
+def get_url_as_tree(url, method, cookie=None):
http = httplib2.Http()
headers = {}
if cookie is not None:
headers['Cookie'] = cookie
response, content = http.request(url, method, headers=headers)
if response['status'] == '200':
- soup = BeautifulSoup(content,
- fromEncoding="utf-8", # should read from header
- convertEntities=BeautifulSoup.HTML_ENTITIES)
- return soup
+ tree = fromstring(content, base_url=url)
+ return tree
else:
msg = "error accessing {0}, status {1}"
msg = msg.format(url, response['status'])
import re
-from BeautifulSoup import BeautifulSoup
+from lxml.html import fromstring
try:
import json
except ImportError, e:
from htsworkflow.pipelines.test.simulate_runfolder import TESTDATA_DIR
-LANE_SET = range(1, 9)
+LANE_SET = range(1,9)
+NSMAP = {'libns':'http://jumpgate.caltech.edu/wiki/LibraryOntology#'}
class ExperimentsTestCases(TestCase):
- fixtures = ['test_flowcells.json']
+ fixtures = ['test_flowcells.json',
+ ]
def setUp(self):
self.tempdir = tempfile.mkdtemp(prefix='htsw-test-experiments-')
runxml = 'run_FC12150_2007-09-27.xml'
shutil.copy(os.path.join(TESTDATA_DIR, runxml),
os.path.join(self.fc1_dir, runxml))
- for i in range(1, 9):
+ for i in range(1,9):
shutil.copy(
os.path.join(TESTDATA_DIR,
'woldlab_070829_USI-EAS44_0017_FC11055_1.srf'),
os.path.join(self.fc1_dir,
- 'woldlab_070829_SERIAL_FC12150_%d.srf' % (i, ))
+ 'woldlab_070829_SERIAL_FC12150_%d.srf' %(i,))
)
self.fc2_dir = os.path.join(self.tempdir, '42JTNAAXX')
fc_django = models.FlowCell.objects.get(flowcell_id=fc_id)
self.failUnlessEqual(fc_dict['flowcell_id'], fc_id)
self.failUnlessEqual(fc_django.flowcell_id, fc_id)
- self.failUnlessEqual(fc_dict['sequencer'],
- fc_django.sequencer.name)
- self.failUnlessEqual(fc_dict['read_length'],
- fc_django.read_length)
+ self.failUnlessEqual(fc_dict['sequencer'], fc_django.sequencer.name)
+ self.failUnlessEqual(fc_dict['read_length'], fc_django.read_length)
self.failUnlessEqual(fc_dict['notes'], fc_django.notes)
- self.failUnlessEqual(fc_dict['cluster_station'],
- fc_django.cluster_station.name)
+ self.failUnlessEqual(fc_dict['cluster_station'], fc_django.cluster_station.name)
for lane in fc_django.lane_set.all():
lane_contents = fc_dict['lane_set'][lane.lane_number]
lane_dict = multi_lane_to_dict(lane_contents)[lane.library_id]
- self.failUnlessEqual(lane_dict['cluster_estimate'],
- lane.cluster_estimate)
+ self.failUnlessEqual(lane_dict['cluster_estimate'], lane.cluster_estimate)
self.failUnlessEqual(lane_dict['comment'], lane.comment)
- self.failUnlessEqual(lane_dict['flowcell'],
- lane.flowcell.flowcell_id)
- self.failUnlessEqual(lane_dict['lane_number'],
- lane.lane_number)
- self.failUnlessEqual(lane_dict['library_name'],
- lane.library.library_name)
+ self.failUnlessEqual(lane_dict['flowcell'], lane.flowcell.flowcell_id)
+ self.failUnlessEqual(lane_dict['lane_number'], lane.lane_number)
+ self.failUnlessEqual(lane_dict['library_name'], lane.library.library_name)
self.failUnlessEqual(lane_dict['library_id'], lane.library.id)
- self.failUnlessAlmostEqual(float(lane_dict['pM']),
- float(lane.pM))
+ self.failUnlessAlmostEqual(float(lane_dict['pM']), float(lane.pM))
self.failUnlessEqual(lane_dict['library_species'],
- lane.library.library_species.scientific_name)
+ lane.library.library_species.scientific_name)
- flowcell_url = '/experiments/config/%s/json'
- response = self.client.get(flowcell_url % (fc_id,), apidata)
+ response = self.client.get('/experiments/config/%s/json' % (fc_id,), apidata)
# strptime isoformat string = '%Y-%m-%dT%H:%M:%S'
fc_json = json.loads(response.content)
self.failUnlessEqual(fc_json['flowcell_id'], fc_id)
- self.failUnlessEqual(fc_json['sequencer'],
- fc_django.sequencer.name)
+ self.failUnlessEqual(fc_json['sequencer'], fc_django.sequencer.name)
self.failUnlessEqual(fc_json['read_length'], fc_django.read_length)
self.failUnlessEqual(fc_json['notes'], fc_django.notes)
- self.failUnlessEqual(fc_json['cluster_station'],
- fc_django.cluster_station.name)
+ self.failUnlessEqual(fc_json['cluster_station'], fc_django.cluster_station.name)
+
for lane in fc_django.lane_set.all():
lane_contents = fc_json['lane_set'][unicode(lane.lane_number)]
lane_dict = multi_lane_to_dict(lane_contents)[lane.library_id]
- self.failUnlessEqual(lane_dict['cluster_estimate'],
- lane.cluster_estimate)
+ self.failUnlessEqual(lane_dict['cluster_estimate'], lane.cluster_estimate)
self.failUnlessEqual(lane_dict['comment'], lane.comment)
- self.failUnlessEqual(lane_dict['flowcell'],
- lane.flowcell.flowcell_id)
- self.failUnlessEqual(lane_dict['lane_number'],
- lane.lane_number)
- self.failUnlessEqual(lane_dict['library_name'],
- lane.library.library_name)
+ self.failUnlessEqual(lane_dict['flowcell'], lane.flowcell.flowcell_id)
+ self.failUnlessEqual(lane_dict['lane_number'], lane.lane_number)
+ self.failUnlessEqual(lane_dict['library_name'], lane.library.library_name)
self.failUnlessEqual(lane_dict['library_id'], lane.library.id)
- self.failUnlessAlmostEqual(float(lane_dict['pM']),
- float(lane.pM))
+ self.failUnlessAlmostEqual(float(lane_dict['pM']), float(lane.pM))
self.failUnlessEqual(lane_dict['library_species'],
- lane.library.library_species.scientific_name)
+ lane.library.library_species.scientific_name)
def test_invalid_flowcell(self):
"""
Make sure we get a 404 if we request an invalid flowcell ID
"""
- flowcell_url = '/experiments/config/nottheone/json'
- response = self.client.get(flowcell_url, apidata)
+ response = self.client.get('/experiments/config/nottheone/json', apidata)
self.failUnlessEqual(response.status_code, 404)
def test_no_key(self):
def test_library_id(self):
"""
- Library IDs should be flexible, retrive a non-numeric ID
+    Library IDs should be flexible, so make sure we can retrieve a non-numeric ID
"""
- flowcell_url = '/experiments/config/FC12150/json'
- response = self.client.get(flowcell_url, apidata)
+ response = self.client.get('/experiments/config/FC12150/json', apidata)
self.failUnlessEqual(response.status_code, 200)
flowcell = json.loads(response.content)
Library's have IDs, libraries also have primary keys,
we eventually had enough libraries that the drop down combo box was too
- hard to filter through, unfortnately we want a field that uses our
- library id and not the internal primary key, and raw_id_field uses
- primary keys.
+    hard to filter through, unfortunately we want a field that uses our library
+ id and not the internal primary key, and raw_id_field uses primary keys.
- This tests to make sure that the value entered in the raw library id
- field matches the library id looked up.
+ This tests to make sure that the value entered in the raw library id field matches
+ the library id looked up.
"""
- expected_ids = [u'10981', u'11016', u'SL039', u'11060',
- u'11061', u'11062', u'11063', u'11064']
+ expected_ids = [u'10981',u'11016',u'SL039',u'11060',
+ u'11061',u'11062',u'11063',u'11064']
self.client.login(username='supertest', password='BJOKL5kAj6aFZ6A5')
response = self.client.get('/admin/experiments/flowcell/153/')
- soup = BeautifulSoup(response.content)
- for i in range(0, 8):
- input_field = soup.find(id='id_lane_set-%d-library' % (i,))
- library_field = input_field.findNext('strong')
+ tree = fromstring(response.content)
+ for i in range(0,8):
+ xpath_expression = '//input[@id="id_lane_set-%d-library"]'
+ input_field = tree.xpath(xpath_expression % (i,))[0]
+ library_field = input_field.find('../strong')
library_id, library_name = library_field.text.split(':')
# strip leading '#' sign from name
library_id = library_id[1:]
self.failUnlessEqual(library_id, expected_ids[i])
- self.failUnlessEqual(input_field['value'], library_id)
+ self.failUnlessEqual(input_field.attrib['value'], library_id)
def test_library_to_flowcell_link(self):
"""
"""
self.client.login(username='supertest', password='BJOKL5kAj6aFZ6A5')
response = self.client.get('/library/11070/')
- soup = BeautifulSoup(response.content)
- failed_fc_span = soup.find(text='30012AAXX (failed)')
- failed_fc_a = failed_fc_span.findPrevious('a')
+ tree = fromstring(response.content)
+ flowcell_spans = tree.xpath('//span[@property="libns:flowcell_id"]',
+ namespaces=NSMAP)
+ self.assertEqual(flowcell_spans[0].text, '30012AAXX (failed)')
+ failed_fc_span = flowcell_spans[0]
+ failed_fc_a = failed_fc_span.getparent()
# make sure some of our RDF made it.
self.failUnlessEqual(failed_fc_a.get('rel'), 'libns:flowcell')
self.failUnlessEqual(failed_fc_a.get('href'), '/flowcell/30012AAXX/')
self.assertEqual(lane_dict['11045']['index_sequence'],
{u'1': u'ATCACG'})
+
+
def test_lanes_for(self):
"""
Check the code that packs the django objects into simple types.
lanes = experiments.lanes_for(user)
self.failUnlessEqual(len(lanes), 5)
- flowcell_url = '/experiments/lanes_for/%s/json'
- response = self.client.get(flowcell_url % (user,), apidata)
+ response = self.client.get('/experiments/lanes_for/%s/json' % (user,), apidata)
lanes_json = json.loads(response.content)
self.failUnlessEqual(len(lanes), len(lanes_json))
for i in range(len(lanes)):
self.failUnlessEqual(lanes[i]['comment'], lanes_json[i]['comment'])
- self.failUnlessEqual(lanes[i]['lane_number'],
- lanes_json[i]['lane_number'])
- self.failUnlessEqual(lanes[i]['flowcell'],
- lanes_json[i]['flowcell'])
- self.failUnlessEqual(lanes[i]['run_date'],
- lanes_json[i]['run_date'])
+ self.failUnlessEqual(lanes[i]['lane_number'], lanes_json[i]['lane_number'])
+ self.failUnlessEqual(lanes[i]['flowcell'], lanes_json[i]['flowcell'])
+ self.failUnlessEqual(lanes[i]['run_date'], lanes_json[i]['run_date'])
def test_lanes_for_no_lanes(self):
"""
- What happens to user who haven't submitted anything
+ Do we get something meaningful back when the user isn't attached to anything?
"""
user = 'supertest'
lanes = experiments.lanes_for(user)
self.failUnlessEqual(len(lanes), 0)
- url = '/experiments/lanes_for/%s/json'
- response = self.client.get(url % (user,), apidata)
+ response = self.client.get('/experiments/lanes_for/%s/json' % (user,), apidata)
lanes_json = json.loads(response.content)
def test_lanes_for_no_user(self):
user = 'not a real user'
self.failUnlessRaises(ObjectDoesNotExist, experiments.lanes_for, user)
- url = '/experiments/lanes_for/%s/json'
- response = self.client.get(url % (user,), apidata)
+ response = self.client.get('/experiments/lanes_for/%s/json' % (user,), apidata)
self.failUnlessEqual(response.status_code, 404)
+
def test_raw_data_dir(self):
"""Raw data path generator check"""
flowcell_id = self.fc1_id
fc.flowcell_id = flowcell_id + " (failed)"
self.failUnlessEqual(fc.get_raw_data_directory(), raw_dir)
+
def test_data_run_import(self):
srf_file_type = models.FileType.objects.get(name='SRF')
runxml_file_type = models.FileType.objects.get(name='run_xml')
lane_files = run.lane_files()
self.failUnlessEqual(lane_files[4]['srf'], srf4)
- runxml = result_dict['FC12150/C1-37/run_FC12150_2007-09-27.xml']
+ runxml= result_dict['FC12150/C1-37/run_FC12150_2007-09-27.xml']
self.failUnlessEqual(runxml.file_type, runxml_file_type)
self.failUnlessEqual(runxml.library_id, None)
+
def test_read_result_file(self):
"""make sure we can return a result file
"""
result_files = flowcell.datarun_set.all()[0].datafile_set.all()
for f in result_files:
- url = '/experiments/file/%s' % (f.random_key, )
+ url = '/experiments/file/%s' % ( f.random_key,)
response = self.client.get(url)
self.failUnlessEqual(response.status_code, 200)
mimetype = f.file_type.mimetype
self.failUnlessEqual(mimetype, response['content-type'])
-
class TestFileType(TestCase):
def test_file_type_unicode(self):
file_type_objects = models.FileType.objects
self.failUnlessEqual(u"<FileType: QSEQ tarfile>",
unicode(file_type_object))
-
class TestFileType(TestCase):
def test_find_file_type(self):
file_type_objects = models.FileType.objects
'QSEQ tarfile', 7, 1),
('woldlab_091005_HWUSI-EAS627_0010_42JT2AAXX_1.srf',
'SRF', 1, None),
- ('s_1_eland_extended.txt.bz2', 'ELAND Extended', 1, None),
+ ('s_1_eland_extended.txt.bz2','ELAND Extended', 1, None),
('s_7_eland_multi.txt.bz2', 'ELAND Multi', 7, None),
- ('s_3_eland_result.txt.bz2', 'ELAND Result', 3, None),
+ ('s_3_eland_result.txt.bz2','ELAND Result', 3, None),
('s_1_export.txt.bz2','ELAND Export', 1, None),
('s_1_percent_call.png', 'IVC Percent Call', 1, None),
('s_2_percent_base.png', 'IVC Percent Base', 2, None),
'QSEQ tarfile', 7, 1),
('foo/woldlab_091005_HWUSI-EAS627_0010_42JT2AAXX_1.srf',
'SRF', 1, None),
- ('../s_1_eland_extended.txt.bz2', 'ELAND Extended', 1, None),
+ ('../s_1_eland_extended.txt.bz2','ELAND Extended', 1, None),
('/bleem/s_7_eland_multi.txt.bz2', 'ELAND Multi', 7, None),
- ('/qwer/s_3_eland_result.txt.bz2', 'ELAND Result', 3, None),
- ('/ty///1/s_1_export.txt.bz2', 'ELAND Export', 1, None),
+ ('/qwer/s_3_eland_result.txt.bz2','ELAND Result', 3, None),
+ ('/ty///1/s_1_export.txt.bz2','ELAND Export', 1, None),
('/help/s_1_percent_call.png', 'IVC Percent Call', 1, None),
('/bored/s_2_percent_base.png', 'IVC Percent Base', 2, None),
('/example1/s_3_percent_all.png', 'IVC Percent All', 3, None),
result = models.find_file_type_metadata_from_filename(filename)
self.failUnlessEqual(result['file_type'],
file_type_objects.get(name=typename))
- self.failUnlessEqual(result.get('lane', None), lane)
+ self.failUnlessEqual(result.get('lane',None), lane)
self.failUnlessEqual(result.get('end', None), end)
-
class TestEmailNotify(TestCase):
fixtures = ['test_flowcells.json']
self.failUnless('pk1@example.com' in response.content)
self.failUnless('Lane #8 : (11064) Paired ends 104' in response.content)
- response = self.client.get('/experiments/started/153/',
- {'send': '1','bcc': 'on'})
+ response = self.client.get('/experiments/started/153/', {'send':'1','bcc':'on'})
self.failUnlessEqual(response.status_code, 200)
self.failUnlessEqual(len(mail.outbox), 4)
for m in mail.outbox:
self.failUnlessEqual(response.status_code, 200)
self.failUnless(re.search('Flowcell FC12150', response.content))
# require that navigation back to the admin page exists
- flowcell_a_re = '<a href="/admin/experiments/flowcell/153/">[^<]+</a>'
- self.failUnless(re.search(flowcell_a_re, response.content))
-
+ self.failUnless(re.search('<a href="/admin/experiments/flowcell/153/">[^<]+</a>', response.content))
def multi_lane_to_dict(lane):
"""Convert a list of lane entries into a dictionary indexed by library ID
"""
- return dict(((x['library_id'], x) for x in lane))
+ return dict( ((x['library_id'],x) for x in lane) )