From 3fe87b726172f5e118cf3e07f987a77ccd03358a Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Wed, 21 Sep 2011 12:07:25 -0700 Subject: [PATCH] Switch from beautifulsoup to lxml, now that I know how to build it on os x. --- extra/ucsc_encode_submission/encode_find.py | 46 +++--- htsworkflow/frontend/experiments/tests.py | 159 +++++++++----------- 2 files changed, 93 insertions(+), 112 deletions(-) diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py index f116281..a56bf1d 100644 --- a/extra/ucsc_encode_submission/encode_find.py +++ b/extra/ucsc_encode_submission/encode_find.py @@ -3,7 +3,7 @@ Gather information about our submissions into a single RDF store """ -from BeautifulSoup import BeautifulSoup +from lxml.html import fromstring from datetime import datetime import httplib2 from operator import attrgetter @@ -155,28 +155,27 @@ def load_my_submissions(model, limit=None, cookie=None): if cookie is None: cookie = login() - soup = get_url_as_soup(USER_URL, 'GET', cookie) - projects = soup.find('table', attrs={'id': 'projects'}) - table_row = projects.findNext('tr') + tree = get_url_as_tree(USER_URL, 'GET', cookie) + table_rows = tree.xpath('//table[@id="projects"]/tr') # first record is header - table_row = table_row.findNext() name_n = submissionOntology['name'] species_n = submissionOntology['species'] library_urn = submissionOntology['library_urn'] - while table_row is not None: - cell = table_row.findAll('td') + # skip header + for row in table_rows[1:]: + cell = row.xpath('td') if cell is not None and len(cell) > 1: - submission_id = cell[0].contents[0].contents[0].encode(CHARSET) + submission_id = str(cell[0].text_content()) if limit is None or submission_id in limit: subUrn = RDF.Uri(submission_view_url(submission_id)) add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission']) - name = get_contents(cell[4]) + name = str(cell[4].text_content()) add_stmt(model, subUrn, name_n, name) - species = get_contents(cell[2]) + species = str(cell[2].text_content()) if species is not None: add_stmt(model, subUrn, species_n, species) @@ -190,7 +189,7 @@ def load_my_submissions(model, limit=None, cookie=None): add_submission_creation_date(model, subUrn, cookie) # grab changing atttributes - status = get_contents(cell[6]).strip() + status = str(cell[6].text_content()).strip() last_mod_datetime = get_date_contents(cell[8]) last_mod = last_mod_datetime.isoformat() @@ -199,7 +198,7 @@ def load_my_submissions(model, limit=None, cookie=None): logging.info("Processed {0}".format(subUrn)) - table_row = table_row.findNext('tr') + def add_submission_to_library_urn(model, submissionUrn, predicate, library_id): @@ -233,7 +232,7 @@ WHERE {{ name = row['name'] print "# {0}".format(name) print "<{0}>".format(subid.uri) - print " encodeSubmit:library_urn"\ + print " encodeSubmit:library_urn "\ " ." print "" @@ -246,10 +245,12 @@ def add_submission_creation_date(model, subUrn, cookie): creation_dates = list(model.find_statements(query)) if len(creation_dates) == 0: LOGGER.info("Getting creation date for: {0}".format(str(subUrn))) - soup = get_url_as_soup(str(subUrn), 'GET', cookie) - created_label = soup.find(text="Created: ") - if created_label: - created_date = get_date_contents(created_label.next) + tree = get_url_as_tree(str(subUrn), 'GET', cookie) + cells = tree.xpath('//div[@id="content"]/table/tr/td') + created_label = [x for x in cells + if x.text_content().startswith('Created')] + if len(created_label) == 1: + created_date = get_date_contents(created_label[0].getnext()) created_date_node = RDF.Node(literal=created_date.isoformat(), datatype=dateTimeType.uri) add_stmt(model, subUrn, creationDateN, created_date_node) @@ -417,8 +418,9 @@ def create_status_node(submission_uri, timestamp): status_uri = urlparse.urljoin(submission_uri, timestamp) return RDF.Node(RDF.Uri(status_uri)) + def get_date_contents(element): - data = get_contents(element) + data = element.text_content() if data: return datetime.strptime(data, "%Y-%m-%d %H:%M") else: @@ -457,17 +459,15 @@ def login(cookie=None): return cookie -def get_url_as_soup(url, method, cookie=None): +def get_url_as_tree(url, method, cookie=None): http = httplib2.Http() headers = {} if cookie is not None: headers['Cookie'] = cookie response, content = http.request(url, method, headers=headers) if response['status'] == '200': - soup = BeautifulSoup(content, - fromEncoding="utf-8", # should read from header - convertEntities=BeautifulSoup.HTML_ENTITIES) - return soup + tree = fromstring(content, base_url=url) + return tree else: msg = "error accessing {0}, status {1}" msg = msg.format(url, response['status']) diff --git a/htsworkflow/frontend/experiments/tests.py b/htsworkflow/frontend/experiments/tests.py index ef299e3..8ec46e0 100644 --- a/htsworkflow/frontend/experiments/tests.py +++ b/htsworkflow/frontend/experiments/tests.py @@ -1,5 +1,5 @@ import re -from BeautifulSoup import BeautifulSoup +from lxml.html import fromstring try: import json except ImportError, e: @@ -19,11 +19,13 @@ from htsworkflow.frontend.auth import apidata from htsworkflow.pipelines.test.simulate_runfolder import TESTDATA_DIR -LANE_SET = range(1, 9) +LANE_SET = range(1,9) +NSMAP = {'libns':'http://jumpgate.caltech.edu/wiki/LibraryOntology#'} class ExperimentsTestCases(TestCase): - fixtures = ['test_flowcells.json'] + fixtures = ['test_flowcells.json', + ] def setUp(self): self.tempdir = tempfile.mkdtemp(prefix='htsw-test-experiments-') @@ -37,12 +39,12 @@ class ExperimentsTestCases(TestCase): runxml = 'run_FC12150_2007-09-27.xml' shutil.copy(os.path.join(TESTDATA_DIR, runxml), os.path.join(self.fc1_dir, runxml)) - for i in range(1, 9): + for i in range(1,9): shutil.copy( os.path.join(TESTDATA_DIR, 'woldlab_070829_USI-EAS44_0017_FC11055_1.srf'), os.path.join(self.fc1_dir, - 'woldlab_070829_SERIAL_FC12150_%d.srf' % (i, )) + 'woldlab_070829_SERIAL_FC12150_%d.srf' %(i,)) ) self.fc2_dir = os.path.join(self.tempdir, '42JTNAAXX') @@ -63,69 +65,53 @@ class ExperimentsTestCases(TestCase): fc_django = models.FlowCell.objects.get(flowcell_id=fc_id) self.failUnlessEqual(fc_dict['flowcell_id'], fc_id) self.failUnlessEqual(fc_django.flowcell_id, fc_id) - self.failUnlessEqual(fc_dict['sequencer'], - fc_django.sequencer.name) - self.failUnlessEqual(fc_dict['read_length'], - fc_django.read_length) + self.failUnlessEqual(fc_dict['sequencer'], fc_django.sequencer.name) + self.failUnlessEqual(fc_dict['read_length'], fc_django.read_length) self.failUnlessEqual(fc_dict['notes'], fc_django.notes) - self.failUnlessEqual(fc_dict['cluster_station'], - fc_django.cluster_station.name) + self.failUnlessEqual(fc_dict['cluster_station'], fc_django.cluster_station.name) for lane in fc_django.lane_set.all(): lane_contents = fc_dict['lane_set'][lane.lane_number] lane_dict = multi_lane_to_dict(lane_contents)[lane.library_id] - self.failUnlessEqual(lane_dict['cluster_estimate'], - lane.cluster_estimate) + self.failUnlessEqual(lane_dict['cluster_estimate'], lane.cluster_estimate) self.failUnlessEqual(lane_dict['comment'], lane.comment) - self.failUnlessEqual(lane_dict['flowcell'], - lane.flowcell.flowcell_id) - self.failUnlessEqual(lane_dict['lane_number'], - lane.lane_number) - self.failUnlessEqual(lane_dict['library_name'], - lane.library.library_name) + self.failUnlessEqual(lane_dict['flowcell'], lane.flowcell.flowcell_id) + self.failUnlessEqual(lane_dict['lane_number'], lane.lane_number) + self.failUnlessEqual(lane_dict['library_name'], lane.library.library_name) self.failUnlessEqual(lane_dict['library_id'], lane.library.id) - self.failUnlessAlmostEqual(float(lane_dict['pM']), - float(lane.pM)) + self.failUnlessAlmostEqual(float(lane_dict['pM']), float(lane.pM)) self.failUnlessEqual(lane_dict['library_species'], - lane.library.library_species.scientific_name) + lane.library.library_species.scientific_name) - flowcell_url = '/experiments/config/%s/json' - response = self.client.get(flowcell_url % (fc_id,), apidata) + response = self.client.get('/experiments/config/%s/json' % (fc_id,), apidata) # strptime isoformat string = '%Y-%m-%dT%H:%M:%S' fc_json = json.loads(response.content) self.failUnlessEqual(fc_json['flowcell_id'], fc_id) - self.failUnlessEqual(fc_json['sequencer'], - fc_django.sequencer.name) + self.failUnlessEqual(fc_json['sequencer'], fc_django.sequencer.name) self.failUnlessEqual(fc_json['read_length'], fc_django.read_length) self.failUnlessEqual(fc_json['notes'], fc_django.notes) - self.failUnlessEqual(fc_json['cluster_station'], - fc_django.cluster_station.name) + self.failUnlessEqual(fc_json['cluster_station'], fc_django.cluster_station.name) + for lane in fc_django.lane_set.all(): lane_contents = fc_json['lane_set'][unicode(lane.lane_number)] lane_dict = multi_lane_to_dict(lane_contents)[lane.library_id] - self.failUnlessEqual(lane_dict['cluster_estimate'], - lane.cluster_estimate) + self.failUnlessEqual(lane_dict['cluster_estimate'], lane.cluster_estimate) self.failUnlessEqual(lane_dict['comment'], lane.comment) - self.failUnlessEqual(lane_dict['flowcell'], - lane.flowcell.flowcell_id) - self.failUnlessEqual(lane_dict['lane_number'], - lane.lane_number) - self.failUnlessEqual(lane_dict['library_name'], - lane.library.library_name) + self.failUnlessEqual(lane_dict['flowcell'], lane.flowcell.flowcell_id) + self.failUnlessEqual(lane_dict['lane_number'], lane.lane_number) + self.failUnlessEqual(lane_dict['library_name'], lane.library.library_name) self.failUnlessEqual(lane_dict['library_id'], lane.library.id) - self.failUnlessAlmostEqual(float(lane_dict['pM']), - float(lane.pM)) + self.failUnlessAlmostEqual(float(lane_dict['pM']), float(lane.pM)) self.failUnlessEqual(lane_dict['library_species'], - lane.library.library_species.scientific_name) + lane.library.library_species.scientific_name) def test_invalid_flowcell(self): """ Make sure we get a 404 if we request an invalid flowcell ID """ - flowcell_url = '/experiments/config/nottheone/json' - response = self.client.get(flowcell_url, apidata) + response = self.client.get('/experiments/config/nottheone/json', apidata) self.failUnlessEqual(response.status_code, 404) def test_no_key(self): @@ -137,10 +123,9 @@ class ExperimentsTestCases(TestCase): def test_library_id(self): """ - Library IDs should be flexible, retrive a non-numeric ID + Library IDs should be flexible, so make sure we can retrive a non-numeric ID """ - flowcell_url = '/experiments/config/FC12150/json' - response = self.client.get(flowcell_url, apidata) + response = self.client.get('/experiments/config/FC12150/json', apidata) self.failUnlessEqual(response.status_code, 200) flowcell = json.loads(response.content) @@ -160,26 +145,26 @@ class ExperimentsTestCases(TestCase): Library's have IDs, libraries also have primary keys, we eventually had enough libraries that the drop down combo box was too - hard to filter through, unfortnately we want a field that uses our - library id and not the internal primary key, and raw_id_field uses - primary keys. + hard to filter through, unfortnately we want a field that uses our library + id and not the internal primary key, and raw_id_field uses primary keys. - This tests to make sure that the value entered in the raw library id - field matches the library id looked up. + This tests to make sure that the value entered in the raw library id field matches + the library id looked up. """ - expected_ids = [u'10981', u'11016', u'SL039', u'11060', - u'11061', u'11062', u'11063', u'11064'] + expected_ids = [u'10981',u'11016',u'SL039',u'11060', + u'11061',u'11062',u'11063',u'11064'] self.client.login(username='supertest', password='BJOKL5kAj6aFZ6A5') response = self.client.get('/admin/experiments/flowcell/153/') - soup = BeautifulSoup(response.content) - for i in range(0, 8): - input_field = soup.find(id='id_lane_set-%d-library' % (i,)) - library_field = input_field.findNext('strong') + tree = fromstring(response.content) + for i in range(0,8): + xpath_expression = '//input[@id="id_lane_set-%d-library"]' + input_field = tree.xpath(xpath_expression % (i,))[0] + library_field = input_field.find('../strong') library_id, library_name = library_field.text.split(':') # strip leading '#' sign from name library_id = library_id[1:] self.failUnlessEqual(library_id, expected_ids[i]) - self.failUnlessEqual(input_field['value'], library_id) + self.failUnlessEqual(input_field.attrib['value'], library_id) def test_library_to_flowcell_link(self): """ @@ -188,9 +173,12 @@ class ExperimentsTestCases(TestCase): """ self.client.login(username='supertest', password='BJOKL5kAj6aFZ6A5') response = self.client.get('/library/11070/') - soup = BeautifulSoup(response.content) - failed_fc_span = soup.find(text='30012AAXX (failed)') - failed_fc_a = failed_fc_span.findPrevious('a') + tree = fromstring(response.content) + flowcell_spans = tree.xpath('//span[@property="libns:flowcell_id"]', + namespaces=NSMAP) + self.assertEqual(flowcell_spans[0].text, '30012AAXX (failed)') + failed_fc_span = flowcell_spans[0] + failed_fc_a = failed_fc_span.getparent() # make sure some of our RDF made it. self.failUnlessEqual(failed_fc_a.get('rel'), 'libns:flowcell') self.failUnlessEqual(failed_fc_a.get('href'), '/flowcell/30012AAXX/') @@ -212,6 +200,8 @@ class ExperimentsTestCases(TestCase): self.assertEqual(lane_dict['11045']['index_sequence'], {u'1': u'ATCACG'}) + + def test_lanes_for(self): """ Check the code that packs the django objects into simple types. @@ -220,29 +210,24 @@ class ExperimentsTestCases(TestCase): lanes = experiments.lanes_for(user) self.failUnlessEqual(len(lanes), 5) - flowcell_url = '/experiments/lanes_for/%s/json' - response = self.client.get(flowcell_url % (user,), apidata) + response = self.client.get('/experiments/lanes_for/%s/json' % (user,), apidata) lanes_json = json.loads(response.content) self.failUnlessEqual(len(lanes), len(lanes_json)) for i in range(len(lanes)): self.failUnlessEqual(lanes[i]['comment'], lanes_json[i]['comment']) - self.failUnlessEqual(lanes[i]['lane_number'], - lanes_json[i]['lane_number']) - self.failUnlessEqual(lanes[i]['flowcell'], - lanes_json[i]['flowcell']) - self.failUnlessEqual(lanes[i]['run_date'], - lanes_json[i]['run_date']) + self.failUnlessEqual(lanes[i]['lane_number'], lanes_json[i]['lane_number']) + self.failUnlessEqual(lanes[i]['flowcell'], lanes_json[i]['flowcell']) + self.failUnlessEqual(lanes[i]['run_date'], lanes_json[i]['run_date']) def test_lanes_for_no_lanes(self): """ - What happens to user who haven't submitted anything + Do we get something meaningful back when the user isn't attached to anything? """ user = 'supertest' lanes = experiments.lanes_for(user) self.failUnlessEqual(len(lanes), 0) - url = '/experiments/lanes_for/%s/json' - response = self.client.get(url % (user,), apidata) + response = self.client.get('/experiments/lanes_for/%s/json' % (user,), apidata) lanes_json = json.loads(response.content) def test_lanes_for_no_user(self): @@ -252,10 +237,10 @@ class ExperimentsTestCases(TestCase): user = 'not a real user' self.failUnlessRaises(ObjectDoesNotExist, experiments.lanes_for, user) - url = '/experiments/lanes_for/%s/json' - response = self.client.get(url % (user,), apidata) + response = self.client.get('/experiments/lanes_for/%s/json' % (user,), apidata) self.failUnlessEqual(response.status_code, 404) + def test_raw_data_dir(self): """Raw data path generator check""" flowcell_id = self.fc1_id @@ -267,6 +252,7 @@ class ExperimentsTestCases(TestCase): fc.flowcell_id = flowcell_id + " (failed)" self.failUnlessEqual(fc.get_raw_data_directory(), raw_dir) + def test_data_run_import(self): srf_file_type = models.FileType.objects.get(name='SRF') runxml_file_type = models.FileType.objects.get(name='run_xml') @@ -293,10 +279,11 @@ class ExperimentsTestCases(TestCase): lane_files = run.lane_files() self.failUnlessEqual(lane_files[4]['srf'], srf4) - runxml = result_dict['FC12150/C1-37/run_FC12150_2007-09-27.xml'] + runxml= result_dict['FC12150/C1-37/run_FC12150_2007-09-27.xml'] self.failUnlessEqual(runxml.file_type, runxml_file_type) self.failUnlessEqual(runxml.library_id, None) + def test_read_result_file(self): """make sure we can return a result file """ @@ -308,7 +295,7 @@ class ExperimentsTestCases(TestCase): result_files = flowcell.datarun_set.all()[0].datafile_set.all() for f in result_files: - url = '/experiments/file/%s' % (f.random_key, ) + url = '/experiments/file/%s' % ( f.random_key,) response = self.client.get(url) self.failUnlessEqual(response.status_code, 200) mimetype = f.file_type.mimetype @@ -317,7 +304,6 @@ class ExperimentsTestCases(TestCase): self.failUnlessEqual(mimetype, response['content-type']) - class TestFileType(TestCase): def test_file_type_unicode(self): file_type_objects = models.FileType.objects @@ -326,7 +312,6 @@ class TestFileType(TestCase): self.failUnlessEqual(u"", unicode(file_type_object)) - class TestFileType(TestCase): def test_find_file_type(self): file_type_objects = models.FileType.objects @@ -334,9 +319,9 @@ class TestFileType(TestCase): 'QSEQ tarfile', 7, 1), ('woldlab_091005_HWUSI-EAS627_0010_42JT2AAXX_1.srf', 'SRF', 1, None), - ('s_1_eland_extended.txt.bz2', 'ELAND Extended', 1, None), + ('s_1_eland_extended.txt.bz2','ELAND Extended', 1, None), ('s_7_eland_multi.txt.bz2', 'ELAND Multi', 7, None), - ('s_3_eland_result.txt.bz2', 'ELAND Result', 3, None), + ('s_3_eland_result.txt.bz2','ELAND Result', 3, None), ('s_1_export.txt.bz2','ELAND Export', 1, None), ('s_1_percent_call.png', 'IVC Percent Call', 1, None), ('s_2_percent_base.png', 'IVC Percent Base', 2, None), @@ -359,10 +344,10 @@ class TestFileType(TestCase): 'QSEQ tarfile', 7, 1), ('foo/woldlab_091005_HWUSI-EAS627_0010_42JT2AAXX_1.srf', 'SRF', 1, None), - ('../s_1_eland_extended.txt.bz2', 'ELAND Extended', 1, None), + ('../s_1_eland_extended.txt.bz2','ELAND Extended', 1, None), ('/bleem/s_7_eland_multi.txt.bz2', 'ELAND Multi', 7, None), - ('/qwer/s_3_eland_result.txt.bz2', 'ELAND Result', 3, None), - ('/ty///1/s_1_export.txt.bz2', 'ELAND Export', 1, None), + ('/qwer/s_3_eland_result.txt.bz2','ELAND Result', 3, None), + ('/ty///1/s_1_export.txt.bz2','ELAND Export', 1, None), ('/help/s_1_percent_call.png', 'IVC Percent Call', 1, None), ('/bored/s_2_percent_base.png', 'IVC Percent Base', 2, None), ('/example1/s_3_percent_all.png', 'IVC Percent All', 3, None), @@ -375,10 +360,9 @@ class TestFileType(TestCase): result = models.find_file_type_metadata_from_filename(filename) self.failUnlessEqual(result['file_type'], file_type_objects.get(name=typename)) - self.failUnlessEqual(result.get('lane', None), lane) + self.failUnlessEqual(result.get('lane',None), lane) self.failUnlessEqual(result.get('end', None), end) - class TestEmailNotify(TestCase): fixtures = ['test_flowcells.json'] @@ -404,8 +388,7 @@ class TestEmailNotify(TestCase): self.failUnless('pk1@example.com' in response.content) self.failUnless('Lane #8 : (11064) Paired ends 104' in response.content) - response = self.client.get('/experiments/started/153/', - {'send': '1','bcc': 'on'}) + response = self.client.get('/experiments/started/153/', {'send':'1','bcc':'on'}) self.failUnlessEqual(response.status_code, 200) self.failUnlessEqual(len(mail.outbox), 4) for m in mail.outbox: @@ -420,11 +403,9 @@ class TestEmailNotify(TestCase): self.failUnlessEqual(response.status_code, 200) self.failUnless(re.search('Flowcell FC12150', response.content)) # require that navigation back to the admin page exists - flowcell_a_re = '[^<]+' - self.failUnless(re.search(flowcell_a_re, response.content)) - + self.failUnless(re.search('[^<]+', response.content)) def multi_lane_to_dict(lane): """Convert a list of lane entries into a dictionary indexed by library ID """ - return dict(((x['library_id'], x) for x in lane)) + return dict( ((x['library_id'],x) for x in lane) ) -- 2.30.2