+
+
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed.

    Builds the (submissionUrn, predicate, library) statement and adds it
    to the model unless an identical statement is already present.
    """
    libraryUrn = libraryNS[library_id]
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        logger.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        # BUG FIX: previously logged the undefined name `result`, which
        # raised NameError whenever the link already existed.
        logger.info("Found: {0}".format(str(link)))
+
+
+def find_submissions_with_no_library(model):
+ p = os.path.abspath(__file__)
+ sourcedir = os.path.dirname(p)
+ no_lib = open(os.path.join(sourcedir, "no-lib.sparql"),'r').read()
+ query = RDF.SPARQLQuery(no_lib)
+ results = query.execute(model)
+ for row in results:
+ subid = row['subid']
+ name = row['name']
+ print "# {0}".format(name)
+ print "<{0}>".format(subid.uri)
+ print " encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
+ print ""
+
def update_submission_detail(model, cookie=None):
    """Look for submission IDs in our model and go get their ddfs.

    For every resource typed as a Submission, refresh its creation date
    and download its DDF, forwarding the login cookie.
    """
    submissions = model.get_sources(rdfsNS['Class'],
                                    submitOntologyNS['Submission'])
    for subUrn in submissions:
        # CONSISTENCY FIX: use the module `logger` (as every other
        # function here does) instead of the root `logging` logger.
        logger.info("Updating detail for: {0}".format(str(subUrn)))
        update_submission_creation_date(model, subUrn, cookie)
        download_ddf(model, subUrn, cookie=cookie)
+
+
def update_submission_creation_date(model, subUrn, cookie):
    """Ensure subUrn carries a creation-date triple, scraping it if absent.

    Does nothing when the model already has a date statement for subUrn.
    (In theory the submission page might have more information on it.)
    """
    creationDateP = libNS['date']
    dateTimeType = xsdNS['dateTime']
    probe = RDF.Statement(subUrn, creationDateP, None)
    if list(model.find_statements(probe)):
        return
    logger.info("Getting creation date for: {0}".format(str(subUrn)))
    soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
    created_label = soup.find(text="Created: ")
    if not created_label:
        return
    created_date = get_date_contents(created_label.next)
    date_node = RDF.Node(literal=created_date.isoformat(),
                         datatype=dateTimeType.uri)
    model.add_statement(
        RDF.Statement(subUrn, creationDateP, date_node)
    )
+
+
def download_ddf(model, subId, cookie=None):
    """Read a DDF for a submission and load its statements into the model.

    Downloads the ddf text, then skips parsing when the model already
    records this ddf URL as an instance of the ddf Class.
    """
    if cookie is None:
        cookie = login()

    # NOTE(review): assumes str(subId) yields the submission show-page
    # URL — confirm against the node type passed in by callers.
    ddf_url = str(subId).replace('show', 'download_ddf')
    ddf_text = get_url_as_text(ddf_url, 'GET', cookie)
    # NOTE(review): nothing visible here ever adds the (ddfUrn, Class,
    # ddf) triple, so this guard may never fire — verify elsewhere.
    probe = RDF.Statement(RDF.Uri(ddf_url), rdfsNS['Class'], ddfNS['ddf'])
    if model.contains_statement(probe):
        return
    for statement in parse_ddf(subId, ddf_text):
        model.add_statement(statement)
+
+
def parse_ddf(subId, ddf_blob):
    """Convert a ddf text file into RDF Statements.

    The first line names the columns; each following non-blank,
    non-comment line is tab-separated, with a comma-separated file list
    in column one.  Each file becomes a blank node linked to subId.
    """
    lines = ddf_blob.split('\n')
    # first line is the header; map each column name to a ddf predicate
    attributes = [ddfNS[column] for column in lines[0].split()]
    subIdUri = str(subId.uri)
    # force it to look like a namespace
    if not subIdUri.endswith('/'):
        subIdUri += '/'
    subIdNS = RDF.NS(subIdUri)
    statements = []
    for raw_line in lines[1:]:
        line = raw_line.strip()
        # ignore blank lines and comment lines
        if not line or line.startswith("#"):
            continue

        records = line.split('\t')
        file_attributes = records[1:]

        for filename in records[0].split(','):
            blank = RDF.Node()
            statements.append(RDF.Statement(subId,
                                            submitOntologyNS['has_file'],
                                            blank))
            statements.append(RDF.Statement(blank, rdfsNS['Class'],
                                            submitOntologyNS['File']))
            statements.append(RDF.Statement(blank, ddfNS['filename'],
                                            filename))
            # zip truncates to the shorter of the two sequences, exactly
            # as the original pairing did
            for predicate, value in zip(attributes[1:], file_attributes):
                statements.append(RDF.Statement(blank, predicate, value))

    return statements
+
def load_libraries(model, htswapi):
    """Load woldlab library metadata into the model.

    Queries the model for distinct library_urn values, fetches each
    library's details through htswapi, and appends statements describing
    the library and (via blank nodes) each lane in its lane_set.
    Libraries whose short id starts with "SL" are skipped.
    """
    query = RDF.SPARQLQuery("""
    SELECT distinct ?library_urn
    WHERE {
      ?subid <http://jumpgate.caltech.edu/wiki/EncodeSubmit#library_urn> ?library_urn .
    }""")
    results = query.execute(model)
    newmodel = model
    for row in results:
        lib_id = row['library_urn']
        lib_uri = str(row['library_urn'].uri)
        short_lib_id = lib_uri.replace(libraryNS._prefix, "")
        # CONSISTENCY FIX: use the module `logger` like the rest of the
        # file instead of the root `logging` logger.
        logger.info("Loading library info: {0}".format(short_lib_id))
        if short_lib_id.startswith("SL"):
            continue
        lib_info = htswapi.get_library(short_lib_id)

        for lib_k, lib_v in lib_info.items():
            if lib_k != 'lane_set':
                attribute = lib_k.encode(CHARSET)
                newmodel.append(
                    RDF.Statement(lib_id,
                                  submitOntologyNS[attribute],
                                  str(lib_v)))
            else:
                # lane_set is a list of flowcell dicts; link each one via
                # a blank node carrying its flowcell attributes
                for flowcell in lib_v:
                    blank = RDF.Node()
                    newmodel.append(
                        RDF.Statement(lib_id,
                                      submitOntologyNS['has_lane'],
                                      blank))
                    for fc_k, fc_v in flowcell.items():
                        newmodel.append(
                            RDF.Statement(blank,
                                          submitOntologyNS[fc_k.encode(CHARSET)],
                                          str(fc_v)))
+
def get_library_id(name):
    """Guess library ID from library name.

    Looks for a space- or hyphen-preceded id: either five digits or
    "SL" followed by four digits.  Returns the id string, or None when
    no id is found.
    """
    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    return match.group('id') if match is not None else None
+