Import information from NCBI SRA into an RDF model.
author Diane Trout <diane@caltech.edu>
Thu, 14 Jun 2012 21:57:18 +0000 (14:57 -0700)
committer Diane Trout <diane@caltech.edu>
Thu, 14 Jun 2012 21:57:18 +0000 (14:57 -0700)
(Mostly so I can find our libraries on their site)

htsworkflow/submission/ncbi.py [new file with mode: 0644]
htsworkflow/templates/sra.rdfxml.xsl [new file with mode: 0644]

diff --git a/htsworkflow/submission/ncbi.py b/htsworkflow/submission/ncbi.py
new file mode 100644 (file)
index 0000000..3cc8736
--- /dev/null
@@ -0,0 +1,120 @@
+"""Start extracting information out of NCBI SRA
+
+It probably could be extended to extract other NCBI information.
+But at the time I just needed to look up things in the short read archive.
+"""
+
+import logging
+from lxml.etree import parse, XSLT, tostring, fromstring
+from optparse import OptionParser
+import os
+import RDF
+import urllib
+
+from htsworkflow.util.rdfhelp import get_model, dump_model
+
+from django.conf import settings
+from django.template import Context, loader
+
+# Django's template loader needs a settings module; keep whatever the
+# caller already exported and only fill in the project default otherwise.
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'htsworkflow.settings')
+
+LOGGER = logging.getLogger(__name__)
+
+# NCBI E-utilities endpoints; url-encoded parameters are appended after "?".
+ESEARCH_URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
+EFETCH_URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+# Database name sent with every efetch request.
+DB = 'sra'
+# Search term used when the command line does not supply --query.
+DEFAULT_QUERY = 'wgEncodeCaltechRnaSeq OR wgEncodeCaltechHist OR wgEncodeCaltechTfbs'
+
+def search_ncbi_ids(database, term, return_max=200):
+    """Query NCBI esearch and return the matching record IDs as strings.
+
+    database - name of the NCBI database to search
+    term - NCBI query string
+    return_max - value sent as retmax, the most records NCBI should return
+
+    Raises ValueError when the reported hit count exceeds the returned
+    RetMax, or when the number of Id elements differs from the count.
+    """
+    params = {'db': database,
+              'term': term,
+              'retmax': return_max}
+    result = parse(ESEARCH_URL + urllib.urlencode(params))
+    result_root = result.getroot()
+
+    total = get_node_scalar(result_root, '/eSearchResult/Count', int)
+    allowed = get_node_scalar(result_root, '/eSearchResult/RetMax', int)
+    # More hits than were returned means the ID list would be truncated.
+    if total > allowed:
+        raise ValueError("Too many values returned please adjust query")
+
+    found = result.xpath('/eSearchResult/IdList/Id')
+    found_count = len(found)
+    if found_count != total:
+        errmsg = "Weird. Length of ID list ({0}) doesn't match count ({1})"
+        raise ValueError(errmsg.format(found_count, total))
+
+    return [node.text for node in found]
+
+def parse_sra_metadata_into_model(model, ncbi_id):
+    """Fetch one SRA record from NCBI efetch and load it into model.
+
+    model - RDF model that receives the parsed statements
+    ncbi_id - record id sent to efetch as the 'id' parameter
+    """
+    url = EFETCH_URL + urllib.urlencode({'db': DB,
+                                         'id': ncbi_id})
+    record = parse(url)
+
+    # The stylesheet is found through Django's template loader, so it is
+    # rendered as a Django template (empty context) before it is compiled
+    # as XSLT.
+    stylesheet_text = loader.get_template('sra.rdfxml.xsl').render(Context())
+    to_rdfxml = XSLT(fromstring(stylesheet_text))
+    rdfxml = tostring(to_rdfxml(record))
+
+    # The fetch url is passed along with the RDF/XML text to the parser.
+    RDF.Parser(name='rdfxml').parse_string_into_model(model, rdfxml, url)
+
+def get_node_scalar(parent, xpath, target_type=None):
+    """Return the text of the single node matched by an xpath search.
+
+    parent - object providing an xpath() method (element or tree)
+    xpath - query that is expected to match exactly one node
+    target_type - optional constructor taking a string, used to convert
+                  the node text (for example int)
+
+    Raises ValueError when the search does not match exactly one node.
+    """
+    nodes = parent.xpath(xpath)
+    if nodes is None or len(nodes) != 1:
+        # Format the query into the message so the caller gets the intended
+        # ValueError naming the offending xpath.
+        errmsg = "Wrong response, incorrect number of {0} tags"
+        raise ValueError(errmsg.format(xpath))
+
+    text = nodes[0].text
+    if target_type is not None:
+        return target_type(text)
+    return text
+
+def main(cmdline=None):
+    """Command line driver: search SRA and load the result into a model.
+
+    cmdline - argument list handed to the option parser; with None
+              optparse reads sys.argv[1:]
+    """
+    opts, args = make_parser().parse_args(cmdline)
+
+    # --debug takes precedence over --verbose; the default shows warnings.
+    if opts.debug:
+        level = logging.DEBUG
+    elif opts.verbose:
+        level = logging.INFO
+    else:
+        level = logging.WARN
+    logging.basicConfig(level=level)
+
+    model = get_model(opts.database, opts.dbpath)
+    ids = search_ncbi_ids('sra', opts.query)
+
+    # NOTE(review): the ids[:1] slice means only the first ID is imported,
+    # although the log message reports progress against len(ids). Confirm
+    # whether this limit is intentional.
+    for position, encode_id in enumerate(ids[:1], 1):
+        LOGGER.info("processing %s %d / %d", encode_id, position, len(ids))
+        parse_sra_metadata_into_model(model, encode_id)
+
+    if opts.dump:
+        dump_model(model)
+
+def make_parser():
+    """Build the OptionParser describing this script's command line."""
+    # Keyword arguments shared by every on/off switch.
+    switch = {'action': "store_true", 'default': False}
+
+    parser = OptionParser()
+    parser.add_option('--dbpath', default=os.getcwd(),
+                      help="Database directory")
+    parser.add_option('--database', default=None, help="Database name")
+    parser.add_option('--dump', help="dump database", **switch)
+    parser.add_option('--query', default=DEFAULT_QUERY,
+                      help='specify NCBI search terms')
+    parser.add_option("-v", "--verbose", **switch)
+    parser.add_option("--debug", **switch)
+    return parser
+
+# Run the importer only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/htsworkflow/templates/sra.rdfxml.xsl b/htsworkflow/templates/sra.rdfxml.xsl
new file mode 100644 (file)
index 0000000..88e5a5f
--- /dev/null
@@ -0,0 +1,179 @@
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+                xmlns:xs="http://www.w3.org/2001/XMLSchema#"
+                xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
+                xmlns:submission="http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#"
+                xmlns:sra="http://www.ncbi.nlm.nih.gov/sra"
+                xmlns:sras="http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-3/SRA.package.xsd#"
+                xmlns:sraa="http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-3/SRA.attribute#"
+                >
+
+<xsl:output method="xml" indent="yes"/>
+
+<!-- Entry point: wrap everything produced from the input document in one
+     rdf:RDF element that carries the namespace declarations used by the
+     generated statements. -->
+<xsl:template match="/">
+  <rdf:RDF xmlns:ddf="http://encodesubmit.ucsc.edu/pipeline/download_ddf#"
+           xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+          xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
+           xmlns:submission="http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#"
+           xmlns:pipeline="http://encodesubmit.ucsc.edu/pipeline/"
+           xmlns:sra="http://www.ncbi.nlm.nih.gov/sra"
+           xmlns:sras="http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-3/SRA.package.xsd#"
+           xmlns:sraa="http://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/sra/doc/SRA_1-3/SRA.attribute#"
+>
+    <xsl:apply-templates select="*"/>
+  </rdf:RDF>
+</xsl:template>
+
+<!-- For each package, process its EXPERIMENT, SUBMISSION, STUDY, SAMPLE
+     and RUN_SET children in that order. The two commented-out for-each
+     loops that called node_detail are disabled. -->
+<xsl:template match="EXPERIMENT_PACKAGE">
+  <xsl:apply-templates select="./EXPERIMENT"/>
+  <!-- xsl:for-each select="./EXPERIMENT/EXPERIMENT_ATTRIBUTES/EXPERIMENT_ATTRIBUTE">
+    <xsl:call-template name="node_detail"/>
+  </xsl:for-each -->
+  <xsl:apply-templates select="./SUBMISSION"/>
+  <xsl:apply-templates select="./STUDY"/>
+  <xsl:apply-templates select="./SAMPLE"/>
+  <!-- xsl:for-each select="./SAMPLE/SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE">
+    <xsl:call-template name="node_detail"/>
+  </xsl:for-each -->
+  <xsl:apply-templates select="./RUN_SET"/>
+</xsl:template>
+
+<!-- Describe an experiment as an sras:experiment resource whose URI is the
+     SRA base URL followed by the accession. Emits the alias and center
+     name, the study, design and platform details, one property per
+     EXPERIMENT_ATTRIBUTE, and a link to the sibling SUBMISSION. -->
+<xsl:template match="EXPERIMENT">
+  <sras:experiment>
+    <xsl:attribute name="rdf:about">http://www.ncbi.nlm.nih.gov/sra/<xsl:value-of select="@accession"/></xsl:attribute>
+    <sras:experiment_alias><xsl:value-of select="@alias"/></sras:experiment_alias>
+    <sras:center_name><xsl:value-of select="@center_name"/></sras:center_name>
+    <xsl:apply-templates select="STUDY_REF"/>
+    <xsl:apply-templates select="DESIGN"/>
+    <xsl:apply-templates select="PLATFORM"/>
+    <xsl:for-each select="./EXPERIMENT_ATTRIBUTES/EXPERIMENT_ATTRIBUTE">
+      <xsl:call-template name="node_simple"/>
+    </xsl:for-each>
+    <sras:submission>
+      <xsl:attribute name="rdf:resource">http://www.ncbi.nlm.nih.gov/sra/<xsl:value-of select="../SUBMISSION/@accession"/></xsl:attribute>
+    </sras:submission>
+  </sras:experiment>
+</xsl:template>
+
+<!-- Link to the referenced study by its accession-based SRA URI. -->
+<xsl:template match="STUDY_REF">
+  <sras:study>
+    <xsl:attribute name="rdf:resource">http://www.ncbi.nlm.nih.gov/sra/<xsl:value-of select="@accession"/></xsl:attribute>
+  </sras:study>
+</xsl:template>
+
+<!-- Only the sample and library descriptors of a design are processed. -->
+<xsl:template match="DESIGN">
+  <xsl:apply-templates select="SAMPLE_DESCRIPTOR"/>
+  <xsl:apply-templates select="LIBRARY_DESCRIPTOR"/>
+</xsl:template>
+
+<!-- Link to the described sample by its accession-based SRA URI. -->
+<xsl:template match="SAMPLE_DESCRIPTOR">
+  <sras:sample>
+    <xsl:attribute name="rdf:resource">http://www.ncbi.nlm.nih.gov/sra/<xsl:value-of select="@accession"/></xsl:attribute>
+  </sras:sample>
+</xsl:template>
+
+<!-- Emit the library name, strategy, source and selection as literals, and
+     the construction protocol as a resource link. The link target is the
+     protocol text starting at its first occurrence of 'http'. -->
+<xsl:template match="LIBRARY_DESCRIPTOR">
+  <sras:library_name><xsl:value-of select="LIBRARY_NAME"/></sras:library_name>
+  <sras:library_strategy><xsl:value-of select="LIBRARY_STRATEGY"/></sras:library_strategy>
+  <sras:library_source><xsl:value-of select="LIBRARY_SOURCE"/></sras:library_source>
+  <sras:library_selection><xsl:value-of select="LIBRARY_SELECTION"/></sras:library_selection>
+  <!-- substring-after() drops the matched 'http' but keeps what follows
+       it, so only 'http' is put back: re-adding 'http:' would double the
+       colon. Protocol text without any 'http' gets no link at all. -->
+  <xsl:if test="contains(LIBRARY_CONSTRUCTION_PROTOCOL, 'http')">
+    <sras:library_protocol>
+      <xsl:attribute name="rdf:resource">http<xsl:value-of select="substring-after(LIBRARY_CONSTRUCTION_PROTOCOL, 'http')"/></xsl:attribute></sras:library_protocol>
+  </xsl:if>
+</xsl:template>
+
+<!-- Emit instrument model and sequence length, read from the ILLUMINA
+     child only. The sequence length is typed as xsd:decimal. -->
+<xsl:template match="PLATFORM">
+  <sras:instrument_model><xsl:value-of select="ILLUMINA/INSTRUMENT_MODEL"/></sras:instrument_model>
+  <sras:sequence_length rdf:datatype="http://www.w3.org/2001/XMLSchema#decimal"><xsl:value-of select="ILLUMINA/SEQUENCE_LENGTH"/></sras:sequence_length>
+</xsl:template>
+
+<!-- xsl:template select="EXPERIMENT" mode="ref" -->
+<!-- Emit an sras:has_attribute property pointing at a blank node whose id
+     is generated from the first child node of the context node. -->
+<xsl:template name="node_ref">
+  <sras:has_attribute>
+    <!-- xsl:attribute requires the un-namespaced "name" attribute. -->
+    <xsl:attribute name="rdf:nodeID"><xsl:value-of select="generate-id(node())"/></xsl:attribute>
+  </sras:has_attribute>
+</xsl:template>
+
+<!-- xsl:template name="attribute" mode="nodedetail" -->
+<!-- Describe one TAG/VALUE attribute as a blank node. Both sras:attribute
+     elements use the same generated node id, so the name and the value are
+     attached to the same blank node. -->
+<xsl:template name="node_detail">
+  <sras:attribute>
+    <!-- xsl:attribute requires the un-namespaced "name" attribute. -->
+    <xsl:attribute name="rdf:nodeID"><xsl:value-of select="generate-id(node())"/></xsl:attribute>
+    <sras:attribute_name><xsl:value-of select="TAG"/></sras:attribute_name>
+  </sras:attribute>
+  <sras:attribute>
+    <xsl:attribute name="rdf:nodeID"><xsl:value-of select="generate-id(node())"/></xsl:attribute>
+    <sras:attribute_value><xsl:value-of select="VALUE"/></sras:attribute_value>
+  </sras:attribute>
+</xsl:template>
+
+<!-- try to generate attributes with non blank nodes -->
+<!-- Emit one property in the sraa namespace per TAG/VALUE pair: the
+     element name is the TAG text with spaces replaced by underscores and
+     the content is the VALUE text.
+     NOTE(review): only spaces are replaced; a TAG containing other
+     characters that are not legal in an XML name would presumably make
+     xsl:element fail. Confirm against real data. -->
+<xsl:template name="node_simple">
+  <xsl:variable name="spacelessTag">
+    <xsl:call-template name="string-replace-all">
+      <xsl:with-param name="text" select="TAG" />
+      <xsl:with-param name="replace" select="' '" />
+      <xsl:with-param name="by" select="'_'" />
+    </xsl:call-template>
+  </xsl:variable>
+  <xsl:element name="sraa:{$spacelessTag}"><xsl:value-of select="VALUE"/></xsl:element>
+</xsl:template>
+
+<!-- SUBMISSION TOP LEVEL -->
+<!-- Describe a submission as an sras:submission resource identified by its
+     accession-based SRA URI. All values are read from attributes of the
+     SUBMISSION element; the submission date is typed as xsd:dateTime. -->
+<xsl:template match="SUBMISSION">
+  <sras:submission>
+    <xsl:attribute name="rdf:about">http://www.ncbi.nlm.nih.gov/sra/<xsl:value-of select="@accession"/></xsl:attribute>
+    <sras:alias><xsl:value-of select="@alias"/></sras:alias>
+    <sras:comment><xsl:value-of select="@submission_comment"/></sras:comment>
+    <sras:date rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime"><xsl:value-of select="@submission_date"/></sras:date>
+    <sras:lab_name><xsl:value-of select="@lab_name"/></sras:lab_name>
+    <sras:accession><xsl:value-of select="@accession"/></sras:accession>
+  </sras:submission>
+</xsl:template>
+
+<!-- Describe a study as an sras:study resource identified by its
+     accession-based SRA URI. Whitespace in the description is collapsed
+     with normalize-space(). -->
+<xsl:template match="STUDY">
+  <sras:study>
+    <xsl:attribute name="rdf:about">http://www.ncbi.nlm.nih.gov/sra/<xsl:value-of select="@accession"/></xsl:attribute>
+    <sras:study_alias><xsl:value-of select="@alias"/></sras:study_alias>
+    <sras:title><xsl:value-of select="DESCRIPTOR/STUDY_TITLE"/></sras:title>
+    <sras:description><xsl:value-of select="normalize-space(DESCRIPTOR/STUDY_DESCRIPTION)"/></sras:description>
+  </sras:study>
+</xsl:template>
+
+<!-- Describe a sample as an sras:sample resource identified by its
+     accession-based SRA URI, with its alias, common name and description,
+     plus one sraa property per SAMPLE_ATTRIBUTE (see node_simple). -->
+<xsl:template match="SAMPLE">
+  <sras:sample>
+    <xsl:attribute name="rdf:about">http://www.ncbi.nlm.nih.gov/sra/<xsl:value-of select="@accession"/></xsl:attribute>
+    <sras:sample_alias><xsl:value-of select="@alias"/></sras:sample_alias>
+    <sras:common_name><xsl:value-of select="SAMPLE_NAME/COMMON_NAME"/></sras:common_name>
+    <sras:description><xsl:value-of select="DESCRIPTION"/></sras:description>
+    <xsl:for-each select="./SAMPLE_ATTRIBUTES/SAMPLE_ATTRIBUTE">
+      <xsl:call-template name="node_simple"/>
+    </xsl:for-each>
+  </sras:sample>
+</xsl:template>
+<!-- Deliberately empty: RUN_SET elements produce no output. -->
+<xsl:template match="RUN_SET">
+</xsl:template>
+
+<!-- from http://geekswithblogs.net/Erik/archive/2008/04/01/120915.aspx -->
+<!-- Replace every occurrence of $replace in $text with $by.
+     Works recursively: output the part before the first match, then $by,
+     then call itself on the remainder; when no match is left the
+     remaining text is output unchanged. -->
+<xsl:template name="string-replace-all">
+    <xsl:param name="text" />
+    <xsl:param name="replace" />
+    <xsl:param name="by" />
+    <xsl:choose>
+      <xsl:when test="contains($text, $replace)">
+        <xsl:value-of select="substring-before($text,$replace)" />
+        <xsl:value-of select="$by" />
+        <xsl:call-template name="string-replace-all">
+          <xsl:with-param name="text"
+          select="substring-after($text,$replace)" />
+          <xsl:with-param name="replace" select="$replace" />
+          <xsl:with-param name="by" select="$by" />
+        </xsl:call-template>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:value-of select="$text" />
+      </xsl:otherwise>
+    </xsl:choose>
+  </xsl:template>
+
+</xsl:stylesheet>