1 """Start extracting information out of NCBI SRA
3 It probably could be extended to extract other NCBI information.
4 But at the time I just needed to look up things in the short read archive.
8 from lxml.etree import parse, XSLT, tostring, fromstring
9 from optparse import OptionParser
12 import urllib.request, urllib.parse, urllib.error
14 from htsworkflow.util.rdfhelp import get_model, dump_model
16 from django.conf import settings
17 from django.template import Context, loader
19 if not 'DJANGO_SETTINGS_MODULE' in os.environ:
20 os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
22 LOGGER = logging.getLogger(__name__)
24 ESEARCH_URL="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
25 EFETCH_URL="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
27 DEFAULT_QUERY = 'wgEncodeCaltechRnaSeq OR wgEncodeCaltechHist OR wgEncodeCaltechTfbs'
29 def search_ncbi_ids(database, term, return_max=200):
30 """Return list of IDs from a NCBI database
31 database - which ncbi database to search
32 term - ncbi query string
33 return_max - maximum records to return
35 search = {'db': database,
38 tree = parse(ESEARCH_URL + urllib.parse.urlencode(search))
40 count = get_node_scalar(root, '/eSearchResult/Count', int)
41 retmax_node = get_node_scalar(root, '/eSearchResult/RetMax', int)
43 if count > retmax_node:
44 raise ValueError("Too many values returned please adjust query")
46 id_nodes = tree.xpath('/eSearchResult/IdList/Id')
47 if len(id_nodes) != count:
48 errmsg = "Weird. Length of ID list ({0}) doesn't match count ({1})"
49 raise ValueError(errmsg.format(len(id_nodes), count))
51 ids = [ x.text for x in id_nodes ]
54 def parse_sra_metadata_into_model(model, ncbi_id):
55 """Extract SRA data by looking up a NCBI ID.
59 url = EFETCH_URL + urllib.parse.urlencode(search)
63 sra_rdf_template = loader.get_template('sra.rdfxml.xsl')
64 sra_rdf_stylesheet = sra_rdf_template.render(context)
65 sra_rdf_transform = XSLT(fromstring(sra_rdf_stylesheet))
66 rdfdata = tostring(sra_rdf_transform(tree))
67 rdfparser = RDF.Parser(name='rdfxml')
68 rdfparser.parse_string_into_model(model, rdfdata, url)
70 def get_node_scalar(parent, xpath, target_type=None):
71 """Return a single value from an xpath search, possibily type converted
73 target_type pass a constructor that takes a string to convert result
76 node = parent.xpath(xpath)
77 if node is None or len(node) != 1:
78 raise ValueError("Wrong response, incorrect number of {0} tags".xpath)
79 if target_type is not None:
80 return target_type(node[0].text)
84 def main(cmdline=None):
85 """Quick driver for importing data from SRA"""
86 parser = make_parser()
87 opts, args = parser.parse_args(cmdline)
90 logging.basicConfig(level=logging.DEBUG)
92 logging.basicConfig(level=logging.INFO)
94 logging.basicConfig(level=logging.WARN)
96 model = get_model(opts.database, opts.dbpath)
98 ids = search_ncbi_ids('sra', opts.query)
99 for count, encode_id in enumerate(ids[:1]):
100 LOGGER.info("processing %s %d / %d", encode_id, count+1, len(ids))
101 parse_sra_metadata_into_model(model, encode_id)
107 parser = OptionParser()
108 parser.add_option('--dbpath', help="Database directory",
110 parser.add_option('--database', help="Database name", default=None)
111 parser.add_option('--dump', help="dump database", default=False,
113 parser.add_option('--query', help='specify NCBI search terms',
114 default=DEFAULT_QUERY)
115 parser.add_option("-v", "--verbose", action="store_true", default=False)
116 parser.add_option("--debug", action="store_true", default=False)
119 if __name__ == "__main__":