htsworkflow/submission/ncbi.py

   1 """Start extracting information out of NCBI SRA
   2
   3 It probably could be extended to extract other NCBI information.
   4 But at the time I just needed to look up things in the short read archive.
   5 """
   6
   7 import logging
   8 from lxml.etree import parse, XSLT, tostring, fromstring
   9 from optparse import OptionParser
  10 import os
  11 import RDF
  12 import urllib.request, urllib.parse, urllib.error
  13
  14 from htsworkflow.util.rdfhelp import get_model, dump_model
  15
  16 from django.conf import settings
  17 from django.template import Context, loader
  18
  19 if not 'DJANGO_SETTINGS_MODULE' in os.environ:
  20     os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
  21
  22 LOGGER = logging.getLogger(__name__)
  23
  24 ESEARCH_URL="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
  25 EFETCH_URL="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
  26 DB = 'sra'
  27 DEFAULT_QUERY = 'wgEncodeCaltechRnaSeq OR wgEncodeCaltechHist OR wgEncodeCaltechTfbs'
  28
  29 def search_ncbi_ids(database, term, return_max=200):
  30     """Return list of IDs from a NCBI database
  31     database - which ncbi database to search
  32     term - ncbi query string
  33     return_max - maximum records to return
  34     """
  35     search = {'db': database,
  36               'term': term,
  37               'retmax': return_max}
  38     tree = parse(ESEARCH_URL + urllib.parse.urlencode(search))
  39     root = tree.getroot()
  40     count = get_node_scalar(root, '/eSearchResult/Count', int)
  41     retmax_node = get_node_scalar(root, '/eSearchResult/RetMax', int)
  42
  43     if count > retmax_node:
  44         raise ValueError("Too many values returned please adjust query")
  45
  46     id_nodes = tree.xpath('/eSearchResult/IdList/Id')
  47     if len(id_nodes) != count:
  48         errmsg = "Weird. Length of ID list ({0}) doesn't match count ({1})"
  49         raise ValueError(errmsg.format(len(id_nodes), count))
  50
  51     ids = [ x.text for x in id_nodes ]
  52     return ids
  53
  54 def parse_sra_metadata_into_model(model, ncbi_id):
  55     """Extract SRA data by looking up a NCBI ID.
  56     """
  57     search = {'db':DB,
  58               'id': ncbi_id}
  59     url = EFETCH_URL + urllib.parse.urlencode(search)
  60     tree = parse(url)
  61
  62     context = Context()
  63     sra_rdf_template = loader.get_template('sra.rdfxml.xsl')
  64     sra_rdf_stylesheet = sra_rdf_template.render(context)
  65     sra_rdf_transform = XSLT(fromstring(sra_rdf_stylesheet))
  66     rdfdata = tostring(sra_rdf_transform(tree))
  67     rdfparser = RDF.Parser(name='rdfxml')
  68     rdfparser.parse_string_into_model(model, rdfdata, url)
  69
  70 def get_node_scalar(parent, xpath, target_type=None):
  71     """Return a single value from an xpath search, possibily type converted
  72
  73     target_type pass a constructor that takes a string to convert result
  74     of search
  75     """
  76     node = parent.xpath(xpath)
  77     if node is None or len(node) != 1:
  78         raise ValueError("Wrong response, incorrect number of {0} tags".xpath)
  79     if target_type is not None:
  80         return target_type(node[0].text)
  81     else:
  82         return node[0].text
  83
  84 def main(cmdline=None):
  85     """Quick driver for importing data from SRA"""
  86     parser = make_parser()
  87     opts, args = parser.parse_args(cmdline)
  88
  89     if opts.debug:
  90         logging.basicConfig(level=logging.DEBUG)
  91     elif opts.verbose:
  92         logging.basicConfig(level=logging.INFO)
  93     else:
  94         logging.basicConfig(level=logging.WARN)
  95
  96     model = get_model(opts.database, opts.dbpath)
  97
  98     ids = search_ncbi_ids('sra', opts.query)
  99     for count, encode_id in enumerate(ids[:1]):
 100         LOGGER.info("processing %s %d / %d", encode_id, count+1, len(ids))
 101         parse_sra_metadata_into_model(model, encode_id)
 102
 103     if opts.dump:
 104         dump_model(model)
 105
 106 def make_parser():
 107     parser = OptionParser()
 108     parser.add_option('--dbpath', help="Database directory",
 109                       default=os.getcwd())
 110     parser.add_option('--database', help="Database name", default=None)
 111     parser.add_option('--dump', help="dump database", default=False,
 112                       action="store_true")
 113     parser.add_option('--query', help='specify NCBI search terms',
 114                       default=DEFAULT_QUERY)
 115     parser.add_option("-v", "--verbose", action="store_true", default=False)
 116     parser.add_option("--debug", action="store_true", default=False)
 117     return parser
 118
 119 if __name__ == "__main__":
 120     main()