Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
authorDiane Trout <diane@caltech.edu>
Sat, 12 May 2012 00:49:59 +0000 (17:49 -0700)
committerDiane Trout <diane@caltech.edu>
Sat, 12 May 2012 00:49:59 +0000 (17:49 -0700)
15 files changed:
encode_submission/find-lib-by-cell.sparql
htsworkflow/frontend/samples/fixtures/initial_data.json
htsworkflow/submission/geo.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_files.sparql [new file with mode: 0644]
htsworkflow/templates/geo_platform.sparql [new file with mode: 0644]
htsworkflow/templates/geo_samples.sparql [new file with mode: 0644]
htsworkflow/templates/geo_series.sparql [new file with mode: 0644]
htsworkflow/templates/geo_submission.soft
htsworkflow/templates/geo_submission.sparql [deleted file]
htsworkflow/util/alphanum.py
htsworkflow/util/rdfhelp.py
htsworkflow/util/test/extra.turtle [new file with mode: 0644]
htsworkflow/util/test/test_alphanum.py
htsworkflow/util/test/test_rdfhelp.py

index c4585c5bd4ce6cd7d038a85c64c993b2fee94934..eac66c14e8347630dd0050e16e4c6fbe6450f47d 100644 (file)
@@ -6,12 +6,15 @@ PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
 
 SELECT distinct ?liburn ?cell ?replicate ?subid ?name ?submission_date
 WHERE {
-    ?subid ucscSubmission:name ?name .
-    OPTIONAL { ?subid ucscSubmission:library_urn ?liburn ;
-                       libraryOntology:date ?submission_date .
-               ?liburn libraryOntology:cell_line ?cell ;
-                       libraryOntology:replicate ?replicate . }
+    ?subid ucscSubmission:name ?name ;
+           ucscSubmission:library_urn ?liburn ;
+           libraryOntology:date ?submission_date.
+    ?liburn libraryOntology:cell_line ?cell ;
+            libraryOntology:replicate ?replicate ;
+            libraryOntology:species ?species .
+    OPTIONAL { ?liburn libraryOntology:treatement ?treatment . }
+    FILTER(regex(?species, "Homo sapiens", "i"))
     #filter(?submission_date > "2011-04-01T00:00:00Z"^^xsd:dateTime)
     #filter(!bound(?liburn))
 }
-ORDER BY ?submission_date ?cell ?replicate ?liburn
+ORDER BY ?liburn ?submission_date
\ No newline at end of file
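Note: the reworked query above now requires a library URN, cell line, and replicate for every row and restricts results to human libraries via the regex FILTER. A minimal sketch of running such a template against a librdf model (the model contents and working directory are assumptions for illustration):

    # Hypothetical: execute the query above against an existing RDF.Model.
    import RDF
    from htsworkflow.util.rdfhelp import get_model

    model = get_model()  # assumes submission data was loaded elsewhere
    query_body = open('encode_submission/find-lib-by-cell.sparql', 'r').read()
    query = RDF.SPARQLQuery(query_body)
    for row in query.execute(model):
        print row['liburn'], row['cell'], row['replicate']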
index 76a221e029f920d1181b969017b4a273c61ca2ab..de3ad89a6211bc51319833ad45c5c4b0ab5dd9b8 100644 (file)
@@ -11,7 +11,7 @@
      "model": "samples.Cellline",
      "pk": 2,
      "fields": {
-        "cellline_name": "C2C12 Exponential",
+        "cellline_name": "C2C12",
         "notes": ""
      }
   },
index 737b1bb353458ceb0d961adf551a680cc1f9399f..8ff349fe9ae7444c5cc82f20c15e8458d6cc0599 100644 (file)
@@ -1,4 +1,5 @@
 import logging
+import os
 
 import RDF
 
@@ -6,6 +7,8 @@ from htsworkflow.submission.submission import Submission
 
 from htsworkflow.util.rdfhelp import \
      fromTypedNode, \
+     geoSoftNS, \
+     simplifyUri, \
      submissionOntology
 
 from django.conf import settings
@@ -19,13 +22,33 @@ class GEOSubmission(Submission):
 
     def make_soft(self, result_map):
         samples = []
+        platform = self.get_platform_metadata()
+        platform_attribs = dict(platform)
+        platform_id = platform_attribs['^platform']
+        series = self.get_series_metadata()
+        series_attribs = dict(series)
+        series_id = series_attribs['^series']
         for lib_id, result_dir in result_map.items():
             an_analysis = self.get_submission_node(result_dir)
-            samples.append(self.get_sample_metadata(an_analysis))
+            metadata = self.get_sample_metadata(an_analysis)
+            if len(metadata) > 1:
+                errmsg = 'Confused: found more than one sample for %s'
+                LOGGER.debug(errmsg % (str(an_analysis),))
+            metadata = metadata[0]
+            metadata['raw'] = self.get_sample_files(an_analysis,
+                                                    geoSoftNS['raw'])
+            metadata['supplemental'] = self.get_sample_files(
+                an_analysis,
+                geoSoftNS['supplemental'])
+            samples.append(metadata)
 
         soft_template = loader.get_template('geo_submission.soft')
         context = Context({
-            'samples': samples
+            'platform': platform,
+            'series': series,
+            'samples': samples,
+            'platform_id': platform_id,
+            'series_id': series_id,
         })
         print str(soft_template.render(context))
 
@@ -39,19 +62,69 @@ class GEOSubmission(Submission):
         else:
             return True
 
+    def get_platform_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_platform.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'platform')
+
+    def get_series_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_series.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'series')
+
     def get_sample_metadata(self, analysis_node):
         """Gather information for filling out sample section of a SOFT file
         """
-        query_template = loader.get_template('geo_submission.sparql')
+        query_template = loader.get_template('geo_samples.sparql')
 
         context = Context({
             'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
             })
 
-        formatted_query = query_template.render(context)
-        query = RDF.SPARQLQuery(str(formatted_query))
-        rdfstream = query.execute(self.model)
-        results = []
-        for r in rdfstream:
-            results.append(r)
+        results = self.execute_query(query_template, context)
+        for r in results:
+            r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
         return results
+
+    def get_sample_files(self, analysis_node, file_class):
+        """Gather files
+        """
+        query_template = loader.get_template('geo_files.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(file_class)
+            })
+
+        return self.execute_query(query_template, context)
+
+    def query_to_soft_dictionary(self, results, heading):
+        attributes = []
+        for r in results:
+            name = simplifyUri(geoSoftNS, r['name'])
+            if name is not None:
+                if name.lower() == heading.lower():
+                    name = '^' + name
+                else:
+                    name = '!' + name
+                for v in fromTypedNode(r['value']).split(os.linesep):
+                    v = v.strip()
+                    if len(v) > 0:
+                        attributes.append((name, v))
+        return attributes
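For context, query_to_soft_dictionary converts query rows into (name, value) pairs, marking the entry that matches the section heading with '^' and every other attribute with '!', mirroring SOFT's entity/attribute line syntax. A sketch of the expected shape (values invented for illustration):

    # Hypothetical output of query_to_soft_dictionary(results, 'platform'):
    attributes = [
        ('^platform', 'GPL9052'),
        ('!Platform_title', 'Illumina Genome Analyzer (Homo sapiens)'),
    ]
    # The SOFT templates then render each pair as 'name = value' lines:
    for name, value in attributes:
        print '%s = %s' % (name, value)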
index 98c25d57befc4047c4cc846ecd72f26fe02c740d..e4ce90c73b073287913ec2f76f5f1d5f0d9bd887 100644 (file)
@@ -24,7 +24,7 @@ from htsworkflow.submission.daf import \
      MetadataLookupException, \
      get_submission_uri
 
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
 
 class Submission(object):
     def __init__(self, name, model):
@@ -32,7 +32,7 @@ class Submission(object):
         self.model = model
 
         self.submissionSet = get_submission_uri(self.name)
-        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
+        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
         self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
 
         self.__view_map = None
@@ -41,11 +41,11 @@ class Submission(object):
         """Examine files in our result directory
         """
         for lib_id, result_dir in result_map.items():
-            logger.info("Importing %s from %s" % (lib_id, result_dir))
+            LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
             try:
                 self.import_analysis_dir(result_dir, lib_id)
             except MetadataLookupException, e:
-                logger.error("Skipping %s: %s" % (lib_id, str(e)))
+                LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
 
     def import_analysis_dir(self, analysis_dir, library_id):
         """Import a submission directories and update our model as needed
@@ -68,18 +68,28 @@ class Submission(object):
         """
         path, filename = os.path.split(pathname)
 
-        logger.debug("Searching for view")
-        file_classification = self.find_best_match(filename)
-        if file_classification is None:
-            logger.warn("Unrecognized file: {0}".format(pathname))
+        LOGGER.debug("Searching for view")
+        file_type = self.find_best_match(filename)
+        if file_type is None:
+            LOGGER.warn("Unrecognized file: {0}".format(pathname))
             return None
-        if str(file_classification) == str(libraryOntology['ignore']):
+        if str(file_type) == str(libraryOntology['ignore']):
             return None
 
         an_analysis_name = self.make_submission_name(analysis_dir)
         an_analysis = self.get_submission_node(analysis_dir)
         an_analysis_uri = str(an_analysis.uri)
+        file_classification = self.model.get_target(file_type,
+                                                    rdfNS['type'])
+        if file_classification is None:
+            errmsg = 'Could not find class for {0}'
+            LOGGER.warning(errmsg.format(str(file_type)))
+            return
 
+        self.model.add_statement(
+            RDF.Statement(self.submissionSetNS[''],
+                          submissionOntology['has_submission'],
+                          an_analysis))
         self.model.add_statement(RDF.Statement(an_analysis,
                                                submissionOntology['name'],
                                                toTypedNode(an_analysis_name)))
@@ -91,7 +101,7 @@ class Submission(object):
                                                submissionOntology['library'],
                                                libNode))
 
-        logger.debug("Adding statements to {0}".format(str(an_analysis)))
+        LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
         # add track specific information
         self.model.add_statement(
             RDF.Statement(an_analysis,
@@ -108,8 +118,11 @@ class Submission(object):
                                              an_analysis_uri,
                                              analysis_dir)
         self.add_md5s(filename, fileNode, analysis_dir)
-
-        logger.debug("Done.")
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          rdfNS['type'],
+                          file_type))
+        LOGGER.debug("Done.")
 
     def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir):
         # add file specific information
@@ -125,12 +138,12 @@ class Submission(object):
         return fileNode
 
     def add_md5s(self, filename, fileNode, analysis_dir):
-        logger.debug("Updating file md5sum")
+        LOGGER.debug("Updating file md5sum")
         submission_pathname = os.path.join(analysis_dir, filename)
         md5 = make_md5sum(submission_pathname)
         if md5 is None:
             errmsg = "Unable to produce md5sum for {0}"
-            logger.warning(errmsg.format(submission_pathname))
+            LOGGER.warning(errmsg.format(submission_pathname))
         else:
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
@@ -178,11 +191,11 @@ class Submission(object):
         for s in self.model.find_statements(filename_query):
             view_name = s.subject
             literal_re = s.object.literal_value['string']
-            logger.debug("Found: %s" % (literal_re,))
+            LOGGER.debug("Found: %s" % (literal_re,))
             try:
                 filename_re = re.compile(literal_re)
             except re.error, e:
-                logger.error("Unable to compile: %s" % (literal_re,))
+                LOGGER.error("Unable to compile: %s" % (literal_re,))
             patterns[literal_re] = view_name
         return patterns
 
@@ -254,3 +267,14 @@ class Submission(object):
                 "Unrecognized library type %s for %s" % \
                 (library_type, str(libNode)))
 
+    def execute_query(self, template, context):
+        """Execute the query, returning the results
+        """
+        formatted_query = template.render(context)
+        LOGGER.debug(formatted_query)
+        query = RDF.SPARQLQuery(str(formatted_query))
+        rdfstream = query.execute(self.model)
+        results = []
+        for r in rdfstream:
+            results.append(r)
+        return results
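The new execute_query helper centralizes the render-template-then-run-SPARQL pattern that geo.py's metadata getters share. A hedged usage sketch, assuming `submission` is a Submission instance and Django settings are configured (the bindings are placeholders):

    # Hypothetical call to Submission.execute_query:
    from django.template import loader, Context
    from htsworkflow.util.rdfhelp import geoSoftNS

    template = loader.get_template('geo_files.sparql')
    context = Context({
        'submission': 'http://example.org/submission/example#',
        'file_class': str(geoSoftNS['raw']),
    })
    for row in submission.execute_query(template, context):
        print row['filename'], row['md5sum']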
diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql
new file mode 100644 (file)
index 0000000..7b66f4f
--- /dev/null
@@ -0,0 +1,18 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+select distinct ?filename ?md5sum ?file_type ?file_type_label
+WHERE {
+  <{{submission}}> ucscDaf:has_file ?file ;
+                   a submissionOntology:submission .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:md5sum ?md5sum ;
+        a ?file_type .
+  ?file_type a <{{file_class}}> ;
+             geoSoft:fileTypeLabel ?file_type_label .
+
+}
\ No newline at end of file
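This query expects each file node to be typed with a specific file type, and that type to itself be an instance of the requested geoSoft class, matching the rdf:type statements submission.py now adds. A sketch of the triple pattern, with invented URIs:

    # Hypothetical data setup matching geo_files.sparql's pattern.
    import RDF
    from htsworkflow.util.rdfhelp import get_model, rdfNS, geoSoftNS

    model = get_model()
    file_node = RDF.Node(RDF.Uri('http://example.org/analysis/reads.bam'))
    bam_type = RDF.Node(RDF.Uri('http://example.org/types#bam'))
    model.add_statement(RDF.Statement(file_node, rdfNS['type'], bam_type))
    model.add_statement(RDF.Statement(bam_type, rdfNS['type'], geoSoftNS['raw']))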
diff --git a/htsworkflow/templates/geo_platform.sparql b/htsworkflow/templates/geo_platform.sparql
new file mode 100644 (file)
index 0000000..4d224d7
--- /dev/null
@@ -0,0 +1,14 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+  <{{submission}}> submissionOntology:has_platform ?platform .
+  ?platform a geoSoft:platform .
+
+  ?platform ?name ?value .
+}
diff --git a/htsworkflow/templates/geo_samples.sparql b/htsworkflow/templates/geo_samples.sparql
new file mode 100644 (file)
index 0000000..850d99a
--- /dev/null
@@ -0,0 +1,41 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source
+WHERE {
+  <{{submission}}> a submissionOntology:submission .
+
+  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+  OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell .
+             ?cell_line cells:cell ?cell ;
+                        cells:documents ?growthProtocol . }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?library_id }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:species ?species_name }
+  OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+  OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
+  OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
+  OPTIONAL { ?library libraryOntology:librarySource ?library_source }
+  OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol }
+  OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule }
+  OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+  <{{submission}}> submissionOntology:library ?library ;
+                   submissionOntology:name ?name .
+  ?species libraryOntology:species ?species_name ;
+           libraryOntology:taxon_id ?taxon_id .
+
+
+}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_series.sparql b/htsworkflow/templates/geo_series.sparql
new file mode 100644 (file)
index 0000000..815f311
--- /dev/null
@@ -0,0 +1,14 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+  <{{submission}}> submissionOntology:has_series ?series.
+  ?series a geoSoft:series .
+
+  ?series ?name ?value .
+}
index ae2ac5760946a14d414fd87730812e477bdafb87..00c0b4dc77818789bfea8950507fbd1dba0c13fb 100644 (file)
@@ -1,11 +1,39 @@
-Soft template
-!Platform_title = Illumina Genome Analyzer (Homo sapiens)
-!Platform_geo_accession = GPL9052
-{% for sample in samples %}{% for row in sample %}{%if forloop.first %}
+{% for name, value in series %}{{name}} = {{value}}
+{% endfor %}!Series_platform_id = {{ platform_id }}
+{% for row in samples %}
 ^SAMPLE={{row.name}}
+!Sample_type=SRA
 !Sample_title={{row.name}}
+!Sample_series_id = {{ series_id }}
+!Sample_instrument_model = Illumina Genome Analyzer
+!Sample_instrument_model = Illumina Genome Analyzer II
+!Sample_instrument_model = Illumina Genome Analyzer IIx
+!Sample_instrument_model = Illumina HiSeq 2000
+!Sample_channel_count = 1
 !Sample_organism_ch1 = {{ row.species_name }}
 !Sample_taxid_ch1 = {{ row.taxon_id }}
-{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }}
-{% endif %}{% endspaceless %}{% endif %}
-!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %}
\ No newline at end of file
+!Sample_platform_id = {{ platform_id }}
+!Sample_source_name_ch1={{row.cell}}
+!Sample_library_strategy={{ row.experiment_type }}
+!Sample_library_source={{row.library_source}}
+!Sample_library_selection={{ row.library_selection }}
+!Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
+!Sample_extract_protocol={{ row.extractProtocol|safe }}
+!Sample_data_processing={{ row.dataProtocol|safe }}
+!Sample_molecule_ch1 = {{ row.extractMolecule }}
+!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+{% if row.cell %}{% spaceless %}
+!Sample_characteristics_ch1 = cell: {{ row.cell }}
+{% endspaceless %}{% endif %}
+{% if row.readType %}{% spaceless %}
+!Sample_characteristics_ch1 = readType: {{ row.readType }}
+{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
+!Sample_characteristics_ch1 = antibody: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for raw in row.raw %}
+!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplemental %}
+!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
+{% endfor %}{% endfor %}
\ No newline at end of file
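make_soft feeds this template the platform/series attribute lists plus the per-sample dictionaries. A minimal sketch of rendering it standalone, assuming Django settings are configured (context values are placeholders):

    # Hypothetical: render the SOFT template outside of GEOSubmission.
    from django.template import loader, Context

    soft_template = loader.get_template('geo_submission.soft')
    context = Context({
        'series': [('^series', 'example-series')],
        'platform': [('^platform', 'GPL9052')],
        'samples': [],
        'platform_id': 'GPL9052',
        'series_id': 'example-series',
    })
    print str(soft_template.render(context))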
diff --git a/htsworkflow/templates/geo_submission.sparql b/htsworkflow/templates/geo_submission.sparql
deleted file mode 100644 (file)
index 1d7cbb1..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
-
-select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id
-WHERE {
-  <{{submission}}> a submissionOntology:submission .
-
-  OPTIONAL { ?submission ucscDaf:control ?control }
-  #OPTIONAL { ?submission ucscDaf:controlId ?controlId }
-  #OPTIONAL { ?library libraryOntology:antibody ?antibody }
-  OPTIONAL { ?library libraryOntology:cell_line ?cell } .
-  #OPTIONAL { ?library ucscDaf:sex ?sex }
-  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
-  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
-  OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:species ?species_name }
-  
-  #OPTIONAL { ?library libraryOntology:condition_term ?treatment }
-  #OPTIONAL { ?library ucscDaf:protocol ?protocol }
-  #OPTIONAL { ?library ucscDaf:readType ?readType }
-  #OPTIONAL { ?library ucscDaf:strain ?strain }
-  #OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
-  #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
-
-  <{{submission}}> submissionOntology:library ?library ;
-                   ucscDaf:has_file ?file ;
-                   submissionOntology:name ?name .
-  ?species libraryOntology:species ?species_name ;
-           libraryOntology:taxon_id ?taxon_id .
-  ?file ucscDaf:filename ?filename .
-}
index 5e2560659bef32454415fa8d091023a540e428cf..c9d66495e76fe672a5c41019ce378358c4ee736b 100644 (file)
@@ -35,10 +35,10 @@ def chunkify(str):
     return a list of numbers and non-numeric substrings of +str+
     the numeric substrings are converted to integer, non-numeric are left as is
     """
-    if type(str) in types.StringTypes: 
+    if type(str) in types.StringTypes:
         chunks = re.findall("(\d+|\D+)",str)
         #convert numeric strings to numbers
-        chunks = [re.match('\d',x) and int(x) or x for x in chunks] 
+        chunks = [re.match('\d',x) and int(x) or x for x in chunks]
         return chunks
     elif type(str) in [types.IntType, types.LongType, types.FloatType]:
         return [str]
@@ -58,12 +58,3 @@ def alphanum(a,b):
     bChunks = chunkify(b)
 
     return cmp(aChunks,bChunks) #built in comparison works once data is prepared
-
-
-
-if __name__ == "__main__":
-       unsorted = ["1000X Radonius Maximus","10X Radonius","200X Radonius","20X Radonius","20X Radonius Prime","30X Radonius","40X Radonius","Allegia 50 Clasteron","Allegia 500 Clasteron","Allegia 51 Clasteron","Allegia 51B Clasteron","Allegia 52 Clasteron","Allegia 60 Clasteron","Alpha 100","Alpha 2","Alpha 200","Alpha 2A","Alpha 2A-8000","Alpha 2A-900","Callisto Morphamax","Callisto Morphamax 500","Callisto Morphamax 5000","Callisto Morphamax 600","Callisto Morphamax 700","Callisto Morphamax 7000","Callisto Morphamax 7000 SE","Callisto Morphamax 7000 SE2","QRS-60 Intrinsia Machine","QRS-60F Intrinsia Machine","QRS-62 Intrinsia Machine","QRS-62F Intrinsia Machine","Xiph Xlater 10000","Xiph Xlater 2000","Xiph Xlater 300","Xiph Xlater 40","Xiph Xlater 5","Xiph Xlater 50","Xiph Xlater 500","Xiph Xlater 5000","Xiph Xlater 58"]
-       sorted = unsorted[:]
-       sorted.sort(alphanum)
-       print '+++++Sorted...++++'
-       print '\n'.join(sorted)
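For context, chunkify splits a string into alternating non-digit and digit runs and converts the digit runs to integers, which is what makes alphanum sort '5' before '40' before '500'. A small illustration:

    # Illustration of the natural-sort helpers above.
    from htsworkflow.util.alphanum import chunkify, alphanum

    print chunkify('Alpha 2A-900')  # ['Alpha ', 2, 'A-', 900]

    names = ['Xiph Xlater 500', 'Xiph Xlater 5', 'Xiph Xlater 40']
    names.sort(alphanum)
    print names  # ['Xiph Xlater 5', 'Xiph Xlater 40', 'Xiph Xlater 500']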
index 1f9ec6179f2f933d1b52a1afefd7cdbe2e873537..6aa9627f9e5bcbf833080f8cfa24f31908b4a390 100644 (file)
@@ -1,10 +1,14 @@
 """Helper features for working with librdf
 """
 from datetime import datetime
+from urlparse import urlparse, urlunparse
+from urllib2 import urlopen
 import logging
 import os
 import types
 
+import lxml.html
+import lxml.html.clean
 import RDF
 
 logger = logging.getLogger(__name__)
@@ -13,37 +17,45 @@ logger = logging.getLogger(__name__)
 owlNS = RDF.NS('http://www.w3.org/2002/07/owl#')
 dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
 rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
-rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
+rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
 xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")
 
 # internal ontologies
-submissionOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
+submissionOntology = RDF.NS(
+    "http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
 dafTermOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/UcscDaf#")
 libraryOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
-inventoryOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/InventoryOntology#")
+inventoryOntology = RDF.NS(
+    "http://jumpgate.caltech.edu/wiki/InventoryOntology#")
 submissionLog = RDF.NS("http://jumpgate.caltech.edu/wiki/SubmissionsLog/")
+geoSoftNS = RDF.NS('http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#')
 
 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
 
+
 def sparql_query(model, query_filename):
     """Execute sparql query from file
     """
     logger.info("Opening: %s" % (query_filename,))
-    query_body = open(query_filename,'r').read()
+    query_body = open(query_filename, 'r').read()
     query = RDF.SPARQLQuery(query_body)
     results = query.execute(model)
     display_query_results(results)
 
+
 def display_query_results(results):
+    """A very simple display of sparql query results showing name value pairs
+    """
     for row in results:
-        output = []
-        for k,v in row.items()[::-1]:
-            print "{0}: {1}".format(k,v)
+        for k, v in row.items()[::-1]:
+            print "{0}: {1}".format(k, v)
         print
 
 
 def blankOrUri(value=None):
+    """Return a blank node for None or a resource node for strings.
+    """
     node = None
     if value is None:
         node = RDF.Node()
@@ -56,6 +68,8 @@ def blankOrUri(value=None):
 
 
 def toTypedNode(value):
+    """Convert a python variable to a RDF Node with its closest xsd type
+    """
     if type(value) == types.BooleanType:
         value_type = xsdNS['boolean'].uri
         if value:
@@ -84,13 +98,14 @@ def toTypedNode(value):
         node = RDF.Node(literal=unicode(value).encode('utf-8'))
     return node
 
+
 def fromTypedNode(node):
+    """Convert a typed RDF Node to its closest python equivalent
+    """
     if node is None:
         return None
 
-    value_type = str(node.literal_value['datatype'])
-    # chop off xml schema declaration
-    value_type = value_type.replace(str(xsdNS[''].uri),'')
+    value_type = get_node_type(node)
     literal = node.literal_value['string']
     literal_lower = literal.lower()
 
@@ -112,11 +127,42 @@ def fromTypedNode(node):
     elif value_type in ('dateTime'):
         try:
             return datetime.strptime(literal, ISOFORMAT_MS)
-        except ValueError, e:
+        except ValueError, _:
             return datetime.strptime(literal, ISOFORMAT_SHORT)
     return literal
 
 
+def get_node_type(node):
+    """Return just the base name of a XSD datatype:
+    e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
+    """
+    # chop off xml schema declaration
+    value_type = node.literal_value['datatype']
+    if value_type is None:
+        return "string"
+    else:
+        value_type = str(value_type)
+        return value_type.replace(str(xsdNS[''].uri), '')
+
+
+def simplifyUri(namespace, term):
+    """Remove the namespace portion of a term
+
+    Returns None if the term is not within the given namespace.
+    """
+    if isinstance(term, RDF.Node):
+        if term.is_resource():
+            term = term.uri
+        else:
+            raise ValueError("This works on resources")
+    elif not isinstance(term, RDF.Uri):
+        raise ValueError("This works on resources")
+    term_s = str(term)
+    if not term_s.startswith(namespace._prefix):
+        return None
+    return term_s.replace(namespace._prefix, "")
+
+
 def get_model(model_name=None, directory=None):
     if directory is None:
         directory = os.getcwd()
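simplifyUri returns the term's local name when it falls under the given namespace and None otherwise; geo.py's query_to_soft_dictionary relies on the None case to skip predicates outside the geoSoft vocabulary. A short sketch with invented namespaces:

    # Illustration of simplifyUri's matching behaviour.
    import RDF
    from htsworkflow.util.rdfhelp import simplifyUri

    ns = RDF.NS('http://example.org/vocab#')
    other = RDF.NS('http://example.com/vocab#')
    print simplifyUri(ns, ns['platform'])     # 'platform'
    print simplifyUri(other, ns['platform'])  # None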
@@ -133,20 +179,79 @@ def get_model(model_name=None, directory=None):
     return model
 
 
-def load_into_model(model, parser_name, filename, ns=None):
-    if not os.path.exists(filename):
-        raise IOError("Can't find {0}".format(filename))
-
-    data = open(filename, 'r').read()
+def load_into_model(model, parser_name, path, ns=None):
+    url_parts = list(urlparse(path))
+    if len(url_parts[0]) == 0:
+        url_parts[0] = 'file'
+        url_parts[2] = os.path.abspath(url_parts[2])
+    url = urlunparse(url_parts)
+    logger.info("Opening %s" % (url,))
+    req = urlopen(url)
+    logger.debug("request status: %s" % (req.code,))
+    if parser_name is None:
+        content_type = req.headers.get('Content-Type', None)
+        parser_name = guess_parser(content_type, path)
+        logger.debug("Guessed parser: %s" % (parser_name,))
+    data = req.read()
     load_string_into_model(model, parser_name, data, ns)
 
 
 def load_string_into_model(model, parser_name, data, ns=None):
     if ns is None:
-        ns = "http://localhost/"
-
+        ns = RDF.NS("http://localhost/")
+    imports = owlNS['imports']
     rdf_parser = RDF.Parser(name=parser_name)
-    rdf_parser.parse_string_into_model(model, data, ns)
+    for s in rdf_parser.parse_string_as_stream(data, ns):
+        if s.predicate == imports:
+            obj = str(s.object)
+            logger.info("Importing %s" % (obj,))
+            load_into_model(model, None, obj, ns)
+        if s.object.is_literal():
+            value_type = get_node_type(s.object)
+            if value_type == 'string':
+                s.object = sanitize_literal(s.object)
+        model.add_statement(s)
+
+
+def sanitize_literal(node):
+    """Clean up a literal string
+    """
+    if not isinstance(node, RDF.Node):
+        raise ValueError("sanitize_literal only works on RDF.Nodes")
+
+    element = lxml.html.fromstring(node.literal_value['string'])
+    cleaner = lxml.html.clean.Cleaner(page_structure=False)
+    element = cleaner.clean_html(element)
+    text = lxml.html.tostring(element)
+    p_len = 3
+    slash_p_len = 4
+
+    args = {'literal': text[p_len:-slash_p_len]}
+    datatype = node.literal_value['datatype']
+    if datatype is not None:
+        args['datatype'] = datatype
+    language = node.literal_value['language']
+    if language is not None:
+        args['language'] = language
+    return RDF.Node(**args)
+
+
+def guess_parser(content_type, pathname):
+    # NB: membership tests need real tuples; ('foo') is just a string, and
+    # `in` against a string is a substring test rather than membership.
+    if content_type in ('application/rdf+xml',):
+        return 'rdfxml'
+    elif content_type in ('application/x-turtle',):
+        return 'turtle'
+    elif content_type in ('text/html',):
+        return 'rdfa'
+    elif content_type is None:
+        # os.path.splitext keeps the leading dot on the extension
+        _, ext = os.path.splitext(pathname)
+        if ext in ('.xml', '.rdf'):
+            return 'rdfxml'
+        elif ext in ('.html',):
+            return 'rdfa'
+        elif ext in ('.turtle',):
+            return 'turtle'
+    return 'guess'
 
 
 def get_serializer(name='turtle'):
@@ -165,6 +270,7 @@ def get_serializer(name='turtle'):
     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
     return writer
 
+
 def dump_model(model):
     serializer = get_serializer()
     print serializer.serialize_model_to_string(model)
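load_into_model now accepts either a filesystem path or a URL, follows owl:imports statements while streaming, and sanitizes string literals as they are added. A hedged sketch of loading the test fixture (the path is illustrative):

    # Hypothetical: load a local turtle file and dump the resulting model.
    from htsworkflow.util.rdfhelp import get_model, load_into_model, dump_model

    model = get_model()  # default storage; see get_model above
    load_into_model(model, 'turtle', 'htsworkflow/util/test/extra.turtle')
    dump_model(model)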
diff --git a/htsworkflow/util/test/extra.turtle b/htsworkflow/util/test/extra.turtle
new file mode 100644 (file)
index 0000000..9e77f14
--- /dev/null
@@ -0,0 +1,6 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+
+<http://jumpgate.caltech.edu/wiki/TestCase>
+        rdfs:label "TestCase" .
\ No newline at end of file
index f7b488d0b5cf575f04ec525c0b74bfd55b2bc659..3fd60250cbe392f9208c077076253fd6aed10f94 100644 (file)
@@ -27,6 +27,19 @@ class testAlphanum(unittest.TestCase):
       for i in xrange(len(scratch)):
         self.failUnlessEqual(scratch[i], sorted[i])
 
+    def test_long_names(self):
+        unsorted = ["1000X Radonius Maximus","10X Radonius","200X Radonius","20X Radonius","20X Radonius Prime","30X Radonius","40X Radonius","Allegia 50 Clasteron","Allegia 500 Clasteron","Allegia 51 Clasteron","Allegia 51B Clasteron","Allegia 52 Clasteron","Allegia 60 Clasteron","Alpha 100","Alpha 2","Alpha 200","Alpha 2A","Alpha 2A-8000","Alpha 2A-900","Callisto Morphamax","Callisto Morphamax 500","Callisto Morphamax 5000","Callisto Morphamax 600","Callisto Morphamax 700","Callisto Morphamax 7000","Callisto Morphamax 7000 SE","Callisto Morphamax 7000 SE2","QRS-60 Intrinsia Machine","QRS-60F Intrinsia Machine","QRS-62 Intrinsia Machine","QRS-62F Intrinsia Machine","Xiph Xlater 10000","Xiph Xlater 2000","Xiph Xlater 300","Xiph Xlater 40","Xiph Xlater 5","Xiph Xlater 50","Xiph Xlater 500","Xiph Xlater 5000","Xiph Xlater 58"]
+        expected = ['10X Radonius', '20X Radonius', '20X Radonius Prime', '30X Radonius', '40X Radonius', '200X Radonius', '1000X Radonius Maximus', 'Allegia 50 Clasteron', 'Allegia 51 Clasteron', 'Allegia 51B Clasteron', 'Allegia 52 Clasteron', 'Allegia 60 Clasteron', 'Allegia 500 Clasteron', 'Alpha 2', 'Alpha 2A', 'Alpha 2A-900', 'Alpha 2A-8000', 'Alpha 100', 'Alpha 200', 'Callisto Morphamax', 'Callisto Morphamax 500', 'Callisto Morphamax 600', 'Callisto Morphamax 700', 'Callisto Morphamax 5000', 'Callisto Morphamax 7000', 'Callisto Morphamax 7000 SE', 'Callisto Morphamax 7000 SE2', 'QRS-60 Intrinsia Machine', 'QRS-60F Intrinsia Machine', 'QRS-62 Intrinsia Machine', 'QRS-62F Intrinsia Machine', 'Xiph Xlater 5', 'Xiph Xlater 40', 'Xiph Xlater 50', 'Xiph Xlater 58', 'Xiph Xlater 300', 'Xiph Xlater 500', 'Xiph Xlater 2000', 'Xiph Xlater 5000', 'Xiph Xlater 10000']
+
+        s = unsorted[:]
+        s.sort(alphanum)
+        self.failUnlessEqual(s, expected)
+
+    def test_bad_input(self):
+        unsorted = [object(), (1,3j)]
+        s = unsorted[:]
+        self.failUnlessRaises(ValueError, s.sort, alphanum)
+
 
 def suite():
     return unittest.makeSuite(testAlphanum, 'test')
index df275edd095b818301308816589000405e67833d..55f590f53ba198f713a91ea5dbe36ce05a660897 100644 (file)
+import os
 import unittest
 import types
 
+
 from datetime import datetime
 
 from htsworkflow.util.rdfhelp import \
      blankOrUri, \
-     toTypedNode, \
+     dump_model, \
      fromTypedNode, \
+     get_model, \
+     load_string_into_model, \
+     rdfsNS, \
+     toTypedNode, \
+     simplifyUri, \
+     sanitize_literal, \
      xsdNS
 
 try:
-  import RDF
-
-  class TestRDFHelp(unittest.TestCase):
-      def test_typed_node_boolean(self):
-          node = toTypedNode(True)
-          self.failUnlessEqual(node.literal_value['string'], u'1')
-          self.failUnlessEqual(str(node.literal_value['datatype']),
-                               'http://www.w3.org/2001/XMLSchema#boolean')
-
-      def test_typed_node_string(self):
-          node = toTypedNode('hello')
-          self.failUnlessEqual(node.literal_value['string'], u'hello')
-          self.failUnless(node.literal_value['datatype'] is None)
-
-      def test_blank_or_uri_blank(self):
-          node = blankOrUri()
-          self.failUnlessEqual(node.is_blank(), True)
-
-      def test_blank_or_uri_url(self):
-          s = 'http://google.com'
-          node = blankOrUri(s)
-          self.failUnlessEqual(node.is_resource(), True)
-          self.failUnlessEqual(str(node.uri), s)
-
-      def test_blank_or_uri_node(self):
-          s = RDF.Node(RDF.Uri('http://google.com'))
-          node = blankOrUri(s)
-          self.failUnlessEqual(node.is_resource(), True)
-          self.failUnlessEqual(node, s)
-
-      def test_unicode_node_roundtrip(self):
-          literal = u'\u5927'
-          roundtrip = fromTypedNode(toTypedNode(literal))
-          self.failUnlessEqual(roundtrip, literal)
-          self.failUnlessEqual(type(roundtrip), types.UnicodeType)
-
-      def test_datetime_no_microsecond(self):
-          dateTimeType = xsdNS['dateTime'].uri
-          short_isostamp = '2011-12-20T11:44:25'
-          short_node = RDF.Node(literal=short_isostamp,
-                               datatype=dateTimeType)
-          short_datetime = datetime(2011,12,20,11,44,25)
-
-          self.assertEqual(fromTypedNode(short_node), short_datetime)
-          self.assertEqual(toTypedNode(short_datetime), short_node)
-          self.assertEqual(fromTypedNode(toTypedNode(short_datetime)),
-                           short_datetime)
-
-      def test_datetime_with_microsecond(self):
-          dateTimeType = xsdNS['dateTime'].uri
-          long_isostamp = '2011-12-20T11:44:25.081776'
-          long_node = RDF.Node(literal=long_isostamp,
-                               datatype=dateTimeType)
-          long_datetime = datetime(2011,12,20,11,44,25,81776)
-
-          self.assertEqual(fromTypedNode(long_node), long_datetime)
-          self.assertEqual(toTypedNode(long_datetime), long_node)
-          self.assertEqual(fromTypedNode(toTypedNode(long_datetime)),
-                           long_datetime)
-
-  def suite():
-      return unittest.makeSuite(testRdfHelp, 'test')
+    import RDF
+
+    class TestRDFHelp(unittest.TestCase):
+        def test_from_none(self):
+            self.failUnlessEqual(fromTypedNode(None), None)
+
+        def test_typed_node_boolean(self):
+            node = toTypedNode(True)
+            self.failUnlessEqual(node.literal_value['string'], u'1')
+            self.failUnlessEqual(str(node.literal_value['datatype']),
+                                 'http://www.w3.org/2001/XMLSchema#boolean')
+
+        def test_bad_boolean(self):
+            node = RDF.Node(literal='bad', datatype=xsdNS['boolean'].uri)
+            self.failUnlessRaises(ValueError, fromTypedNode, node)
+
+        def test_typed_node_string(self):
+            node = toTypedNode('hello')
+            self.failUnlessEqual(node.literal_value['string'], u'hello')
+            self.failUnless(node.literal_value['datatype'] is None)
+
+        def test_typed_real_like(self):
+            num = 3.14
+            node = toTypedNode(num)
+            self.failUnlessEqual(fromTypedNode(node), num)
+
+        def test_typed_integer(self):
+            num = 3
+            node = toTypedNode(num)
+            self.failUnlessEqual(fromTypedNode(node), num)
+            self.failUnlessEqual(type(fromTypedNode(node)), type(num))
+
+        def test_typed_node_string_roundtrip(self):
+            s = "Argh matey"
+            node = toTypedNode(s)
+            self.failUnlessEqual(fromTypedNode(node), s)
+            self.failUnlessEqual(type(fromTypedNode(node)), types.UnicodeType)
+
+        def test_blank_or_uri_blank(self):
+            node = blankOrUri()
+            self.failUnlessEqual(node.is_blank(), True)
+
+        def test_blank_or_uri_url(self):
+            s = 'http://google.com'
+            node = blankOrUri(s)
+            self.failUnlessEqual(node.is_resource(), True)
+            self.failUnlessEqual(str(node.uri), s)
+
+        def test_blank_or_uri_node(self):
+            s = RDF.Node(RDF.Uri('http://google.com'))
+            node = blankOrUri(s)
+            self.failUnlessEqual(node.is_resource(), True)
+            self.failUnlessEqual(node, s)
+
+        def test_unicode_node_roundtrip(self):
+            literal = u'\u5927'
+            roundtrip = fromTypedNode(toTypedNode(literal))
+            self.failUnlessEqual(roundtrip, literal)
+            self.failUnlessEqual(type(roundtrip), types.UnicodeType)
+
+        def test_datetime_no_microsecond(self):
+            dateTimeType = xsdNS['dateTime'].uri
+            short_isostamp = '2011-12-20T11:44:25'
+            short_node = RDF.Node(literal=short_isostamp,
+                                 datatype=dateTimeType)
+            short_datetime = datetime(2011,12,20,11,44,25)
+
+            self.assertEqual(fromTypedNode(short_node), short_datetime)
+            self.assertEqual(toTypedNode(short_datetime), short_node)
+            self.assertEqual(fromTypedNode(toTypedNode(short_datetime)),
+                             short_datetime)
+
+        def test_datetime_with_microsecond(self):
+            dateTimeType = xsdNS['dateTime'].uri
+            long_isostamp = '2011-12-20T11:44:25.081776'
+            long_node = RDF.Node(literal=long_isostamp,
+                                 datatype=dateTimeType)
+            long_datetime = datetime(2011,12,20,11,44,25,81776)
+
+            self.assertEqual(fromTypedNode(long_node), long_datetime)
+            self.assertEqual(toTypedNode(long_datetime), long_node)
+            self.assertEqual(fromTypedNode(toTypedNode(long_datetime)),
+                             long_datetime)
+
+        def test_simplify_uri(self):
+            nsOrg = RDF.NS('example.org/example#')
+            nsCom = RDF.NS('example.com/example#')
+
+            term = 'foo'
+            node = nsOrg[term]
+            self.failUnlessEqual(simplifyUri(nsOrg, node), term)
+            self.failUnlessEqual(simplifyUri(nsCom, node), None)
+            self.failUnlessEqual(simplifyUri(nsOrg, node.uri), term)
+
+        def test_simplify_uri_exceptions(self):
+            nsOrg = RDF.NS('example.org/example#')
+            nsCom = RDF.NS('example.com/example#')
+
+            node = toTypedNode('bad')
+            self.failUnlessRaises(ValueError, simplifyUri, nsOrg, node)
+            self.failUnlessRaises(ValueError, simplifyUri, nsOrg, nsOrg)
+
+        def test_owl_import(self):
+            path, name = os.path.split(__file__)
+            loc = 'file://'+os.path.abspath(path)+'/'
+            model = get_model()
+            fragment = '''
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+
+_:a owl:imports "{loc}extra.turtle" .
+'''.format(loc=loc)
+            load_string_into_model(model, 'turtle', fragment, loc)
+            tc = RDF.Node(RDF.Uri('http://jumpgate.caltech.edu/wiki/TestCase'))
+            query = RDF.Statement(tc, rdfsNS['label'], None)
+            result = list(model.find_statements(query))
+            self.failUnlessEqual(len(result), 1)
+            self.failUnlessEqual(str(result[0].object), 'TestCase')
+
+        def test_sanitize_literal_text(self):
+            self.failUnlessRaises(ValueError, sanitize_literal, "hi")
+            hello_text = "hello"
+            hello_none = RDF.Node(hello_text)
+            self.failUnlessEqual(str(sanitize_literal(hello_none)),
+                                 hello_text)
+            hello_str = RDF.Node(literal=hello_text,
+                                 datatype=xsdNS['string'].uri)
+            self.failUnlessEqual(str(sanitize_literal(hello_str)),
+                                 hello_text)
+
+        def test_sanitize_literal_html(self):
+            hello = "hello <a onload='javascript:alert(\"foo\");' href='http://google.com'>google.com</a>, whats up?"
+            hello_clean = 'hello <a href="http://google.com">google.com</a>, whats up?'
+            hello_node = RDF.Node(literal=hello,
+                                  datatype=xsdNS['string'].uri)
+            hello_sanitized = sanitize_literal(hello_node)
+            self.failUnlessEqual(str(hello_sanitized),
+                                 hello_clean)
+
+            hostile = "hi <b>there</b><script type='text/javascript>alert('boo');</script><a href='javascript:alert('poke')>evil</a> scammer"
+            hostile_node = RDF.Node(hostile)
+            hostile_sanitized = sanitize_literal(hostile_node)
+            # so it drops the stuff after the javascript link.
+            # I suppose it could be worse
+            hostile_result = """hi <b>there</b>"""
+            self.failUnlessEqual(str(hostile_sanitized), hostile_result)
+
+
+    def suite():
+        return unittest.makeSuite(TestRDFHelp, 'test')
 except ImportError, e:
     print "Unable to test rdfhelp"