Configure daf variables in only one place. (daf.py)
author: Diane Trout <diane@caltech.edu>
Fri, 30 Sep 2011 22:57:37 +0000 (15:57 -0700)
committer: Diane Trout <diane@caltech.edu>
Fri, 30 Sep 2011 22:57:37 +0000 (15:57 -0700)
When adding things to the daf variable list, check to see if
they're already present before adding them (to prevent duplicates).
Incorporate a couple more terms to select (mapAlgorithm, strain)

extra/ucsc_encode_submission/ucsc_gather.py
htsworkflow/submission/daf.py
htsworkflow/submission/test/test_daf.py

index f1e092015a8db5fbf90a7d51feffdd340d21078a..1e6b6a66886789cb1cd3204ae47c0ccbea263165 100755 (executable)
@@ -215,7 +215,7 @@ def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None)
 PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
 PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
 
-select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate
+select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm
 WHERE {
   ?file ucscDaf:filename ?files ;
         ucscDaf:md5sum ?md5sum .
@@ -236,7 +236,9 @@ WHERE {
   OPTIONAL { ?library libraryOntology:condition ?treatment }
   OPTIONAL { ?library ucscDaf:protocol ?protocol }
   OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
   OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
 }
 ORDER BY  ?submitView"""
     dag_fragments = []
@@ -251,6 +253,7 @@ ORDER BY  ?submitView"""
         outfile = os.path.join(outdir, ddf_name)
         output = open(outfile,'w')
     else:
+        outfile = 'stdout:'
         output = sys.stdout
 
     formatted_query = query_template % {'submission': str(submissionNode.uri)}
@@ -258,11 +261,9 @@ ORDER BY  ?submitView"""
     query = RDF.SPARQLQuery(formatted_query)
     results = query.execute(view_map.model)
 
-    variables = ['files']
     # filename goes first
-    variables.extend(view_map.get_daf_variables())
+    variables = view_map.get_daf_variables()
     # 'controlId',
-    variables += [ 'labExpId', 'md5sum']
     output.write('\t'.join(variables))
     output.write(os.linesep)
 
@@ -273,6 +274,8 @@ ORDER BY  ?submitView"""
         current = all_views.setdefault(viewname, {})
         for variable_name in variables:
             value = str(fromTypedNode(row[variable_name]))
+            if value is None or value == 'None':
+                logging.warn("{0}: {1} was None".format(outfile, variable_name))
             if variable_name in ('files', 'md5sum'):
                 current.setdefault(variable_name,[]).append(value)
             else:
index 7b17f6931a6a291b15173aa732806f9561ccfdfd..c59c45c83b1f97f5b25db767cc8b859d1441a769 100644 (file)
@@ -26,6 +26,9 @@ logger = logging.getLogger(__name__)
 
 DAF_VARIABLE_NAMES = ("variables", "extraVariables")
 VARIABLES_TERM_NAME = 'variables'
+DAF_PRE_VARIABLES = ['files', 'view']
+DAF_POST_VARIABLES = [ 'labExpId', 'md5sum']
+
 
 class ModelException(RuntimeError):
     """Assumptions about the RDF model failed"""
@@ -353,12 +356,6 @@ class DAFMapper(object):
                           dafTermOntology['submission'],
                           submissionNode))
 
-        # extra information
-        terms = [dafTermOntology['type'],
-                 dafTermOntology['filename_re'],
-                 ]
-        terms.extend((dafTermOntology[v] for v in self.get_daf_variables()))
-
         # add file specific information
         self.create_file_attributes(filename, submissionView, submission_uri, submission_dir)
 
@@ -399,14 +396,17 @@ class DAFMapper(object):
         """Returns simple variables names that to include in the ddf
         """
         variables_term = dafTermOntology[VARIABLES_TERM_NAME]
-        results = ['view']
-        if self.need_replicate():
+        results = []
+        results.extend([v for v in DAF_PRE_VARIABLES if v not in results])
+        results = DAF_PRE_VARIABLES[:]
+        if self.need_replicate() and 'replicate' not in results:
             results.append('replicate')
 
         for obj in self.model.get_targets(self.submissionSet, variables_term):
             value = str(fromTypedNode(obj))
-            results.append(value)
-        results.append('labVersion')
+            if value not in results:
+                results.append(value)
+        results.extend([v for v in DAF_POST_VARIABLES if v not in results])
         return results
 
     def make_submission_name(self, submission_dir):
index 93c6e9932b23a03e2d0b5cc86bfe17a9775fcfda..8b163129583aef4f37c593e8e01d22224fa27c78 100644 (file)
@@ -282,7 +282,7 @@ thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ;
     def test_daf_with_extra(self):
         daf_mapper = load_daf_mapper('test_rep',test_daf=test_daf_extra)
         variables = daf_mapper.get_daf_variables()
-        self.assertEqual(len(variables), 9)
+        self.assertEqual(len(variables), 11)
         self.failUnless('treatment' in variables)
         self.failUnless('controlId' in variables)