Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow into django1.4
authorDiane Trout <diane@caltech.edu>
Thu, 13 Dec 2012 18:21:36 +0000 (10:21 -0800)
committerDiane Trout <diane@caltech.edu>
Thu, 13 Dec 2012 18:21:36 +0000 (10:21 -0800)
htsworkflow/submission/condorfastq.py
htsworkflow/submission/fastqname.py [new file with mode: 0644]
htsworkflow/submission/submission.py
htsworkflow/submission/test/test_fastqname.py [new file with mode: 0644]
htsworkflow/templates/geo_fastqs.sparql
htsworkflow/templates/geo_submission.soft
htsworkflow/util/rdfhelp.py

index d79502d2af7052a06e871cc0520af2e056b7ec7f..17e463351282b7b8091f922760e8b1fd69a2c934 100644 (file)
@@ -13,6 +13,7 @@ from htsworkflow.pipelines.samplekey import SampleKey
 from htsworkflow.pipelines import qseq2fastq
 from htsworkflow.pipelines import srf2fastq
 from htsworkflow.pipelines import desplit_fastq
+from htsworkflow.submission.fastqname import FastqName
 from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
      fromTypedNode, \
      stripNamespace
@@ -231,18 +232,14 @@ WHERE {
                 'lib_id': seq.library_id,
                 'lane': seq.lane_number,
                 'read': seq.read,
-                'cycle': seq.cycle
+                'cycle': seq.cycle,
+                'is_paired': seq.ispaired
             }
 
-            if seq.ispaired:
-                target_name = fastq_paired_template % \
-                              filename_attributes
-            else:
-                target_name = fastq_single_template % \
-                              filename_attributes
+            fqName = FastqName(**filename_attributes)
 
             result_dir = result_map[seq.library_id]
-            target_pathname = os.path.join(result_dir, target_name)
+            target_pathname = os.path.join(result_dir, fqName.filename)
             if self.force or not os.path.exists(target_pathname):
                 t = needed_targets.setdefault(target_pathname, {})
                 t.setdefault(seq.filetype, []).append(seq)
diff --git a/htsworkflow/submission/fastqname.py b/htsworkflow/submission/fastqname.py
new file mode 100644 (file)
index 0000000..9dd52a0
--- /dev/null
@@ -0,0 +1,88 @@
+"""Standardize reading and writing fastq submission names.
+"""
+import collections
+import re
+PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq'
+SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq'
+
+FASTQ_RE = re.compile(  # raw strings keep \d and \. as regex escapes
+    r'(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'
+    r'c(?P<cycle>[\d]+)_l(?P<lane>[\d]+)(_r(?P<read>[\d]))?\.fastq')
+
+class FastqName(collections.Mapping):
+    """Utility class to convert to the standardized submission fastq name.
+    """
+    def __init__(self, is_paired=None, **kwargs):
+        """Create a fastq name handler.
+
+        Takes either a filename or keyword attributes: flowcell, lib_id, lane, read, cycle.
+        """
+        self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
+        self._is_paired = is_paired
+
+        if len(kwargs) == 0:
+            return
+        if 'filename' in kwargs:
+            self._init_by_filename(**kwargs)
+        else:
+            self._init_by_attributes(**kwargs)
+
+    def _init_by_attributes(self, **kwargs):
+        for k in self._attributes:
+            value = None
+            if k in kwargs:
+                value = kwargs[k]
+            self[k] = value
+
+    def _init_by_filename(self, filename):
+        match = FASTQ_RE.match(filename)
+        if match is None:
+            raise ValueError('Is "{0}" a submission fastq?'.format(filename))
+
+        for k in self._attributes:
+            self[k] = match.group(k)
+
+    def _get_is_paired(self):
+        if self._is_paired is None:
+            return getattr(self, 'read', None) is not None
+        else:
+            return self._is_paired
+    def _set_is_paired(self, value):
+        self._is_paired = value
+    is_paired = property(_get_is_paired, _set_is_paired)
+
+    def _is_valid(self):
+        if self.is_paired and self['read'] is None:
+            return False
+
+        for k in self.keys():
+            if k == 'read':
+                continue
+            if self[k] is None:
+                return False
+        return True
+    is_valid = property(_is_valid)
+
+    def _get_filename(self):
+        if not self.is_valid:
+            raise ValueError(
+                "Please set all needed variables before generating a filename")
+
+        T = PAIRED_TEMPLATE if self.is_paired else SINGLE_TEMPLATE
+        return T.format(**self)
+    filename = property(_get_filename)
+
+    def __iter__(self):
+        return iter(self._attributes)
+
+    def __getitem__(self, key):
+        return getattr(self, key, None)
+
+    def __setitem__(self, key, value):
+        if key in self._attributes:
+            setattr(self, key, value)
+        else:
+            raise ValueError("Unrecognized key {0}".format(key))
+
+    def __len__(self):
+        return len([k for k in self if self[k] is not None])
index 01bdb17228689281b3f33e8aaaaa3af5ccda6d18..b3e2778ca490b8bbf4a111c1e2c0543f23910b00 100644 (file)
@@ -19,7 +19,7 @@ from htsworkflow.util.rdfhelp import \
      toTypedNode, \
      fromTypedNode
 from htsworkflow.util.hashfile import make_md5sum
-
+from htsworkflow.submission.fastqname import FastqName
 from htsworkflow.submission.daf import \
      MetadataLookupException, \
      get_submission_uri
@@ -114,18 +114,18 @@ class Submission(object):
                           an_analysis))
 
         # add file specific information
-        fileNode = self.link_file_to_classes(pathname,
-                                             an_analysis,
-                                             an_analysis_uri,
-                                             analysis_dir)
+        fileNode = self.make_file_node(pathname, an_analysis)
         self.add_md5s(filename, fileNode, analysis_dir)
+        self.add_fastq_metadata(filename, fileNode)
         self.model.add_statement(
             RDF.Statement(fileNode,
                           rdfNS['type'],
                           file_type))
         LOGGER.debug("Done.")
 
-    def link_file_to_classes(self, pathname, submissionNode, submission_uri, analysis_dir):
+    def make_file_node(self, pathname, submissionNode):
+        """Create file node and attach it to its submission.
+        """
         # add file specific information
         path, filename = os.path.split(pathname)
         fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname)))
@@ -150,6 +150,25 @@ class Submission(object):
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 
+    def add_fastq_metadata(self, filename, fileNode):
+        # How should I detect if this is actually a fastq file?
+        try:
+            fqname = FastqName(filename=filename)
+        except ValueError:
+            # currently we just skip the file if the fastq name parser rejects it
+            return
+        
+        terms = [('flowcell', libraryOntology['flowcell_id']),
+                 ('lib_id', libraryOntology['library_id']),
+                 ('lane', libraryOntology['lane_number']),
+                 ('read', libraryOntology['read']),
+                 ('cycle', libraryOntology['read_length'])]
+        for file_term, model_term in terms:
+            value = fqname.get(file_term)
+            if value is not None:
+                s = RDF.Statement(fileNode, model_term, toTypedNode(value))
+                self.model.append(s)
+
     def _add_library_details_to_model(self, libNode):
         # attributes that can have multiple values
         set_attributes = set((libraryOntology['has_lane'],
diff --git a/htsworkflow/submission/test/test_fastqname.py b/htsworkflow/submission/test/test_fastqname.py
new file mode 100644 (file)
index 0000000..d51ad0e
--- /dev/null
@@ -0,0 +1,109 @@
+from unittest2 import TestCase
+from htsworkflow.submission.fastqname import FastqName
+
+class TestFastqName(TestCase):
+    def test_init_empty(self):
+        fq = FastqName()
+        self.assertEqual(fq.is_valid, False)
+
+    def test_init_single_filename(self):
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_paired_filename(self):  # was a duplicate name that hid the single-end test
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1_r2.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_bad_filename(self):
+        attribs = {'filename': 'asdf.bam'}
+        self.assertRaises(ValueError, FastqName, **attribs)
+
+    def test_init_single_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1.fastq")
+
+    def test_init_single_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_single_attributes_set_paired(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", is_paired=True)
+        self.assertEqual(fq.is_valid, False)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_paired_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", read="2")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1_r2.fastq")
+
+    def test_init_paired_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", read="2", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_paired_attributes_set_paired(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", read="2", is_paired=True)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_insufficient_attributes(self):
+        attribs = dict(lib_id="12345", flowcell="AABBCCDDXX")
+        fq = FastqName(**attribs)
+        self.assertEqual(fq.is_valid, False)
+
+
+def suite():
+    from unittest2 import TestSuite, defaultTestLoader
+    suite = TestSuite()
+    suite.addTests(defaultTestLoader.loadTestsFromTestCase(TestFastqName))
+    return suite
+
+if __name__ == "__main__":
+    from unittest2 import main
+    main(defaultTest='suite')
index 8f19c994bcf6b1ad93eb662ad9cfc53a7c5cda55..3ca5ab0cdf40d2f0b14091a381fac6e389d1aceb 100644 (file)
@@ -13,13 +13,12 @@ WHERE {
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
-        dc:source ?source ;
+        libraryOntology:flowcell_id ?flowcell_id ;
         a ?file_type .
+        
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
 
-  ?source libraryOntology:flowcell ?flowcell .
-
   ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
             libraryOntology:read_length ?read_length ;
             libraryOntology:flowcell_type ?flowcell_type ;
index 8ae3ed9ef78e0d737f1d7cada296a740e2ff553d..4e50d0021b87a3c352aa1be7cb9e85dc8effecb8 100644 (file)
@@ -1,6 +1,6 @@
-{% for name, value in series %}
-{{name}}={{value}}{% endfor %}{% for row in samples %}
-^SAMPLE={{row.name}}
+{% for name, value in series %}{{name}}={{value}}
+{% endfor %}{% for row in samples %}!Series_sample_id={{row.name}}
+{% endfor %}{% for row in samples %}^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
 !Sample_series_id={{ series_id }}
@@ -30,5 +30,5 @@
 !Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
 !sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
 !Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
-!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}{% endfor %}
+{% endfor %}
\ No newline at end of file
index c116fc9236c1f06641365a8bc6d3680ab727abea..90b0e6adb194d8e64d4f5ecbd677033ead87ac4a 100644 (file)
@@ -85,7 +85,7 @@ def blankOrUri(value=None):
     return node
 
 
-def toTypedNode(value):
+def toTypedNode(value, language="en"):
     """Convert a python variable to a RDF Node with its closest xsd type
     """
     if type(value) == types.BooleanType:
@@ -113,7 +113,7 @@ def toTypedNode(value):
     if value_type is not None:
         node = RDF.Node(literal=value, datatype=value_type)
     else:
-        node = RDF.Node(literal=unicode(value).encode('utf-8'))
+        node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language)
     return node