class NameToViewMap(object):
    """Determine view attributes for a given submission file name.

    File names are compared against an ordered list of shell-style glob
    patterns (``self.patterns``).  Each pattern maps to a view name, to
    ``None`` (a file with no view), or to a callable that picks the view
    name from the library's paired-end status.  The view name then selects
    a dictionary of attributes from ``self.views``.
    """

    def __init__(self, root_url, apidata):
        """Build the pattern and view tables.

        :Args:
          root_url: passed through to get_library_info() when library
                    information has to be looked up
          apidata: passed through to get_library_info() together with
                   root_url
        """
        self.root_url = root_url
        self.apidata = apidata

        # lib_id -> library information, filled lazily by find_attributes()
        self.lib_cache = {}
        # lib_id -> bool, memoized result of the per-lane vote in _is_paired()
        self.lib_paired = {}
        # ma is "map algorithm"
        ma = 'TH1014'

        # Order matters: find_attributes() stops at the FIRST pattern that
        # matches, so specific patterns must come before the generic ones
        # (e.g. '*.splices.bam' before '*.bam', '*plus.bigwig' before
        # '*.bigwig').
        self.patterns = [
            # for 2011 Feb 18 elements submission
            ('final_Cufflinks_genes_*gtf', 'GeneDeNovo'),
            ('final_Cufflinks_transcripts_*gtf', 'TranscriptDeNovo'),
            ('final_exonFPKM-Cufflinks-0.9.3-GENCODE-v3c-*.gtf',
             'ExonsGencV3c'),
            ('final_GENCODE-v3-Cufflinks-0.9.3.genes-*gtf',
             'GeneGencV3c'),
            ('final_GENCODE-v3-Cufflinks-0.9.3.transcripts-*gtf',
             'TranscriptGencV3c'),
            ('final_TSS-Cufflinks-0.9.3-GENCODE-v3c-*.gtf', 'TSS'),
            ('final_junctions-*.bed6+3', 'Junctions'),

            ('*.bai', None),
            ('*.splices.bam', 'Splices'),
            ('*.bam', self._guess_bam_view),
            ('junctions.bed', 'Junctions'),
            ('*.jnct', 'Junctions'),
            ('*unique.bigwig', None),
            ('*plus.bigwig', 'PlusSignal'),
            ('*minus.bigwig', 'MinusSignal'),
            ('*.bigwig', 'Signal'),
            ('*.tar.bz2', None),
            ('*.condor', None),
            ('*.daf', None),
            ('*.ddf', None),

            ('*ufflinks?0.9.3.genes.gtf', 'GeneDeNovo'),
            ('*ufflinks?0.9.3.transcripts.gtf', 'TranscriptDeNovo'),
            ('*GENCODE-v3c.exonFPKM.gtf', 'ExonsGencV3c'),
            ('*GENCODE-v3c.genes.gtf', 'GeneGencV3c'),
            ('*GENCODE-v3c.transcripts.gtf', 'TranscriptGencV3c'),
            ('*GENCODE-v3c.TSS.gtf', 'TSS'),
            ('*.junctions.bed6+3', 'Junctions'),

            ('*.?ufflinks-0.9.0?genes.expr', 'GeneDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.expr', 'TranscriptDeNovo'),
            ('*.?ufflinks-0.9.0?transcripts.gtf', 'GeneModel'),

            ('*.GENCODE-v3c?genes.expr', 'GeneGCV3c'),
            ('*.GENCODE-v3c?transcript*.expr', 'TranscriptGCV3c'),
            ('*.GENCODE-v3c?transcript*.gtf', 'TranscriptGencV3c'),
            ('*.GENCODE-v4?genes.expr', None),  # 'GeneGCV4'),
            ('*.GENCODE-v4?transcript*.expr', None),  # 'TranscriptGCV4'),
            ('*.GENCODE-v4?transcript*.gtf', None),  # 'TranscriptGencV4'),
            ('*_1.75mers.fastq', 'FastqRd1'),
            ('*_2.75mers.fastq', 'FastqRd2'),
            ('*_r1.fastq', 'FastqRd1'),
            ('*_r2.fastq', 'FastqRd2'),
            ('*.fastq', 'Fastq'),
            ('*.gtf', 'GeneModel'),
            ('*.ini', None),
            ('*.log', None),
            ('*.md5', None),
            ('paired-end-distribution*', 'InsLength'),
            ('*.stats.txt', 'InsLength'),
            ('*.srf', None),
            ('*.wig', None),
            ('*.zip', None),
            ('transfer_log', None),
        ]

        self.views = {
            None: {"MapAlgorithm": "NA"},
            "Paired": {"MapAlgorithm": ma},
            "Aligns": {"MapAlgorithm": ma},
            "Single": {"MapAlgorithm": ma},
            "Splices": {"MapAlgorithm": ma},
            "Junctions": {"MapAlgorithm": ma},
            "PlusSignal": {"MapAlgorithm": ma},
            "MinusSignal": {"MapAlgorithm": ma},
            "Signal": {"MapAlgorithm": ma},
            "GeneModel": {"MapAlgorithm": ma},
            "GeneDeNovo": {"MapAlgorithm": ma},
            "TranscriptDeNovo": {"MapAlgorithm": ma},
            "ExonsGencV3c": {"MapAlgorithm": ma},
            "GeneGencV3c": {"MapAlgorithm": ma},
            "TSS": {"MapAlgorithm": ma},
            "GeneGCV3c": {"MapAlgorithm": ma},
            "TranscriptGCV3c": {"MapAlgorithm": ma},
            "TranscriptGencV3c": {"MapAlgorithm": ma},
            "GeneGCV4": {"MapAlgorithm": ma},
            "TranscriptGCV4": {"MapAlgorithm": ma},
            "FastqRd1": {"MapAlgorithm": "NA", "type": "fastq"},
            "FastqRd2": {"MapAlgorithm": "NA", "type": "fastq"},
            "Fastq": {"MapAlgorithm": "NA", "type": "fastq"},
            "InsLength": {"MapAlgorithm": ma},
        }
        # view name is one of the attributes
        for view_name, view_attributes in self.views.items():
            view_attributes['view'] = view_name

    def find_attributes(self, pathname, lib_id):
        """Return the attributes for a submission file.

        The patterns in ``self.patterns`` are tried in order and the first
        one that matches ``pathname`` wins.

        :Args:
          pathname (str): name of the file to classify; it is matched
                          as given against each glob pattern
          lib_id: library identifier, used as the cache key and passed to
                  get_library_info() on a cache miss

        :Returns:
          dict with the keys 'cell', 'replicate', 'insertLength',
          'readType', 'extension' (the pattern that matched) and the
          attributes of the selected view; None if no pattern matches.

        :Raises:
          RuntimeError: propagated from _is_paired() when the library's
                        lanes give no paired/single majority
        """
        # `in` instead of dict.has_key(), which does not exist on Python 3
        if lib_id not in self.lib_cache:
            self.lib_cache[lib_id] = get_library_info(self.root_url,
                                                      self.apidata, lib_id)

        lib_info = self.lib_cache[lib_id]
        if lib_info['cell_line'].lower() == 'unknown':
            logging.warning("Library %s missing cell_line", lib_id)
        attributes = {
            'cell': lib_info['cell_line'],
            'replicate': lib_info['replicate'],
        }
        is_paired = self._is_paired(lib_id, lib_info)

        if is_paired:
            attributes.update(self.get_paired_attributes(lib_info))
        else:
            attributes.update(self.get_single_attributes(lib_info))

        for pattern, view in self.patterns:
            if fnmatch.fnmatch(pathname, pattern):
                # some patterns defer the view choice to a callable
                if callable(view):
                    view = view(is_paired=is_paired)

                attributes.update(self.views[view])
                attributes["extension"] = pattern
                return attributes

        # no pattern recognised this file name
        return None

    def _guess_bam_view(self, is_paired=True):
        """Guess a view name based on library attributes.

        Returns "Paired" when is_paired is true, "Aligns" otherwise.
        """
        if is_paired:
            return "Paired"
        return "Aligns"

    def _is_paired(self, lib_id, lib_info):
        """Determine if a library is paired end.

        With no lanes recorded the answer is guessed from the library type
        and is not cached.  Otherwise the non-failed lanes vote, and the
        result is memoized in ``self.lib_paired``.

        :Raises:
          RuntimeError: if the paired and unpaired lane counts are equal;
                        this includes the case where every lane failed
        """
        # TODO: encode this information in the library type page.
        single = (1, 3, 6)
        if len(lib_info["lane_set"]) == 0:
            # we haven't sequenced anything so guess based on library type
            return lib_info['library_type_id'] not in single

        # `in` instead of dict.has_key(), which does not exist on Python 3
        if lib_id not in self.lib_paired:
            is_paired = 0
            isnot_paired = 0
            failed = 0
            # check to see if all the flowcells are the same.
            # otherwise we might need to do something complicated
            for flowcell in lib_info["lane_set"]:
                # yes there's also a status code, but this comparison
                # is easier to read
                if flowcell["status"].lower() == "failed":
                    # ignore failed flowcell
                    failed += 1
                elif flowcell["paired_end"]:
                    is_paired += 1
                else:
                    isnot_paired += 1

            logging.debug("Library %s: %d paired, %d single, %d failed",
                          lib_info["library_id"], is_paired, isnot_paired,
                          failed)

            if is_paired > isnot_paired:
                self.lib_paired[lib_id] = True
            elif is_paired < isnot_paired:
                self.lib_paired[lib_id] = False
            else:
                raise RuntimeError(
                    "Equal number of paired & unpaired lanes. "
                    "Can't guess library paired status")

        return self.lib_paired[lib_id]

    def get_paired_attributes(self, lib_info):
        """Return insertLength and readType for a paired-end library.

        A missing (None) insert_size is logged and replaced by 200.
        """
        insert_size = lib_info['insert_size']
        if insert_size is None:
            logging.warning(
                "Library %s is missing insert_size, assuming 200",
                lib_info["library_id"])
            insert_size = 200
        return {'insertLength': insert_size,
                'readType': '2x75'}

    def get_single_attributes(self, lib_info):
        """Return insertLength and readType for a single-ended library.

        lib_info is accepted for symmetry with get_paired_attributes()
        but is not consulted.
        """
        return {'insertLength': 'ilNA',
                'readType': '1x75D'}
-