From: Diane Trout Date: Thu, 2 Apr 2015 22:29:26 +0000 (-0700) Subject: initial (mostly working) version X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=commitdiff_plain;h=4dbbf8de4187b5ff00aa0885a55c156095d99716 initial (mostly working) version --- 4dbbf8de4187b5ff00aa0885a55c156095d99716 diff --git a/extractor.py b/extractor.py new file mode 100644 index 0000000..328c8c3 --- /dev/null +++ b/extractor.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +"""Generate a quantification matrix + +This is intended to extract one quantification column +from each of a set of gene quantification files. +""" +import argparse +import collections +import logging +import gzip +import os +import sys +import time + +logger = logging.getLogger('extractor') + +def main(cmdline=None): + parser = make_parser() + args = parser.parse_args(cmdline) + + if args.verbose: + logging.basicConfig(level=logging.INFO) + + geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {} + + if not args.quantifications: + parser.error("Please list files to extract quantifications from") + + output_headers, matrix = load_matrixes(geneid_map, + args.quantifications, + args.column) + write_merged_matrix(args.output, output_headers, matrix, args.no_zeros) + + +def load_matrixes(geneid_map, quantifications, column_name): + """Load a quantification from a list of quantification files. + + This will also convert through a gene id to gene_name map. + if a gene name isn't found, it will default to the gene id. + + Arguments: + geneid_map (dict): mapping between gene ids and gene names + quantifications (list): list of filenames to load from + + Returns: + output_headers (list): list of column headers for matrix + (derived from input filenames) + matrix (dict of lists): selected quantification values + by gene. + """ + matrix = collections.OrderedDict() + output_headers = ['#genes'] + start = time.time() + for quantification in quantifications: + logger.info("Loading %s", quantification) + with open(quantification, 'rt') as instream: + output_headers.append(os.path.basename(quantification)) + headers = instream.readline().split('\t') + try: + column_to_use = headers.index(column_name) + except ValueError as e: + raise RuntimeError( + 'Error: {} is not one of the column headers {}'.format( + args.column, headers)) + + for line in instream: + columns = line.split('\t') + key = geneid_map.get(columns[0], columns[0]) + matrix.setdefault(key, []).append(columns[column_to_use]) + + logger.info("Loaded %d matrixes in %d seconds", + len(quantifications), + time.time() - start) + return output_headers, matrix + + +def write_merged_matrix(output, headers, matrix, drop_zeros=False): + """Save matrix + + Arguments: + output (str): output filename or None for stdout + headers (list): list of matrix column headers) + matrix (dict): gene_name: list of interested + drop_zeros (bool): should we drop rows that are all zero? + """ + logger.info("Writing matrix") + if output: + outstream = open(output, 'wt') + else: + outstream = sys.stdout + + outstream.write('\t'.join(headers)) + outstream.write(os.linesep) + for key in matrix: + columns = matrix[key] + + # skip over zero rows + if drop_zeros: + for x in columns: + if float(x) != 0: + break + else: + continue + + outstream.write(key) + outstream.write('\t') + outstream.write('\t'.join(matrix[key])) + outstream.write(os.linesep) + + if outstream != sys.stdout: + outstream.close() + + +def make_parser(): + """Build argument parser. + """ + parser = argparse.ArgumentParser() + parser.add_argument('--gtf', help='gtf file to load') + parser.add_argument('--column', default='FPKM', + help='which column to use') + parser.add_argument('-o', '--output', + help='filename to write merged matrix to') + parser.add_argument('--no-zeros', default=False, action='store_true', + help='Drop rows that are all zero') + parser.add_argument('-v', '--verbose', default=False, + action='store_true', + help='report progress') + parser.add_argument('quantifications', nargs='*', + help='list of quantification files to load') + + return parser + + +def build_geneid_to_gene(gencode): + """Build a dictionary mapping from gene_id to gene_name. + + Arguments: + gencode (str): compressed filename to read + + Returns: + dictionary mapping gene_id to gene_name + """ + logger.info("Loading %s", gencode) + start = time.time() + names = {} + with gzip.GzipFile(gencode, 'r') as instream: + for line in instream: + line = line.decode('ascii') + + if line.startswith('#'): + continue + + columns = line.split('\t') + + gene_id = None + gene_name = None + + for item in columns[-1].split(';'): + item = item.strip() + if len(item) == 0: + continue + item = item.split() + if len(item) != 2: + print("Confused: {} {}".format(item, len(item))) + name, value = item + if name == 'gene_id': + gene_id = value[1:-1] + elif name == 'gene_name': + gene_name = value[1:-1] + + if gene_id and gene_name: + names[gene_id] = gene_name + + logger.info("loaded in %d seconds", time.time() - start) + return names + + +if __name__ == '__main__': + main() +