From: Diane Trout Date: Thu, 2 Apr 2015 22:38:43 +0000 (-0700) Subject: come up with a better name X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=commitdiff_plain;h=a158bd813406cfbf3716fea1590cc6b657b1670f come up with a better name --- diff --git a/extractor.py b/extractor.py deleted file mode 100644 index 328c8c3..0000000 --- a/extractor.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 -"""Generate a quantification matrix - -This is intended to extract one quantification column -from each of a set of gene quantification files. -""" -import argparse -import collections -import logging -import gzip -import os -import sys -import time - -logger = logging.getLogger('extractor') - -def main(cmdline=None): - parser = make_parser() - args = parser.parse_args(cmdline) - - if args.verbose: - logging.basicConfig(level=logging.INFO) - - geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {} - - if not args.quantifications: - parser.error("Please list files to extract quantifications from") - - output_headers, matrix = load_matrixes(geneid_map, - args.quantifications, - args.column) - write_merged_matrix(args.output, output_headers, matrix, args.no_zeros) - - -def load_matrixes(geneid_map, quantifications, column_name): - """Load a quantification from a list of quantification files. - - This will also convert through a gene id to gene_name map. - if a gene name isn't found, it will default to the gene id. - - Arguments: - geneid_map (dict): mapping between gene ids and gene names - quantifications (list): list of filenames to load from - - Returns: - output_headers (list): list of column headers for matrix - (derived from input filenames) - matrix (dict of lists): selected quantification values - by gene. - """ - matrix = collections.OrderedDict() - output_headers = ['#genes'] - start = time.time() - for quantification in quantifications: - logger.info("Loading %s", quantification) - with open(quantification, 'rt') as instream: - output_headers.append(os.path.basename(quantification)) - headers = instream.readline().split('\t') - try: - column_to_use = headers.index(column_name) - except ValueError as e: - raise RuntimeError( - 'Error: {} is not one of the column headers {}'.format( - args.column, headers)) - - for line in instream: - columns = line.split('\t') - key = geneid_map.get(columns[0], columns[0]) - matrix.setdefault(key, []).append(columns[column_to_use]) - - logger.info("Loaded %d matrixes in %d seconds", - len(quantifications), - time.time() - start) - return output_headers, matrix - - -def write_merged_matrix(output, headers, matrix, drop_zeros=False): - """Save matrix - - Arguments: - output (str): output filename or None for stdout - headers (list): list of matrix column headers) - matrix (dict): gene_name: list of interested - drop_zeros (bool): should we drop rows that are all zero? - """ - logger.info("Writing matrix") - if output: - outstream = open(output, 'wt') - else: - outstream = sys.stdout - - outstream.write('\t'.join(headers)) - outstream.write(os.linesep) - for key in matrix: - columns = matrix[key] - - # skip over zero rows - if drop_zeros: - for x in columns: - if float(x) != 0: - break - else: - continue - - outstream.write(key) - outstream.write('\t') - outstream.write('\t'.join(matrix[key])) - outstream.write(os.linesep) - - if outstream != sys.stdout: - outstream.close() - - -def make_parser(): - """Build argument parser. - """ - parser = argparse.ArgumentParser() - parser.add_argument('--gtf', help='gtf file to load') - parser.add_argument('--column', default='FPKM', - help='which column to use') - parser.add_argument('-o', '--output', - help='filename to write merged matrix to') - parser.add_argument('--no-zeros', default=False, action='store_true', - help='Drop rows that are all zero') - parser.add_argument('-v', '--verbose', default=False, - action='store_true', - help='report progress') - parser.add_argument('quantifications', nargs='*', - help='list of quantification files to load') - - return parser - - -def build_geneid_to_gene(gencode): - """Build a dictionary mapping from gene_id to gene_name. - - Arguments: - gencode (str): compressed filename to read - - Returns: - dictionary mapping gene_id to gene_name - """ - logger.info("Loading %s", gencode) - start = time.time() - names = {} - with gzip.GzipFile(gencode, 'r') as instream: - for line in instream: - line = line.decode('ascii') - - if line.startswith('#'): - continue - - columns = line.split('\t') - - gene_id = None - gene_name = None - - for item in columns[-1].split(';'): - item = item.strip() - if len(item) == 0: - continue - item = item.split() - if len(item) != 2: - print("Confused: {} {}".format(item, len(item))) - name, value = item - if name == 'gene_id': - gene_id = value[1:-1] - elif name == 'gene_name': - gene_name = value[1:-1] - - if gene_id and gene_name: - names[gene_id] = gene_name - - logger.info("loaded in %d seconds", time.time() - start) - return names - - -if __name__ == '__main__': - main() - diff --git a/translate_tsv_genes.py b/translate_tsv_genes.py new file mode 100644 index 0000000..328c8c3 --- /dev/null +++ b/translate_tsv_genes.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +"""Generate a quantification matrix + +This is intended to extract one quantification column +from each of a set of gene quantification files. +""" +import argparse +import collections +import logging +import gzip +import os +import sys +import time + +logger = logging.getLogger('extractor') + +def main(cmdline=None): + parser = make_parser() + args = parser.parse_args(cmdline) + + if args.verbose: + logging.basicConfig(level=logging.INFO) + + geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {} + + if not args.quantifications: + parser.error("Please list files to extract quantifications from") + + output_headers, matrix = load_matrixes(geneid_map, + args.quantifications, + args.column) + write_merged_matrix(args.output, output_headers, matrix, args.no_zeros) + + +def load_matrixes(geneid_map, quantifications, column_name): + """Load a quantification from a list of quantification files. + + This will also convert through a gene id to gene_name map. + if a gene name isn't found, it will default to the gene id. + + Arguments: + geneid_map (dict): mapping between gene ids and gene names + quantifications (list): list of filenames to load from + + Returns: + output_headers (list): list of column headers for matrix + (derived from input filenames) + matrix (dict of lists): selected quantification values + by gene. + """ + matrix = collections.OrderedDict() + output_headers = ['#genes'] + start = time.time() + for quantification in quantifications: + logger.info("Loading %s", quantification) + with open(quantification, 'rt') as instream: + output_headers.append(os.path.basename(quantification)) + headers = instream.readline().split('\t') + try: + column_to_use = headers.index(column_name) + except ValueError as e: + raise RuntimeError( + 'Error: {} is not one of the column headers {}'.format( + args.column, headers)) + + for line in instream: + columns = line.split('\t') + key = geneid_map.get(columns[0], columns[0]) + matrix.setdefault(key, []).append(columns[column_to_use]) + + logger.info("Loaded %d matrixes in %d seconds", + len(quantifications), + time.time() - start) + return output_headers, matrix + + +def write_merged_matrix(output, headers, matrix, drop_zeros=False): + """Save matrix + + Arguments: + output (str): output filename or None for stdout + headers (list): list of matrix column headers) + matrix (dict): gene_name: list of interested + drop_zeros (bool): should we drop rows that are all zero? + """ + logger.info("Writing matrix") + if output: + outstream = open(output, 'wt') + else: + outstream = sys.stdout + + outstream.write('\t'.join(headers)) + outstream.write(os.linesep) + for key in matrix: + columns = matrix[key] + + # skip over zero rows + if drop_zeros: + for x in columns: + if float(x) != 0: + break + else: + continue + + outstream.write(key) + outstream.write('\t') + outstream.write('\t'.join(matrix[key])) + outstream.write(os.linesep) + + if outstream != sys.stdout: + outstream.close() + + +def make_parser(): + """Build argument parser. + """ + parser = argparse.ArgumentParser() + parser.add_argument('--gtf', help='gtf file to load') + parser.add_argument('--column', default='FPKM', + help='which column to use') + parser.add_argument('-o', '--output', + help='filename to write merged matrix to') + parser.add_argument('--no-zeros', default=False, action='store_true', + help='Drop rows that are all zero') + parser.add_argument('-v', '--verbose', default=False, + action='store_true', + help='report progress') + parser.add_argument('quantifications', nargs='*', + help='list of quantification files to load') + + return parser + + +def build_geneid_to_gene(gencode): + """Build a dictionary mapping from gene_id to gene_name. + + Arguments: + gencode (str): compressed filename to read + + Returns: + dictionary mapping gene_id to gene_name + """ + logger.info("Loading %s", gencode) + start = time.time() + names = {} + with gzip.GzipFile(gencode, 'r') as instream: + for line in instream: + line = line.decode('ascii') + + if line.startswith('#'): + continue + + columns = line.split('\t') + + gene_id = None + gene_name = None + + for item in columns[-1].split(';'): + item = item.strip() + if len(item) == 0: + continue + item = item.split() + if len(item) != 2: + print("Confused: {} {}".format(item, len(item))) + name, value = item + if name == 'gene_id': + gene_id = value[1:-1] + elif name == 'gene_name': + gene_name = value[1:-1] + + if gene_id and gene_name: + names[gene_id] = gene_name + + logger.info("loaded in %d seconds", time.time() - start) + return names + + +if __name__ == '__main__': + main() +