X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=blobdiff_plain;f=extractor.py;fp=extractor.py;h=0000000000000000000000000000000000000000;hp=328c8c36491b961c248a95b82056136e27ecb807;hb=a158bd813406cfbf3716fea1590cc6b657b1670f;hpb=4dbbf8de4187b5ff00aa0885a55c156095d99716 diff --git a/extractor.py b/extractor.py deleted file mode 100644 index 328c8c3..0000000 --- a/extractor.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 -"""Generate a quantification matrix - -This is intended to extract one quantification column -from each of a set of gene quantification files. -""" -import argparse -import collections -import logging -import gzip -import os -import sys -import time - -logger = logging.getLogger('extractor') - -def main(cmdline=None): - parser = make_parser() - args = parser.parse_args(cmdline) - - if args.verbose: - logging.basicConfig(level=logging.INFO) - - geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {} - - if not args.quantifications: - parser.error("Please list files to extract quantifications from") - - output_headers, matrix = load_matrixes(geneid_map, - args.quantifications, - args.column) - write_merged_matrix(args.output, output_headers, matrix, args.no_zeros) - - -def load_matrixes(geneid_map, quantifications, column_name): - """Load a quantification from a list of quantification files. - - This will also convert through a gene id to gene_name map. - if a gene name isn't found, it will default to the gene id. - - Arguments: - geneid_map (dict): mapping between gene ids and gene names - quantifications (list): list of filenames to load from - - Returns: - output_headers (list): list of column headers for matrix - (derived from input filenames) - matrix (dict of lists): selected quantification values - by gene. - """ - matrix = collections.OrderedDict() - output_headers = ['#genes'] - start = time.time() - for quantification in quantifications: - logger.info("Loading %s", quantification) - with open(quantification, 'rt') as instream: - output_headers.append(os.path.basename(quantification)) - headers = instream.readline().split('\t') - try: - column_to_use = headers.index(column_name) - except ValueError as e: - raise RuntimeError( - 'Error: {} is not one of the column headers {}'.format( - args.column, headers)) - - for line in instream: - columns = line.split('\t') - key = geneid_map.get(columns[0], columns[0]) - matrix.setdefault(key, []).append(columns[column_to_use]) - - logger.info("Loaded %d matrixes in %d seconds", - len(quantifications), - time.time() - start) - return output_headers, matrix - - -def write_merged_matrix(output, headers, matrix, drop_zeros=False): - """Save matrix - - Arguments: - output (str): output filename or None for stdout - headers (list): list of matrix column headers) - matrix (dict): gene_name: list of interested - drop_zeros (bool): should we drop rows that are all zero? - """ - logger.info("Writing matrix") - if output: - outstream = open(output, 'wt') - else: - outstream = sys.stdout - - outstream.write('\t'.join(headers)) - outstream.write(os.linesep) - for key in matrix: - columns = matrix[key] - - # skip over zero rows - if drop_zeros: - for x in columns: - if float(x) != 0: - break - else: - continue - - outstream.write(key) - outstream.write('\t') - outstream.write('\t'.join(matrix[key])) - outstream.write(os.linesep) - - if outstream != sys.stdout: - outstream.close() - - -def make_parser(): - """Build argument parser. - """ - parser = argparse.ArgumentParser() - parser.add_argument('--gtf', help='gtf file to load') - parser.add_argument('--column', default='FPKM', - help='which column to use') - parser.add_argument('-o', '--output', - help='filename to write merged matrix to') - parser.add_argument('--no-zeros', default=False, action='store_true', - help='Drop rows that are all zero') - parser.add_argument('-v', '--verbose', default=False, - action='store_true', - help='report progress') - parser.add_argument('quantifications', nargs='*', - help='list of quantification files to load') - - return parser - - -def build_geneid_to_gene(gencode): - """Build a dictionary mapping from gene_id to gene_name. - - Arguments: - gencode (str): compressed filename to read - - Returns: - dictionary mapping gene_id to gene_name - """ - logger.info("Loading %s", gencode) - start = time.time() - names = {} - with gzip.GzipFile(gencode, 'r') as instream: - for line in instream: - line = line.decode('ascii') - - if line.startswith('#'): - continue - - columns = line.split('\t') - - gene_id = None - gene_name = None - - for item in columns[-1].split(';'): - item = item.strip() - if len(item) == 0: - continue - item = item.split() - if len(item) != 2: - print("Confused: {} {}".format(item, len(item))) - name, value = item - if name == 'gene_id': - gene_id = value[1:-1] - elif name == 'gene_name': - gene_name = value[1:-1] - - if gene_id and gene_name: - names[gene_id] = gene_name - - logger.info("loaded in %d seconds", time.time() - start) - return names - - -if __name__ == '__main__': - main() -