X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=blobdiff_plain;f=extractor.py;fp=extractor.py;h=0000000000000000000000000000000000000000;hp=328c8c36491b961c248a95b82056136e27ecb807;hb=a158bd813406cfbf3716fea1590cc6b657b1670f;hpb=4dbbf8de4187b5ff00aa0885a55c156095d99716

diff --git a/extractor.py b/extractor.py
deleted file mode 100644
index 328c8c3..0000000
--- a/extractor.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/env python3
-"""Generate a quantification matrix 
-
-This is intended to extract one quantification column
-from each of a set of gene quantification files.
-"""
-import argparse
-import collections
-import logging
-import gzip
-import os
-import sys
-import time
-
-logger = logging.getLogger('extractor')
-
-def main(cmdline=None):
-    parser = make_parser()
-    args = parser.parse_args(cmdline)
-
-    if args.verbose:
-        logging.basicConfig(level=logging.INFO)
-
-    geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
-
-    if not args.quantifications:
-        parser.error("Please list files to extract quantifications from")
-        
-    output_headers, matrix = load_matrixes(geneid_map,
-                                           args.quantifications,
-                                           args.column)
-    write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
-
-
-def load_matrixes(geneid_map, quantifications, column_name):
-    """Load a quantification from a list of quantification files.
-
-    This will also convert through a gene id to gene_name map.
-    if a gene name isn't found, it will default to the gene id.
-
-    Arguments:
-        geneid_map (dict): mapping between gene ids and gene names
-        quantifications (list): list of filenames to load from
-
-    Returns:
-        output_headers (list): list of column headers for matrix
-            (derived from input filenames)
-        matrix (dict of lists): selected quantification values
-            by gene.
-    """
-    matrix = collections.OrderedDict()
-    output_headers = ['#genes']
-    start = time.time()
-    for quantification in quantifications:
-        logger.info("Loading %s", quantification)
-        with open(quantification, 'rt') as instream:
-            output_headers.append(os.path.basename(quantification))
-            headers = instream.readline().split('\t')
-            try:
-                column_to_use = headers.index(column_name)
-            except ValueError as e:
-                raise RuntimeError(
-                    'Error: {} is not one of the column headers {}'.format(
-                    args.column, headers))
-            
-            for line in instream:
-                columns = line.split('\t')
-                key = geneid_map.get(columns[0], columns[0])
-                matrix.setdefault(key, []).append(columns[column_to_use])
-
-    logger.info("Loaded %d matrixes in %d seconds",
-                len(quantifications),
-                time.time() - start)
-    return output_headers, matrix
-
-
-def write_merged_matrix(output, headers, matrix, drop_zeros=False):
-    """Save matrix
-
-    Arguments:
-        output (str): output filename or None for stdout
-        headers (list): list of matrix column headers)
-        matrix (dict): gene_name: list of interested
-        drop_zeros (bool): should we drop rows that are all zero?
-    """
-    logger.info("Writing matrix")
-    if output:
-        outstream = open(output, 'wt')
-    else:
-        outstream = sys.stdout
-
-    outstream.write('\t'.join(headers))
-    outstream.write(os.linesep)
-    for key in matrix:
-        columns = matrix[key]
-        
-        # skip over zero rows
-        if drop_zeros:
-            for x in columns:
-                if float(x) != 0:
-                    break
-            else:
-                continue
-            
-        outstream.write(key)
-        outstream.write('\t')
-        outstream.write('\t'.join(matrix[key]))
-        outstream.write(os.linesep)
-
-    if outstream != sys.stdout:
-        outstream.close()
-        
-
-def make_parser():
-    """Build argument parser.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--gtf', help='gtf file to load')
-    parser.add_argument('--column', default='FPKM',
-                        help='which column to use')
-    parser.add_argument('-o', '--output',
-                        help='filename to write merged matrix to')
-    parser.add_argument('--no-zeros', default=False, action='store_true',
-                        help='Drop rows that are all zero')
-    parser.add_argument('-v', '--verbose', default=False,
-                        action='store_true',
-                        help='report progress')
-    parser.add_argument('quantifications', nargs='*',
-                        help='list of quantification files to load')
-    
-    return parser
-
-
-def build_geneid_to_gene(gencode):
-    """Build a dictionary mapping from gene_id to gene_name.
-
-    Arguments:
-        gencode (str): compressed filename to read
-
-    Returns:
-        dictionary mapping gene_id to gene_name
-    """
-    logger.info("Loading %s", gencode)
-    start = time.time()
-    names = {}
-    with gzip.GzipFile(gencode, 'r') as instream:
-        for line in instream:
-            line = line.decode('ascii')
-            
-            if line.startswith('#'):
-                continue
-            
-            columns = line.split('\t')
-
-            gene_id = None
-            gene_name = None
-            
-            for item in columns[-1].split(';'):
-                item = item.strip()
-                if len(item) == 0:
-                    continue
-                item = item.split()
-                if len(item) != 2:
-                    print("Confused: {} {}".format(item, len(item)))
-                name, value = item
-                if name == 'gene_id':
-                    gene_id = value[1:-1]
-                elif name == 'gene_name':
-                    gene_name = value[1:-1]
-
-            if gene_id and gene_name:
-                names[gene_id] = gene_name
-
-    logger.info("loaded in %d seconds", time.time() - start)
-    return names
-                        
-
-if __name__ == '__main__':
-    main()
-