From: Diane Trout <diane@ghic.org>
Date: Thu, 2 Apr 2015 22:29:26 +0000 (-0700)
Subject: initial (mostly working) version
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=commitdiff_plain;h=4dbbf8de4187b5ff00aa0885a55c156095d99716

initial (mostly working) version
---

4dbbf8de4187b5ff00aa0885a55c156095d99716
diff --git a/extractor.py b/extractor.py
new file mode 100644
index 0000000..328c8c3
--- /dev/null
+++ b/extractor.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""Generate a quantification matrix 
+
+This is intended to extract one quantification column
+from each of a set of gene quantification files.
+"""
+import argparse
+import collections
+import logging
+import gzip
+import os
+import sys
+import time
+
+logger = logging.getLogger('extractor')
+
+def main(cmdline=None):
+    parser = make_parser()
+    args = parser.parse_args(cmdline)
+
+    if args.verbose:
+        logging.basicConfig(level=logging.INFO)
+
+    geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
+
+    if not args.quantifications:
+        parser.error("Please list files to extract quantifications from")
+        
+    output_headers, matrix = load_matrixes(geneid_map,
+                                           args.quantifications,
+                                           args.column)
+    write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
+
+
+def load_matrixes(geneid_map, quantifications, column_name):
+    """Load a quantification from a list of quantification files.
+
+    This will also convert through a gene id to gene_name map.
+    if a gene name isn't found, it will default to the gene id.
+
+    Arguments:
+        geneid_map (dict): mapping between gene ids and gene names
+        quantifications (list): list of filenames to load from
+
+    Returns:
+        output_headers (list): list of column headers for matrix
+            (derived from input filenames)
+        matrix (dict of lists): selected quantification values
+            by gene.
+    """
+    matrix = collections.OrderedDict()
+    output_headers = ['#genes']
+    start = time.time()
+    for quantification in quantifications:
+        logger.info("Loading %s", quantification)
+        with open(quantification, 'rt') as instream:
+            output_headers.append(os.path.basename(quantification))
+            headers = instream.readline().split('\t')
+            try:
+                column_to_use = headers.index(column_name)
+            except ValueError as e:
+                raise RuntimeError(
+                    'Error: {} is not one of the column headers {}'.format(
+                    args.column, headers))
+            
+            for line in instream:
+                columns = line.split('\t')
+                key = geneid_map.get(columns[0], columns[0])
+                matrix.setdefault(key, []).append(columns[column_to_use])
+
+    logger.info("Loaded %d matrixes in %d seconds",
+                len(quantifications),
+                time.time() - start)
+    return output_headers, matrix
+
+
+def write_merged_matrix(output, headers, matrix, drop_zeros=False):
+    """Save matrix
+
+    Arguments:
+        output (str): output filename or None for stdout
+        headers (list): list of matrix column headers)
+        matrix (dict): gene_name: list of interested
+        drop_zeros (bool): should we drop rows that are all zero?
+    """
+    logger.info("Writing matrix")
+    if output:
+        outstream = open(output, 'wt')
+    else:
+        outstream = sys.stdout
+
+    outstream.write('\t'.join(headers))
+    outstream.write(os.linesep)
+    for key in matrix:
+        columns = matrix[key]
+        
+        # skip over zero rows
+        if drop_zeros:
+            for x in columns:
+                if float(x) != 0:
+                    break
+            else:
+                continue
+            
+        outstream.write(key)
+        outstream.write('\t')
+        outstream.write('\t'.join(matrix[key]))
+        outstream.write(os.linesep)
+
+    if outstream != sys.stdout:
+        outstream.close()
+        
+
+def make_parser():
+    """Build argument parser.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gtf', help='gtf file to load')
+    parser.add_argument('--column', default='FPKM',
+                        help='which column to use')
+    parser.add_argument('-o', '--output',
+                        help='filename to write merged matrix to')
+    parser.add_argument('--no-zeros', default=False, action='store_true',
+                        help='Drop rows that are all zero')
+    parser.add_argument('-v', '--verbose', default=False,
+                        action='store_true',
+                        help='report progress')
+    parser.add_argument('quantifications', nargs='*',
+                        help='list of quantification files to load')
+    
+    return parser
+
+
+def build_geneid_to_gene(gencode):
+    """Build a dictionary mapping from gene_id to gene_name.
+
+    Arguments:
+        gencode (str): compressed filename to read
+
+    Returns:
+        dictionary mapping gene_id to gene_name
+    """
+    logger.info("Loading %s", gencode)
+    start = time.time()
+    names = {}
+    with gzip.GzipFile(gencode, 'r') as instream:
+        for line in instream:
+            line = line.decode('ascii')
+            
+            if line.startswith('#'):
+                continue
+            
+            columns = line.split('\t')
+
+            gene_id = None
+            gene_name = None
+            
+            for item in columns[-1].split(';'):
+                item = item.strip()
+                if len(item) == 0:
+                    continue
+                item = item.split()
+                if len(item) != 2:
+                    print("Confused: {} {}".format(item, len(item)))
+                name, value = item
+                if name == 'gene_id':
+                    gene_id = value[1:-1]
+                elif name == 'gene_name':
+                    gene_name = value[1:-1]
+
+            if gene_id and gene_name:
+                names[gene_id] = gene_name
+
+    logger.info("loaded in %d seconds", time.time() - start)
+    return names
+                        
+
+if __name__ == '__main__':
+    main()
+