From: Diane Trout <diane@ghic.org>
Date: Thu, 2 Apr 2015 22:38:43 +0000 (-0700)
Subject: come up with a better name
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=commitdiff_plain;h=a158bd813406cfbf3716fea1590cc6b657b1670f

come up with a better name
---

diff --git a/extractor.py b/extractor.py
deleted file mode 100644
index 328c8c3..0000000
--- a/extractor.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/env python3
-"""Generate a quantification matrix 
-
-This is intended to extract one quantification column
-from each of a set of gene quantification files.
-"""
-import argparse
-import collections
-import logging
-import gzip
-import os
-import sys
-import time
-
-logger = logging.getLogger('extractor')
-
-def main(cmdline=None):
-    parser = make_parser()
-    args = parser.parse_args(cmdline)
-
-    if args.verbose:
-        logging.basicConfig(level=logging.INFO)
-
-    geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
-
-    if not args.quantifications:
-        parser.error("Please list files to extract quantifications from")
-        
-    output_headers, matrix = load_matrixes(geneid_map,
-                                           args.quantifications,
-                                           args.column)
-    write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
-
-
-def load_matrixes(geneid_map, quantifications, column_name):
-    """Load a quantification from a list of quantification files.
-
-    This will also convert through a gene id to gene_name map.
-    if a gene name isn't found, it will default to the gene id.
-
-    Arguments:
-        geneid_map (dict): mapping between gene ids and gene names
-        quantifications (list): list of filenames to load from
-
-    Returns:
-        output_headers (list): list of column headers for matrix
-            (derived from input filenames)
-        matrix (dict of lists): selected quantification values
-            by gene.
-    """
-    matrix = collections.OrderedDict()
-    output_headers = ['#genes']
-    start = time.time()
-    for quantification in quantifications:
-        logger.info("Loading %s", quantification)
-        with open(quantification, 'rt') as instream:
-            output_headers.append(os.path.basename(quantification))
-            headers = instream.readline().split('\t')
-            try:
-                column_to_use = headers.index(column_name)
-            except ValueError as e:
-                raise RuntimeError(
-                    'Error: {} is not one of the column headers {}'.format(
-                    args.column, headers))
-            
-            for line in instream:
-                columns = line.split('\t')
-                key = geneid_map.get(columns[0], columns[0])
-                matrix.setdefault(key, []).append(columns[column_to_use])
-
-    logger.info("Loaded %d matrixes in %d seconds",
-                len(quantifications),
-                time.time() - start)
-    return output_headers, matrix
-
-
-def write_merged_matrix(output, headers, matrix, drop_zeros=False):
-    """Save matrix
-
-    Arguments:
-        output (str): output filename or None for stdout
-        headers (list): list of matrix column headers)
-        matrix (dict): gene_name: list of interested
-        drop_zeros (bool): should we drop rows that are all zero?
-    """
-    logger.info("Writing matrix")
-    if output:
-        outstream = open(output, 'wt')
-    else:
-        outstream = sys.stdout
-
-    outstream.write('\t'.join(headers))
-    outstream.write(os.linesep)
-    for key in matrix:
-        columns = matrix[key]
-        
-        # skip over zero rows
-        if drop_zeros:
-            for x in columns:
-                if float(x) != 0:
-                    break
-            else:
-                continue
-            
-        outstream.write(key)
-        outstream.write('\t')
-        outstream.write('\t'.join(matrix[key]))
-        outstream.write(os.linesep)
-
-    if outstream != sys.stdout:
-        outstream.close()
-        
-
-def make_parser():
-    """Build argument parser.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--gtf', help='gtf file to load')
-    parser.add_argument('--column', default='FPKM',
-                        help='which column to use')
-    parser.add_argument('-o', '--output',
-                        help='filename to write merged matrix to')
-    parser.add_argument('--no-zeros', default=False, action='store_true',
-                        help='Drop rows that are all zero')
-    parser.add_argument('-v', '--verbose', default=False,
-                        action='store_true',
-                        help='report progress')
-    parser.add_argument('quantifications', nargs='*',
-                        help='list of quantification files to load')
-    
-    return parser
-
-
-def build_geneid_to_gene(gencode):
-    """Build a dictionary mapping from gene_id to gene_name.
-
-    Arguments:
-        gencode (str): compressed filename to read
-
-    Returns:
-        dictionary mapping gene_id to gene_name
-    """
-    logger.info("Loading %s", gencode)
-    start = time.time()
-    names = {}
-    with gzip.GzipFile(gencode, 'r') as instream:
-        for line in instream:
-            line = line.decode('ascii')
-            
-            if line.startswith('#'):
-                continue
-            
-            columns = line.split('\t')
-
-            gene_id = None
-            gene_name = None
-            
-            for item in columns[-1].split(';'):
-                item = item.strip()
-                if len(item) == 0:
-                    continue
-                item = item.split()
-                if len(item) != 2:
-                    print("Confused: {} {}".format(item, len(item)))
-                name, value = item
-                if name == 'gene_id':
-                    gene_id = value[1:-1]
-                elif name == 'gene_name':
-                    gene_name = value[1:-1]
-
-            if gene_id and gene_name:
-                names[gene_id] = gene_name
-
-    logger.info("loaded in %d seconds", time.time() - start)
-    return names
-                        
-
-if __name__ == '__main__':
-    main()
-        
diff --git a/translate_tsv_genes.py b/translate_tsv_genes.py
new file mode 100644
index 0000000..328c8c3
--- /dev/null
+++ b/translate_tsv_genes.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""Generate a quantification matrix 
+
+This is intended to extract one quantification column
+from each of a set of gene quantification files.
+"""
+import argparse
+import collections
+import logging
+import gzip
+import os
+import sys
+import time
+
+logger = logging.getLogger('extractor')
+
+def main(cmdline=None):
+    parser = make_parser()
+    args = parser.parse_args(cmdline)
+
+    if args.verbose:
+        logging.basicConfig(level=logging.INFO)
+
+    geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
+
+    if not args.quantifications:
+        parser.error("Please list files to extract quantifications from")
+        
+    output_headers, matrix = load_matrixes(geneid_map,
+                                           args.quantifications,
+                                           args.column)
+    write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
+
+
+def load_matrixes(geneid_map, quantifications, column_name):
+    """Load a quantification from a list of quantification files.
+
+    This will also convert through a gene id to gene_name map.
+    if a gene name isn't found, it will default to the gene id.
+
+    Arguments:
+        geneid_map (dict): mapping between gene ids and gene names
+        quantifications (list): list of filenames to load from
+
+    Returns:
+        output_headers (list): list of column headers for matrix
+            (derived from input filenames)
+        matrix (dict of lists): selected quantification values
+            by gene.
+    """
+    matrix = collections.OrderedDict()
+    output_headers = ['#genes']
+    start = time.time()
+    for quantification in quantifications:
+        logger.info("Loading %s", quantification)
+        with open(quantification, 'rt') as instream:
+            output_headers.append(os.path.basename(quantification))
+            headers = instream.readline().split('\t')
+            try:
+                column_to_use = headers.index(column_name)
+            except ValueError as e:
+                raise RuntimeError(
+                    'Error: {} is not one of the column headers {}'.format(
+                    args.column, headers))
+            
+            for line in instream:
+                columns = line.split('\t')
+                key = geneid_map.get(columns[0], columns[0])
+                matrix.setdefault(key, []).append(columns[column_to_use])
+
+    logger.info("Loaded %d matrixes in %d seconds",
+                len(quantifications),
+                time.time() - start)
+    return output_headers, matrix
+
+
+def write_merged_matrix(output, headers, matrix, drop_zeros=False):
+    """Save matrix
+
+    Arguments:
+        output (str): output filename or None for stdout
+        headers (list): list of matrix column headers)
+        matrix (dict): gene_name: list of interested
+        drop_zeros (bool): should we drop rows that are all zero?
+    """
+    logger.info("Writing matrix")
+    if output:
+        outstream = open(output, 'wt')
+    else:
+        outstream = sys.stdout
+
+    outstream.write('\t'.join(headers))
+    outstream.write(os.linesep)
+    for key in matrix:
+        columns = matrix[key]
+        
+        # skip over zero rows
+        if drop_zeros:
+            for x in columns:
+                if float(x) != 0:
+                    break
+            else:
+                continue
+            
+        outstream.write(key)
+        outstream.write('\t')
+        outstream.write('\t'.join(matrix[key]))
+        outstream.write(os.linesep)
+
+    if outstream != sys.stdout:
+        outstream.close()
+        
+
+def make_parser():
+    """Build argument parser.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gtf', help='gtf file to load')
+    parser.add_argument('--column', default='FPKM',
+                        help='which column to use')
+    parser.add_argument('-o', '--output',
+                        help='filename to write merged matrix to')
+    parser.add_argument('--no-zeros', default=False, action='store_true',
+                        help='Drop rows that are all zero')
+    parser.add_argument('-v', '--verbose', default=False,
+                        action='store_true',
+                        help='report progress')
+    parser.add_argument('quantifications', nargs='*',
+                        help='list of quantification files to load')
+    
+    return parser
+
+
+def build_geneid_to_gene(gencode):
+    """Build a dictionary mapping from gene_id to gene_name.
+
+    Arguments:
+        gencode (str): compressed filename to read
+
+    Returns:
+        dictionary mapping gene_id to gene_name
+    """
+    logger.info("Loading %s", gencode)
+    start = time.time()
+    names = {}
+    with gzip.GzipFile(gencode, 'r') as instream:
+        for line in instream:
+            line = line.decode('ascii')
+            
+            if line.startswith('#'):
+                continue
+            
+            columns = line.split('\t')
+
+            gene_id = None
+            gene_name = None
+            
+            for item in columns[-1].split(';'):
+                item = item.strip()
+                if len(item) == 0:
+                    continue
+                item = item.split()
+                if len(item) != 2:
+                    print("Confused: {} {}".format(item, len(item)))
+                name, value = item
+                if name == 'gene_id':
+                    gene_id = value[1:-1]
+                elif name == 'gene_name':
+                    gene_name = value[1:-1]
+
+            if gene_id and gene_name:
+                names[gene_id] = gene_name
+
+    logger.info("loaded in %d seconds", time.time() - start)
+    return names
+                        
+
+if __name__ == '__main__':
+    main()
+