+++ /dev/null
-#!/usr/bin/env python3
-"""Generate a quantification matrix
-
-This is intended to extract one quantification column
-from each of a set of gene quantification files.
-"""
-import argparse
-import collections
-import logging
-import gzip
-import os
-import sys
-import time
-
-logger = logging.getLogger('extractor')
-
-def main(cmdline=None):
- parser = make_parser()
- args = parser.parse_args(cmdline)
-
- if args.verbose:
- logging.basicConfig(level=logging.INFO)
-
- geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
-
- if not args.quantifications:
- parser.error("Please list files to extract quantifications from")
-
- output_headers, matrix = load_matrixes(geneid_map,
- args.quantifications,
- args.column)
- write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
-
-
-def load_matrixes(geneid_map, quantifications, column_name):
- """Load a quantification from a list of quantification files.
-
- This will also convert through a gene id to gene_name map.
- if a gene name isn't found, it will default to the gene id.
-
- Arguments:
- geneid_map (dict): mapping between gene ids and gene names
- quantifications (list): list of filenames to load from
-
- Returns:
- output_headers (list): list of column headers for matrix
- (derived from input filenames)
- matrix (dict of lists): selected quantification values
- by gene.
- """
- matrix = collections.OrderedDict()
- output_headers = ['#genes']
- start = time.time()
- for quantification in quantifications:
- logger.info("Loading %s", quantification)
- with open(quantification, 'rt') as instream:
- output_headers.append(os.path.basename(quantification))
- headers = instream.readline().split('\t')
- try:
- column_to_use = headers.index(column_name)
- except ValueError as e:
- raise RuntimeError(
- 'Error: {} is not one of the column headers {}'.format(
- args.column, headers))
-
- for line in instream:
- columns = line.split('\t')
- key = geneid_map.get(columns[0], columns[0])
- matrix.setdefault(key, []).append(columns[column_to_use])
-
- logger.info("Loaded %d matrixes in %d seconds",
- len(quantifications),
- time.time() - start)
- return output_headers, matrix
-
-
-def write_merged_matrix(output, headers, matrix, drop_zeros=False):
- """Save matrix
-
- Arguments:
- output (str): output filename or None for stdout
- headers (list): list of matrix column headers)
- matrix (dict): gene_name: list of interested
- drop_zeros (bool): should we drop rows that are all zero?
- """
- logger.info("Writing matrix")
- if output:
- outstream = open(output, 'wt')
- else:
- outstream = sys.stdout
-
- outstream.write('\t'.join(headers))
- outstream.write(os.linesep)
- for key in matrix:
- columns = matrix[key]
-
- # skip over zero rows
- if drop_zeros:
- for x in columns:
- if float(x) != 0:
- break
- else:
- continue
-
- outstream.write(key)
- outstream.write('\t')
- outstream.write('\t'.join(matrix[key]))
- outstream.write(os.linesep)
-
- if outstream != sys.stdout:
- outstream.close()
-
-
-def make_parser():
- """Build argument parser.
- """
- parser = argparse.ArgumentParser()
- parser.add_argument('--gtf', help='gtf file to load')
- parser.add_argument('--column', default='FPKM',
- help='which column to use')
- parser.add_argument('-o', '--output',
- help='filename to write merged matrix to')
- parser.add_argument('--no-zeros', default=False, action='store_true',
- help='Drop rows that are all zero')
- parser.add_argument('-v', '--verbose', default=False,
- action='store_true',
- help='report progress')
- parser.add_argument('quantifications', nargs='*',
- help='list of quantification files to load')
-
- return parser
-
-
-def build_geneid_to_gene(gencode):
- """Build a dictionary mapping from gene_id to gene_name.
-
- Arguments:
- gencode (str): compressed filename to read
-
- Returns:
- dictionary mapping gene_id to gene_name
- """
- logger.info("Loading %s", gencode)
- start = time.time()
- names = {}
- with gzip.GzipFile(gencode, 'r') as instream:
- for line in instream:
- line = line.decode('ascii')
-
- if line.startswith('#'):
- continue
-
- columns = line.split('\t')
-
- gene_id = None
- gene_name = None
-
- for item in columns[-1].split(';'):
- item = item.strip()
- if len(item) == 0:
- continue
- item = item.split()
- if len(item) != 2:
- print("Confused: {} {}".format(item, len(item)))
- name, value = item
- if name == 'gene_id':
- gene_id = value[1:-1]
- elif name == 'gene_name':
- gene_name = value[1:-1]
-
- if gene_id and gene_name:
- names[gene_id] = gene_name
-
- logger.info("loaded in %d seconds", time.time() - start)
- return names
-
-
-if __name__ == '__main__':
- main()
-
--- /dev/null
+#!/usr/bin/env python3
+"""Generate a quantification matrix
+
+This is intended to extract one quantification column
+from each of a set of gene quantification files.
+"""
+import argparse
+import collections
+import logging
+import gzip
+import os
+import sys
+import time
+
+logger = logging.getLogger('extractor')
+
+def main(cmdline=None):
+ parser = make_parser()
+ args = parser.parse_args(cmdline)
+
+ if args.verbose:
+ logging.basicConfig(level=logging.INFO)
+
+ geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
+
+ if not args.quantifications:
+ parser.error("Please list files to extract quantifications from")
+
+ output_headers, matrix = load_matrixes(geneid_map,
+ args.quantifications,
+ args.column)
+ write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
+
+
+def load_matrixes(geneid_map, quantifications, column_name):
+ """Load a quantification from a list of quantification files.
+
+ This will also convert through a gene id to gene_name map.
+ if a gene name isn't found, it will default to the gene id.
+
+ Arguments:
+ geneid_map (dict): mapping between gene ids and gene names
+ quantifications (list): list of filenames to load from
+
+ Returns:
+ output_headers (list): list of column headers for matrix
+ (derived from input filenames)
+ matrix (dict of lists): selected quantification values
+ by gene.
+ """
+ matrix = collections.OrderedDict()
+ output_headers = ['#genes']
+ start = time.time()
+ for quantification in quantifications:
+ logger.info("Loading %s", quantification)
+ with open(quantification, 'rt') as instream:
+ output_headers.append(os.path.basename(quantification))
+ headers = instream.readline().split('\t')
+ try:
+ column_to_use = headers.index(column_name)
+ except ValueError as e:
+ raise RuntimeError(
+ 'Error: {} is not one of the column headers {}'.format(
+ args.column, headers))
+
+ for line in instream:
+ columns = line.split('\t')
+ key = geneid_map.get(columns[0], columns[0])
+ matrix.setdefault(key, []).append(columns[column_to_use])
+
+ logger.info("Loaded %d matrixes in %d seconds",
+ len(quantifications),
+ time.time() - start)
+ return output_headers, matrix
+
+
+def write_merged_matrix(output, headers, matrix, drop_zeros=False):
+ """Save matrix
+
+ Arguments:
+ output (str): output filename or None for stdout
+ headers (list): list of matrix column headers)
+ matrix (dict): gene_name: list of interested
+ drop_zeros (bool): should we drop rows that are all zero?
+ """
+ logger.info("Writing matrix")
+ if output:
+ outstream = open(output, 'wt')
+ else:
+ outstream = sys.stdout
+
+ outstream.write('\t'.join(headers))
+ outstream.write(os.linesep)
+ for key in matrix:
+ columns = matrix[key]
+
+ # skip over zero rows
+ if drop_zeros:
+ for x in columns:
+ if float(x) != 0:
+ break
+ else:
+ continue
+
+ outstream.write(key)
+ outstream.write('\t')
+ outstream.write('\t'.join(matrix[key]))
+ outstream.write(os.linesep)
+
+ if outstream != sys.stdout:
+ outstream.close()
+
+
+def make_parser():
+ """Build argument parser.
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--gtf', help='gtf file to load')
+ parser.add_argument('--column', default='FPKM',
+ help='which column to use')
+ parser.add_argument('-o', '--output',
+ help='filename to write merged matrix to')
+ parser.add_argument('--no-zeros', default=False, action='store_true',
+ help='Drop rows that are all zero')
+ parser.add_argument('-v', '--verbose', default=False,
+ action='store_true',
+ help='report progress')
+ parser.add_argument('quantifications', nargs='*',
+ help='list of quantification files to load')
+
+ return parser
+
+
+def build_geneid_to_gene(gencode):
+ """Build a dictionary mapping from gene_id to gene_name.
+
+ Arguments:
+ gencode (str): compressed filename to read
+
+ Returns:
+ dictionary mapping gene_id to gene_name
+ """
+ logger.info("Loading %s", gencode)
+ start = time.time()
+ names = {}
+ with gzip.GzipFile(gencode, 'r') as instream:
+ for line in instream:
+ line = line.decode('ascii')
+
+ if line.startswith('#'):
+ continue
+
+ columns = line.split('\t')
+
+ gene_id = None
+ gene_name = None
+
+ for item in columns[-1].split(';'):
+ item = item.strip()
+ if len(item) == 0:
+ continue
+ item = item.split()
+ if len(item) != 2:
+ print("Confused: {} {}".format(item, len(item)))
+ name, value = item
+ if name == 'gene_id':
+ gene_id = value[1:-1]
+ elif name == 'gene_name':
+ gene_name = value[1:-1]
+
+ if gene_id and gene_name:
+ names[gene_id] = gene_name
+
+ logger.info("loaded in %d seconds", time.time() - start)
+ return names
+
+
+if __name__ == '__main__':
+ main()
+