From: Diane Trout Date: Fri, 3 Apr 2015 18:32:39 +0000 (-0700) Subject: Rewrite gene id only on output X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=helpful_scripts.git;a=commitdiff_plain;h=c86b0c79429b2ca38dcbd7fa898bcafcb8018863 Rewrite gene id only on output the gene name is now inserted before the gene_id if it's available. --- diff --git a/translate_tsv_genes.py b/translate_tsv_genes.py index a302d65..8b9930a 100644 --- a/translate_tsv_genes.py +++ b/translate_tsv_genes.py @@ -42,21 +42,31 @@ def main(cmdline=None): if not args.quantifications: parser.error("Please list files to extract quantifications from") - output_headers, matrix = load_matrixes(geneid_map, - args.quantifications, + output_headers, matrix = load_matrixes(args.quantifications, args.column) - write_merged_matrix(args.output, output_headers, matrix, args.no_zeros) + if args.output: + outstream = open(args.output, 'wt') + else: + outstream = sys.stdout + write_merged_matrix(outstream, + geneid_map, + output_headers, + matrix, + args.no_zeros) -def load_matrixes(geneid_map, quantifications, column_name): + if args.output: + outstream.close() + +def load_matrixes(quantifications, column_name): """Load a quantification from a list of quantification files. This will also convert through a gene id to gene_name map. if a gene name isn't found, it will default to the gene id. Arguments: - geneid_map (dict): mapping between gene ids and gene names quantifications (list): list of filenames to load from + column_name (str): what column we should be looking for Returns: output_headers (list): list of column headers for matrix @@ -81,7 +91,7 @@ def load_matrixes(geneid_map, quantifications, column_name): for line in instream: columns = line.split('\t') - key = geneid_map.get(columns[0], columns[0]) + key = columns[0] matrix.setdefault(key, []).append(columns[column_to_use]) logger.info("Loaded %d matrixes in %d seconds", @@ -90,20 +100,18 @@ def load_matrixes(geneid_map, quantifications, column_name): return output_headers, matrix -def write_merged_matrix(output, headers, matrix, drop_zeros=False): +def write_merged_matrix(outstream, geneid_map, headers, matrix, + drop_zeros=False): """Save matrix Arguments: - output (str): output filename or None for stdout + outstream (stream): output to write to + geneid_map (dict): gene id to gene name mapping headers (list): list of matrix column headers) matrix (dict): gene_name: list of interested drop_zeros (bool): should we drop rows that are all zero? """ logger.info("Writing matrix") - if output: - outstream = open(output, 'wt') - else: - outstream = sys.stdout outstream.write('\t'.join(headers)) outstream.write(os.linesep) @@ -117,14 +125,17 @@ def write_merged_matrix(output, headers, matrix, drop_zeros=False): break else: continue + + label = [] + gene_name = geneid_map.get(key, None) + if gene_name: + label.append(gene_name) + label.append(key) - outstream.write(key) + outstream.write('-'.join(label)) outstream.write('\t') outstream.write('\t'.join(matrix[key])) outstream.write(os.linesep) - - if outstream != sys.stdout: - outstream.close() def make_parser():