Rewrite gene id only on output
author Diane Trout <diane@ghic.org>
Fri, 3 Apr 2015 18:32:39 +0000 (11:32 -0700)
committer Diane Trout <diane@ghic.org>
Fri, 3 Apr 2015 18:32:39 +0000 (11:32 -0700)
The gene name, if available in the gene id map, is now inserted before the gene_id (joined with '-') when each row is written; otherwise only the gene_id is written.

translate_tsv_genes.py

index a302d654c6450e9ca4056d54e8b0a31670fc06b0..8b9930a3975f0b167d27c70a1a347a5363e96311 100644 (file)
@@ -42,21 +42,31 @@ def main(cmdline=None):
     if not args.quantifications:
         parser.error("Please list files to extract quantifications from")
         
     if not args.quantifications:
         parser.error("Please list files to extract quantifications from")
         
-    output_headers, matrix = load_matrixes(geneid_map,
-                                           args.quantifications,
+    output_headers, matrix = load_matrixes(args.quantifications,
                                            args.column)
                                            args.column)
-    write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
+    if args.output:
+        outstream = open(args.output, 'wt')
+    else:
+        outstream = sys.stdout
 
 
+    write_merged_matrix(outstream,
+                        geneid_map,
+                        output_headers,
+                        matrix,
+                        args.no_zeros)
 
 
-def load_matrixes(geneid_map, quantifications, column_name):
+    if args.output:
+        outstream.close()
+
+def load_matrixes(quantifications, column_name):
     """Load a quantification from a list of quantification files.
 
     This will also convert through a gene id to gene_name map.
     if a gene name isn't found, it will default to the gene id.
 
     Arguments:
     """Load a quantification from a list of quantification files.
 
     This will also convert through a gene id to gene_name map.
     if a gene name isn't found, it will default to the gene id.
 
     Arguments:
-        geneid_map (dict): mapping between gene ids and gene names
         quantifications (list): list of filenames to load from
         quantifications (list): list of filenames to load from
+        column_name (str): what column we should be looking for
 
     Returns:
         output_headers (list): list of column headers for matrix
 
     Returns:
         output_headers (list): list of column headers for matrix
@@ -81,7 +91,7 @@ def load_matrixes(geneid_map, quantifications, column_name):
             
             for line in instream:
                 columns = line.split('\t')
             
             for line in instream:
                 columns = line.split('\t')
-                key = geneid_map.get(columns[0], columns[0])
+                key = columns[0]
                 matrix.setdefault(key, []).append(columns[column_to_use])
 
     logger.info("Loaded %d matrixes in %d seconds",
                 matrix.setdefault(key, []).append(columns[column_to_use])
 
     logger.info("Loaded %d matrixes in %d seconds",
@@ -90,20 +100,18 @@ def load_matrixes(geneid_map, quantifications, column_name):
     return output_headers, matrix
 
 
     return output_headers, matrix
 
 
-def write_merged_matrix(output, headers, matrix, drop_zeros=False):
+def write_merged_matrix(outstream, geneid_map, headers, matrix,
+                        drop_zeros=False):
     """Save matrix
 
     Arguments:
     """Save matrix
 
     Arguments:
-        output (str): output filename or None for stdout
+        outstream (stream): output to write to
+        geneid_map (dict): gene id to gene name mapping
         headers (list): list of matrix column headers)
         matrix (dict): gene_name: list of interested
         drop_zeros (bool): should we drop rows that are all zero?
     """
     logger.info("Writing matrix")
         headers (list): list of matrix column headers)
         matrix (dict): gene_name: list of interested
         drop_zeros (bool): should we drop rows that are all zero?
     """
     logger.info("Writing matrix")
-    if output:
-        outstream = open(output, 'wt')
-    else:
-        outstream = sys.stdout
 
     outstream.write('\t'.join(headers))
     outstream.write(os.linesep)
 
     outstream.write('\t'.join(headers))
     outstream.write(os.linesep)
@@ -117,14 +125,17 @@ def write_merged_matrix(output, headers, matrix, drop_zeros=False):
                     break
             else:
                 continue
                     break
             else:
                 continue
+
+        label = []
+        gene_name = geneid_map.get(key, None)
+        if gene_name:
+            label.append(gene_name)
+        label.append(key)
             
             
-        outstream.write(key)
+        outstream.write('-'.join(label))
         outstream.write('\t')
         outstream.write('\t'.join(matrix[key]))
         outstream.write(os.linesep)
         outstream.write('\t')
         outstream.write('\t'.join(matrix[key]))
         outstream.write(os.linesep)
-
-    if outstream != sys.stdout:
-        outstream.close()
         
 
 def make_parser():
         
 
 def make_parser():