2 """Generate a quantification matrix
4 This is intended to extract one quantification column
5 from each of a set of gene quantification files.
15 logger = logging.getLogger('extractor')
17 def main(cmdline=None):
18 parser = make_parser()
19 args = parser.parse_args(cmdline)
22 logging.basicConfig(level=logging.INFO)
24 geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
26 if not args.quantifications:
27 parser.error("Please list files to extract quantifications from")
29 output_headers, matrix = load_matrixes(geneid_map,
32 write_merged_matrix(args.output, output_headers, matrix, args.no_zeros)
35 def load_matrixes(geneid_map, quantifications, column_name):
36 """Load a quantification from a list of quantification files.
38 This will also convert through a gene id to gene_name map.
39 if a gene name isn't found, it will default to the gene id.
42 geneid_map (dict): mapping between gene ids and gene names
43 quantifications (list): list of filenames to load from
46 output_headers (list): list of column headers for matrix
47 (derived from input filenames)
48 matrix (dict of lists): selected quantification values
51 matrix = collections.OrderedDict()
52 output_headers = ['#genes']
54 for quantification in quantifications:
55 logger.info("Loading %s", quantification)
56 with open(quantification, 'rt') as instream:
57 output_headers.append(os.path.basename(quantification))
58 headers = instream.readline().split('\t')
60 column_to_use = headers.index(column_name)
61 except ValueError as e:
63 'Error: {} is not one of the column headers {}'.format(
64 args.column, headers))
67 columns = line.split('\t')
68 key = geneid_map.get(columns[0], columns[0])
69 matrix.setdefault(key, []).append(columns[column_to_use])
71 logger.info("Loaded %d matrixes in %d seconds",
74 return output_headers, matrix
77 def write_merged_matrix(output, headers, matrix, drop_zeros=False):
81 output (str): output filename or None for stdout
82 headers (list): list of matrix column headers)
83 matrix (dict): gene_name: list of interested
84 drop_zeros (bool): should we drop rows that are all zero?
86 logger.info("Writing matrix")
88 outstream = open(output, 'wt')
90 outstream = sys.stdout
92 outstream.write('\t'.join(headers))
93 outstream.write(os.linesep)
106 outstream.write('\t')
107 outstream.write('\t'.join(matrix[key]))
108 outstream.write(os.linesep)
110 if outstream != sys.stdout:
115 """Build argument parser.
117 parser = argparse.ArgumentParser()
118 parser.add_argument('--gtf', help='gtf file to load')
119 parser.add_argument('--column', default='FPKM',
120 help='which column to use')
121 parser.add_argument('-o', '--output',
122 help='filename to write merged matrix to')
123 parser.add_argument('--no-zeros', default=False, action='store_true',
124 help='Drop rows that are all zero')
125 parser.add_argument('-v', '--verbose', default=False,
127 help='report progress')
128 parser.add_argument('quantifications', nargs='*',
129 help='list of quantification files to load')
134 def build_geneid_to_gene(gencode):
135 """Build a dictionary mapping from gene_id to gene_name.
138 gencode (str): compressed filename to read
141 dictionary mapping gene_id to gene_name
143 logger.info("Loading %s", gencode)
146 with gzip.GzipFile(gencode, 'r') as instream:
147 for line in instream:
148 line = line.decode('ascii')
150 if line.startswith('#'):
153 columns = line.split('\t')
158 for item in columns[-1].split(';'):
164 print("Confused: {} {}".format(item, len(item)))
166 if name == 'gene_id':
167 gene_id = value[1:-1]
168 elif name == 'gene_name':
169 gene_name = value[1:-1]
171 if gene_id and gene_name:
172 names[gene_id] = gene_name
174 logger.info("loaded in %d seconds", time.time() - start)
178 if __name__ == '__main__':