2 """Generate a quantification matrix
4 This is intended to extract one quantification column
5 from each of a set of gene quantification files.
7 # Copyright (2015) Diane Trout & California Institute of Technology
9 # This program is free software; you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation; either version 2 of the License, or
12 # (at your option) any later version.
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License along
20 # with this program; if not, write to the Free Software Foundation, Inc.,
21 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
31 logger = logging.getLogger('extractor')
33 def main(cmdline=None):
34 parser = make_parser()
35 args = parser.parse_args(cmdline)
38 logging.basicConfig(level=logging.INFO)
40 geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
42 if not args.quantifications:
43 parser.error("Please list files to extract quantifications from")
45 output_headers, matrix = load_matrixes(args.quantifications,
48 outstream = open(args.output, 'wt')
50 outstream = sys.stdout
52 write_merged_matrix(outstream,
61 def load_matrixes(quantifications, column_name):
62 """Load a quantification from a list of quantification files.
64 This will also convert through a gene id to gene_name map.
65 if a gene name isn't found, it will default to the gene id.
68 quantifications (list): list of filenames to load from
69 column_name (str): what column we should be looking for
72 output_headers (list): list of column headers for matrix
73 (derived from input filenames)
74 matrix (dict of lists): selected quantification values
77 matrix = collections.OrderedDict()
78 output_headers = ['#genes']
80 for quantification in quantifications:
81 logger.info("Loading %s", quantification)
82 with open(quantification, 'rt') as instream:
83 output_headers.append(os.path.basename(quantification))
84 headers = instream.readline().split('\t')
86 column_to_use = headers.index(column_name)
87 except ValueError as e:
89 'Error: {} is not one of the column headers {}'.format(
90 args.column, headers))
93 columns = line.split('\t')
95 matrix.setdefault(key, []).append(columns[column_to_use])
97 logger.info("Loaded %d matrixes in %d seconds",
100 return output_headers, matrix
103 def write_merged_matrix(outstream, geneid_map, headers, matrix,
108 outstream (stream): output to write to
109 geneid_map (dict): gene id to gene name mapping
110 headers (list): list of matrix column headers)
111 matrix (dict): gene_name: list of interested
112 drop_zeros (bool): should we drop rows that are all zero?
114 logger.info("Writing matrix")
116 outstream.write('\t'.join(headers))
117 outstream.write(os.linesep)
119 columns = matrix[key]
121 # skip over zero rows
130 gene_name = geneid_map.get(key, None)
132 label.append(gene_name)
135 outstream.write('-'.join(label))
136 outstream.write('\t')
137 outstream.write('\t'.join(matrix[key]))
138 outstream.write(os.linesep)
142 """Build argument parser.
144 parser = argparse.ArgumentParser()
145 parser.add_argument('--gtf', help='gtf file to load')
146 parser.add_argument('--column', default='FPKM',
147 help='which column to use')
148 parser.add_argument('-o', '--output',
149 help='filename to write merged matrix to')
150 parser.add_argument('--no-zeros', default=False, action='store_true',
151 help='Drop rows that are all zero')
152 parser.add_argument('-v', '--verbose', default=False,
154 help='report progress')
155 parser.add_argument('quantifications', nargs='*',
156 help='list of quantification files to load')
161 def build_geneid_to_gene(gencode):
162 """Build a dictionary mapping from gene_id to gene_name.
165 gencode (str): compressed filename to read
168 dictionary mapping gene_id to gene_name
170 logger.info("Loading %s", gencode)
173 with gzip.GzipFile(gencode, 'r') as instream:
174 for line in instream:
175 line = line.decode('ascii')
177 if line.startswith('#'):
180 columns = line.split('\t')
185 for item in columns[-1].split(';'):
191 print("Confused: {} {}".format(item, len(item)))
193 if name == 'gene_id':
194 gene_id = value[1:-1]
195 elif name == 'gene_name':
196 gene_name = value[1:-1]
198 if gene_id and gene_name:
199 names[gene_id] = gene_name
201 logger.info("loaded in %d seconds", time.time() - start)
205 if __name__ == '__main__':