2 """Generate a quantification matrix
4 This is intended to extract one quantification column
5 from each of a set of gene quantification files.
7 # Copyright (2015) Diane Trout & California Institute of Technology
9 # This program is free software; you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation; either version 2 of the License, or
12 # (at your option) any later version.
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License along
20 # with this program; if not, write to the Free Software Foundation, Inc.,
21 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
28 from os.path import basename
33 logger = logging.getLogger('extractor')
35 def main(cmdline=None):
36 parser = make_parser()
37 args = parser.parse_args(cmdline)
40 logging.basicConfig(level=logging.INFO)
42 geneid_map = build_geneid_to_gene(args.gtf) if args.gtf else {}
44 if not args.quantifications:
45 parser.error("Please list files to extract quantifications from")
47 matrix = load_matrixes(args.quantifications, args.column)
49 matrix = filter_matrix(matrix, args.no_zeros)
52 outstream = open(args.output, 'wt')
54 outstream = sys.stdout
56 write_merged_matrix(outstream, geneid_map, matrix)
62 def load_matrixes(quantifications, column_name):
63 """Load a quantification from a list of quantification files.
65 This will also convert through a gene id to gene_name map.
66 if a gene name isn't found, it will default to the gene id.
69 quantifications (list): list of filenames to load from
70 column_name (str): what column we should be looking for
73 matrix (pandas.DataFrame): selected quantification values
76 columns = collections.OrderedDict()
78 for quantification in quantifications:
79 name = basename(getattr(quantification, 'name', quantification))
80 logger.info("Loading %s", name)
81 table = pandas.read_csv(quantification, index_col=0, sep='\t')
82 columns[name] = table[column_name]
84 matrix = pandas.DataFrame(columns, columns=columns.keys())
86 logger.info("Loaded %d matrixes in %d seconds",
92 def filter_matrix(matrix, drop_zeros):
93 """apply transformations to matrix
95 Should we drop rows with all zero or NaN?
98 matrix (pandas.DataFrame): source matrix
99 drop_zeros (bool): should we drop rows that are all zero?
102 matrix = matrix[matrix > 0].dropna(how='all')
104 return matrix.fillna(0)
107 def write_merged_matrix(outstream, geneid_map, matrix,
112 outstream (stream): output to write to
113 geneid_map (dict): gene id to gene name mapping
114 matrix (dict): gene_name: list of interested
116 logger.info("Writing matrix")
118 headers = ['#gene_id']
119 headers.extend(matrix.keys())
120 outstream.write('\t'.join(headers))
121 outstream.write(os.linesep)
123 for index in matrix.index:
125 gene_name = geneid_map.get(index, None)
127 label.append(gene_name)
130 outstream.write('-'.join(label))
131 outstream.write('\t')
132 outstream.write('\t'.join((str(x) for x in matrix.ix[index])))
133 outstream.write(os.linesep)
137 """Build argument parser.
139 parser = argparse.ArgumentParser()
140 parser.add_argument('--gtf', help='gtf file to load')
141 parser.add_argument('--column', default='FPKM',
142 help='which column to use')
143 parser.add_argument('-o', '--output',
144 help='filename to write merged matrix to')
145 parser.add_argument('--no-zeros', default=False, action='store_true',
146 help='Drop rows that are all zero')
147 parser.add_argument('-v', '--verbose', default=False,
149 help='report progress')
150 parser.add_argument('quantifications', nargs='*',
151 help='list of quantification files to load')
156 def build_geneid_to_gene(gencode):
157 """Build a dictionary mapping from gene_id to gene_name.
160 gencode (str): compressed filename to read
163 dictionary mapping gene_id to gene_name
165 logger.info("Loading %s", gencode)
168 with gzip.GzipFile(gencode, 'r') as instream:
169 for line in instream:
170 line = line.decode('ascii')
172 if line.startswith('#'):
175 columns = line.split('\t')
180 for item in columns[-1].split(';'):
186 print("Confused: {} {}".format(item, len(item)))
188 if name == 'gene_id':
189 gene_id = value[1:-1]
190 elif name == 'gene_name':
191 gene_name = value[1:-1]
193 if gene_id and gene_name:
194 names[gene_id] = gene_name
196 logger.info("loaded in %d seconds", time.time() - start)
200 if __name__ == '__main__':