first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / scatterfields.py
1 """
2     usage: python scatterfields.py infilename xaxisLabel xField yaxisLabel yField outImageName [--xmin xMin] [--ymin yMin]
3                   [--xmax xMax] [--ymax yMax] [--doLogF1] [--doLogF2] [--arcsinh] [--order polyOrder] [--base logBase]
4                   [--markGenes geneFile] [--markfold times] [--noregression] [--large] [--markdiag] [--title text] [--verbose]
5
6            Do a scatter plot of 2 fields from an input file.
7            fields are counted from 0.
8            use [-order polyOrder] to specify polynomial fits > 1
9            Supports very rudimentary compound fields for X value
10            using python's lambda functions (omit the keyword lambda)
11 """
12
13 import matplotlib
14 matplotlib.use("Agg")
15
16 from pylab import *
17 import math, cmath
18 import sys
19 import optparse
20 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
21
22 alphaVal = 0.5
23
24 print "scatterfields: version 3.2"
25
26 def main(argv=None):
27     if not argv:
28         argv = sys.argv
29
30     usage = __doc__
31
32     parser = getParser(usage)
33     (options, args) = parser.parse_args(argv[1:])
34
35     if len(args) < 6:
36         print usage
37         sys.exit(1)
38
39     infile = open(args[0])
40     xaxis = args[1]
41     xField = args[2]
42     yaxis = args[3]
43     yField = int(args[4])
44     outfilename = args[5]
45
46     scatterfields(infile, xaxis, xField, yaxis, yField, outfilename, options.forcexmin,
47                   options.forceymin, options.forcexmax, options.forceymax, options.doLogF1,
48                   options.doLogF2, options.doArcsinh, options.fitOrder, options.base,
49                   options.markFile, options.foldChange, options.doRegression, options.plotLarge,
50                   options.markDiag, options.figtitle, options.verbose)
51
52
53 def getParser(usage):
54     parser = optparse.OptionParser(usage=usage)
55     parser.add_option("--xmin", type="float", dest="forcexmin")
56     parser.add_option("--ymin", type="float", dest="forceymin")
57     parser.add_option("--xmax", type="float", dest="forcexmax")
58     parser.add_option("--ymax", type="float", dest="forceymax")
59     parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
60     parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
61     parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
62     parser.add_option("--order", type="int", dest="fitOrder")
63     parser.add_option("--base", type="int", dest="base")
64     parser.add_option("--markGenes", dest="markFile")
65     parser.add_option("--markfold", type="float", dest="foldChange")
66     parser.add_option("--noregression", action="store_false", dest="doRegression")
67     parser.add_option("--large", action="store_true", dest="plotLarge")
68     parser.add_option("--markdiag", action="store_true", dest="markDiag")
69     parser.add_option("--title", type="int", dest="figtitle")
70     parser.add_option("--verbose", action="store_true", dest="verbose")
71
72     configParser = getConfigParser()
73     section = "scatterfields"
74     forcexmin = getConfigFloatOption(configParser, section, "forcexmin", 0.0)
75     forceymin = getConfigFloatOption(configParser, section, "forceymin", 0.0)
76     forcexmax = getConfigIntOption(configParser, section, "forcexmax", -1)
77     forceymax = getConfigIntOption(configParser, section, "forceymax", -1)
78     doLogF1 = getConfigBoolOption(configParser, section, "doLogF1", False)
79     doLogF2 = getConfigBoolOption(configParser, section, "doLogF2", False)
80     doArcsinh = getConfigBoolOption(configParser, section, "doArcsinh", False)
81     fitOrder = getConfigIntOption(configParser, section, "fitOrder", 1)
82     base = getConfigIntOption(configParser, section, "base", 10)
83     markFile = getConfigOption(configParser, section, "markFile", None)
84     foldChange = getConfigOption(configParser, section, "foldChange", None)
85     doRegression = getConfigBoolOption(configParser, section, "doRegression", True)
86     plotLarge = getConfigBoolOption(configParser, section, "plotLarge", False)
87     markDiag = getConfigBoolOption(configParser, section, "markDiag", False)
88     figtitle = getConfigOption(configParser, section, "figtitle", "")
89     verbose = getConfigBoolOption(configParser, section, "verbose", False)
90
91     parser.set_defaults(forcexmin=forcexmin, forceymin=forceymin, forcexmax=forcexmax, forceymax=forceymax, doLogF1=doLogF1,
92                         doLogF2=doLogF2, doArcsinh=doArcsinh, fitOrder=fitOrder, base=base, markFile=markFile,
93                         foldChange=foldChange, doRegression=doRegression, plotLarge=plotLarge, markDiag=markDiag,
94                         figtitle=figtitle, verbose=verbose)
95
96     return parser
97
98
99 def scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, forcexmin=0.0, forceymin=0.0,
100                   forcexmax=-1, forceymax=-1, doLogF1=False, doLogF2=False, doArcsinh=False, fitOrder=1,
101                   base=10, markFile=None, foldChange=None, doRegression=True, plotLarge=False,
102                   markDiag=False, figtitle="", verbose=False):
103
104     infile = open(infilename)
105     compoundField = False
106     try:
107         xField = int(xField)
108     except:
109         try:
110             compoundOp = "lambda %s" % xField
111             operator = eval(compoundOp)
112             compoundField = True
113             print "compound field %s" % xField
114         except:
115             pass
116
117         if not compoundField:
118             print "expression %s not supported" % xField
119             sys.exit(1)
120
121     markedGenes = []
122     marking = False
123     if markFile is not None:
124         for line in markFile:
125             try:
126                 markedGenes.append(line.strip().split()[0].upper())
127             except:
128                 markedGenes.append(line.strip().upper())
129        
130         markFile.close()
131         marking = True
132
133     markFold = False
134     if foldChange is not None:
135         markFold = True
136
137     newscores = []
138     oldscores = []
139
140     markednewscores = []
141     markedoldscores = []
142
143     markedfoldnewscores = []
144     markedfoldoldscores = []
145
146     ymax = 0.
147     xmax = 0.
148     for line in infile:
149         fields = line.strip().split()
150         gene = fields[0]
151         try:
152             if compoundField:
153                 score = operator(fields)
154             else:
155                 score = float(fields[xField])
156
157             newscore = float(fields[yField])
158         except:
159             continue
160
161         foldMarkThisScore = False
162         if markFold:
163             tempscore = score
164             if tempscore == 0:
165                 tempscore = 0.03
166
167             tempratio = newscore / tempscore
168             if tempratio == 0:
169                 tempratio2 = tempscore / 0.03
170             else:
171                 tempratio2 = 1. / tempratio
172
173             if tempratio > foldChange or tempratio2 > foldChange:
174                 foldMarkThisScore = True
175
176         if doArcsinh:
177             score = abs(cmath.asinh(score))
178         elif doLogF1:
179             try:
180                 score = math.log(score, base)
181             except:
182                 score = forcexmin
183
184             if score > xmax:
185                 xmax = score
186
187         if doArcsinh:
188             newscore = abs(cmath.asinh(newscore))
189         elif doLogF2:
190             try:
191                 newscore = math.log(newscore, base)
192             except:
193                 newscore = forceymin
194
195             if newscore > ymax:
196                 ymax = newscore
197
198         oldscores.append(score)
199         newscores.append(newscore)
200         if foldMarkThisScore:
201             markedfoldoldscores.append(score)
202             markedfoldnewscores.append(newscore)
203             if marking and gene.upper() not in markedGenes:
204                 print gene, score, newscore, "unmarked"
205
206             if gene.upper() in markedGenes:
207                 print gene, score, newscore, "overfold"
208
209             if verbose:
210                 print len(markedfoldoldscores), line.strip()
211
212         if gene.upper() in markedGenes:
213             if not foldMarkThisScore:
214                 print gene, score, newscore
215
216             markedoldscores.append(score)
217             markednewscores.append(newscore)
218
219     print score, newscore
220     print fields
221
222     if plotLarge and markFold:
223         plot(oldscores, newscores, "^", markersize=10., color="0.75", alpha=alphaVal)
224     elif plotLarge:
225         plot(oldscores, newscores, "b^", markersize=10., alpha=alphaVal)
226     elif markFold:
227         plot(oldscores, newscores, ",", color="0.75", alpha=alphaVal)
228     else:
229         plot(oldscores, newscores, "b,", alpha=alphaVal)
230
231     if len(markedfoldoldscores) > 0:
232         if plotLarge:
233             plot(markedfoldoldscores, markedfoldnewscores, "b^", markersize=10., alpha=alphaVal)
234         else:
235             plot(markedfoldoldscores, markedfoldnewscores, "b,", alpha=alphaVal)
236
237     if len(markedoldscores) > 0:
238         if plotLarge:
239             plot(markedoldscores, markednewscores, "r^", color="red", markersize=10., alpha=alphaVal)
240         else:
241             plot(markedoldscores, markednewscores, ".", color="red", markersize=4., alpha=alphaVal)
242
243     fitvalues = polyfit(oldscores, newscores, fitOrder)
244     print fitvalues
245     print len(oldscores)
246
247     meanObserved = float(sum(newscores)) / len(newscores)
248     if len(fitvalues) == 2:
249         predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores]
250     else:
251         predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores]
252
253     SSt = 0.
254     SSe = 0.
255
256     for index in range(len(newscores)):
257         SSt += (newscores[index] - meanObserved) ** 2
258         SSe += (newscores[index] - predicted[index]) ** 2
259
260     rSquared = 1. - SSe / SSt
261     print "R**2 = %f" % rSquared
262
263     oldscores.sort()
264     if len(fitvalues) == 2:
265         predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores]
266     else:
267         predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores]
268
269     if doRegression:
270         plot(oldscores, predicted, "-k", linewidth=2)
271
272     if figtitle == "":
273         figtitle = "%s vs %s (R^2: %.2f)" % (yaxis, xaxis, rSquared)
274
275     title(figtitle)
276
277     if markDiag:
278         min = forcexmin
279         if forceymin < min:
280             min = forceymin
281
282         max = xmax
283         if ymax > max:
284             max = ymax
285
286         if forcexmax > max:
287             max = forcexmax
288
289         if forceymax > max:
290             max = forceymax
291
292         plot([min,max], [min,max], "-g", linewidth=2)
293
294     print forcexmin, forceymin
295
296     if doLogF2:
297         ylabel("log%s(%s)" % (str(base), yaxis))
298     else:
299         ylabel(yaxis)
300
301     if doLogF1:
302         xlabel("log%s(%s)" % (str(base), xaxis))
303     else:
304         xlabel(xaxis)
305
306     if xmax > 0:
307         xlim(forcexmin - 0.05, xmax)
308
309     if ymax > 0:
310         ylim(forceymin - 0.05, ymax)
311
312     if forcexmax > 0 and forceymax > 0:
313         xlim(forcexmin - 0.05, forcexmax)
314         ylim(forceymin - 0.05, forceymax)
315
316     gca().get_xaxis().tick_bottom()
317     gca().get_yaxis().tick_left()
318
319     savefig(outfilename, dpi=100)
320
321
322 if __name__ == "__main__":
323     main(sys.argv)