snapshot of 4.0a development. initial git repo commit
[erange.git] / scatterfields.py
1 """
2     usage: python scatterfields.py infilename xaxisLabel xField yaxisLabel yField outImageName [--xmin xMin] [--ymin yMin]
3                   [--xmax xMax] [--ymax yMax] [--doLogF1] [--doLogF2] [--arcsinh] [--order polyOrder] [--base logBase]
4                   [--markGenes geneFile] [--markfold times] [--noregression] [--large] [--markdiag] [--title text] [--verbose]
5
6            Do a scatter plot of 2 fields from an input file.
7            fields are counted from 0.
8            use [-order polyOrder] to specify polynomial fits > 1
9            Supports very rudimentary compound fields for X value
10            using python's lambda functions (omit the keyword lambda)
11 """
12
13 import matplotlib
14 matplotlib.use("Agg")
15
16 from pylab import *
17 import math, cmath
18 import sys
19 import optparse
20
21 alphaVal = 0.5
22
23 print "%prog: version 3.1"
24
25 def main(argv=None):
26     if not argv:
27         argv = sys.argv
28
29     usage = __doc__
30
31     parser = optparse.OptionParser(usage=usage)
32     parser.add_option("--xmin", type="float", dest="forcexmin")
33     parser.add_option("--ymin", type="float", dest="forceymin")
34     parser.add_option("--xmax", type="float", dest="forcexmax")
35     parser.add_option("--ymax", type="float", dest="forceymax")
36     parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
37     parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
38     parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
39     parser.add_option("--order", type="int", dest="fitOrder")
40     parser.add_option("--base", type="int", dest="base")
41     parser.add_option("--markGenes", dest="markFile")
42     parser.add_option("--markfold", type="float", dest="foldChange")
43     parser.add_option("--noregression", action="store_false", dest="doRegression")
44     parser.add_option("--large", action="store_true", dest="plotLarge")
45     parser.add_option("--markdiag", action="store_true", dest="markDiag")
46     parser.add_option("--title", type="int", dest="figtitle")
47     parser.add_option("--verbose", action="store_true", dest="verbose")
48     parser.set_defaults(forcexmin=0.0, forceymin=0.0, forcexmax=-1, forceymax=-1, doLogF1=False,
49                         doLogF2=False, doArcsinh=False, fitOrder=1, base=10, markFile=None,
50                         foldChange=None, doRegression=True, plotLarge=False, markDiag=False,
51                         figtitle="", verbose=False)
52
53     (options, args) = parser.parse_args(argv[1:])
54
55     if len(args) < 6:
56         print usage
57         sys.exit(1)
58
59     infile = open(args[0])
60     xaxis = args[1]
61     xField = args[2]
62     yaxis = args[3]
63     yField = int(args[4])
64     outfilename = args[5]
65
66     scatterfields(infile, xaxis, xField, yaxis, yField, outfilename, options.forcexmin,
67                   options.forceymin, options.forcexmax, options.forceymax, options.doLogF1,
68                   options.doLogF2, options.doArcsinh, options.fitOrder, options.base,
69                   options.markFile, options.foldChange, options.doRegression, options.plotLarge,
70                   options.markDiag, options.figtitle, options.verbose)
71
72
73 def scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, forcexmin=0.0, forceymin=0.0,
74                   forcexmax=-1, forceymax=-1, doLogF1=False, doLogF2=False, doArcsinh=False, fitOrder=1,
75                   base=10, markFile=None, foldChange=None, doRegression=True, plotLarge=False,
76                   markDiag=False, figtitle="", verbose=False):
77
78     infile = open(infilename)
79     compoundField = False
80     try:
81         xField = int(xField)
82     except:
83         try:
84             compoundOp = "lambda %s" % xField
85             operator = eval(compoundOp)
86             compoundField = True
87             print "compound field %s" % xField
88         except:
89             pass
90
91         if not compoundField:
92             print "expression %s not supported" % xField
93             sys.exit(1)
94
95     markedGenes = []
96     marking = False
97     if markFile is not None:
98         for line in markFile:
99             try:
100                 markedGenes.append(line.strip().split()[0].upper())
101             except:
102                 markedGenes.append(line.strip().upper())
103        
104         markFile.close()
105         marking = True
106
107     markFold = False
108     if foldChange is not None:
109         markFold = True
110
111     newscores = []
112     oldscores = []
113
114     markednewscores = []
115     markedoldscores = []
116
117     markedfoldnewscores = []
118     markedfoldoldscores = []
119
120     ymax = 0.
121     xmax = 0.
122     for line in infile:
123         fields = line.strip().split()
124         gene = fields[0]
125         try:
126             if compoundField:
127                 score = operator(fields)
128             else:
129                 score = float(fields[xField])
130
131             newscore = float(fields[yField])
132         except:
133             continue
134
135         foldMarkThisScore = False
136         if markFold:
137             tempscore = score
138             if tempscore == 0:
139                 tempscore = 0.03
140
141             tempratio = newscore / tempscore
142             if tempratio == 0:
143                 tempratio2 = tempscore / 0.03
144             else:
145                 tempratio2 = 1. / tempratio
146
147             if tempratio > foldChange or tempratio2 > foldChange:
148                 foldMarkThisScore = True
149
150         if doArcsinh:
151             score = abs(cmath.asinh(score))
152         elif doLogF1:
153             try:
154                 score = math.log(score, base)
155             except:
156                 score = forcexmin
157
158             if score > xmax:
159                 xmax = score
160
161         if doArcsinh:
162             newscore = abs(cmath.asinh(newscore))
163         elif doLogF2:
164             try:
165                 newscore = math.log(newscore, base)
166             except:
167                 newscore = forceymin
168
169             if newscore > ymax:
170                 ymax = newscore
171
172         oldscores.append(score)
173         newscores.append(newscore)
174         if foldMarkThisScore:
175             markedfoldoldscores.append(score)
176             markedfoldnewscores.append(newscore)
177             if marking and gene.upper() not in markedGenes:
178                 print gene, score, newscore, "unmarked"
179
180             if gene.upper() in markedGenes:
181                 print gene, score, newscore, "overfold"
182
183             if verbose:
184                 print len(markedfoldoldscores), line.strip()
185
186         if gene.upper() in markedGenes:
187             if not foldMarkThisScore:
188                 print gene, score, newscore
189
190             markedoldscores.append(score)
191             markednewscores.append(newscore)
192
193     print score, newscore
194     print fields
195
196     if plotLarge and markFold:
197         plot(oldscores, newscores, "^", markersize=10., color="0.75", alpha=alphaVal)
198     elif plotLarge:
199         plot(oldscores, newscores, "b^", markersize=10., alpha=alphaVal)
200     elif markFold:
201         plot(oldscores, newscores, ",", color="0.75", alpha=alphaVal)
202     else:
203         plot(oldscores, newscores, "b,", alpha=alphaVal)
204
205     if len(markedfoldoldscores) > 0:
206         if plotLarge:
207             plot(markedfoldoldscores, markedfoldnewscores, "b^", markersize=10., alpha=alphaVal)
208         else:
209             plot(markedfoldoldscores, markedfoldnewscores, "b,", alpha=alphaVal)
210
211     if len(markedoldscores) > 0:
212         if plotLarge:
213             plot(markedoldscores, markednewscores, "r^", color="red", markersize=10., alpha=alphaVal)
214         else:
215             plot(markedoldscores, markednewscores, ".", color="red", markersize=4., alpha=alphaVal)
216
217     fitvalues = polyfit(oldscores, newscores, fitOrder)
218     print fitvalues
219     print len(oldscores)
220
221     meanObserved = float(sum(newscores)) / len(newscores)
222     if len(fitvalues) == 2:
223         predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores]
224     else:
225         predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores]
226
227     SSt = 0.
228     SSe = 0.
229
230     for index in range(len(newscores)):
231         SSt += (newscores[index] - meanObserved) ** 2
232         SSe += (newscores[index] - predicted[index]) ** 2
233
234     rSquared = 1. - SSe / SSt
235     print "R**2 = %f" % rSquared
236
237     oldscores.sort()
238     if len(fitvalues) == 2:
239         predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores]
240     else:
241         predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores]
242
243     if doRegression:
244         plot(oldscores, predicted, "-k", linewidth=2)
245
246     if figtitle == "":
247         figtitle = "%s vs %s (R^2: %.2f)" % (yaxis, xaxis, rSquared)
248
249     title(figtitle)
250
251     if markDiag:
252         min = forcexmin
253         if forceymin < min:
254             min = forceymin
255
256         max = xmax
257         if ymax > max:
258             max = ymax
259
260         if forcexmax > max:
261             max = forcexmax
262
263         if forceymax > max:
264             max = forceymax
265
266         plot([min,max], [min,max], "-g", linewidth=2)
267
268     print forcexmin, forceymin
269
270     if doLogF2:
271         ylabel("log%s(%s)" % (str(base), yaxis))
272     else:
273         ylabel(yaxis)
274
275     if doLogF1:
276         xlabel("log%s(%s)" % (str(base), xaxis))
277     else:
278         xlabel(xaxis)
279
280     if xmax > 0:
281         xlim(forcexmin - 0.05, xmax)
282
283     if ymax > 0:
284         ylim(forceymin - 0.05, ymax)
285
286     if forcexmax > 0 and forceymax > 0:
287         xlim(forcexmin - 0.05, forcexmax)
288         ylim(forceymin - 0.05, forceymax)
289
290     gca().get_xaxis().tick_bottom()
291     gca().get_yaxis().tick_left()
292
293     savefig(outfilename, dpi=100)
294
295
296 if __name__ == "__main__":
297     main(sys.argv)