erange version 4.0a dev release
[erange.git] / getallNRSE.py
1 import sys, optparse
2
3 try:
4     import psyco
5     psyco.full()
6 except:
7     print 'psyco not running'
8
9 from cistematic.core import complement
10 from cistematic.core.motif import Motif
11 from cistematic.genomes import Genome
12 from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption, getConfigFloatOption
13 import ReadDataset
14 from pylab import *
15 import matplotlib
16
17 print 'getallNRSE: version 3.5'
18
19 def main(argv=None):
20     if not argv:
21         argv = sys.argv
22
23     usage = "usage: python %s genome regionfile siteOutfile [options]"
24
25     parser = getParser(usage)
26     (options, args) = parser.parse_args(argv[1:])
27
28     if len(args) < 3:
29         print usage
30         sys.exit(1)
31
32     genome = argv[0]
33     infilename = args[1]
34     outfilename = args[2]
35
36     getallNRSE(genome, infilename, outfilename, options.chipfilename,
37                options.minHeight, options.minFraction, options.plotname,
38                options.doCache, options.normalize, options.doVerbose,
39                options.doMarkov1, options.maxpeakdist, options.fullOnly,
40                options.motifDir)
41
42
43 def getParser(usage):
44
45     parser = optparse.OptionParser(usage=usage)
46     parser.add_option("--dataset", dest="chipfilename")
47     parser.add_option("--min", type="float", dest="minHeight")
48     parser.add_option("--minfraction", type="float", dest="minFraction")
49     parser.add_option("--plot", dest="plotname")
50     parser.add_option("--cache", action="store_true", dest="doCache")
51     parser.add_option("--raw", action="store_false", dest="normalize")
52     parser.add_option("--verbose", action="store_true", dest="doVerbose")
53     parser.add_option("--markov1", action="store_true", dest="doMarkov1")
54     parser.add_option("--peakdist", type="int", dest="maxpeakdist")
55     parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
56     parser.add_option("--motifdir", dest="motifDir")
57
58     configParser = getConfigParser()
59     section = "getallNRSE"
60     chipfilename = getConfigOption(configParser, section, "chipfilename", "")
61     minHeight = getConfigFloatOption(configParser, section, "minHeight", -2.)
62     minFraction = getConfigFloatOption(configParser, section, "minFraction", -2.)
63     plotname = getConfigOption(configParser, section, "plotname", "")
64     doCache = getConfigBoolOption(configParser, section, "doCache", False)
65     normalize = getConfigBoolOption(configParser, section, "normalize", True)
66     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
67     doMarkov1 = getConfigBoolOption(configParser, section, "doMarkov1", False)
68     maxpeakdist = getConfigOption(configParser, section, "maxpeakdist", None)
69     fullOnly = getConfigBoolOption(configParser, section, "fullOnly", False)
70     motifDir = getConfigOption(configParser, section, "motifDir", "./")
71
72     parser.set_defaults(chipfilename=chipfilename, minHeight=minHeight, minFraction=minFraction, plotname=plotname,
73                         doCache=doCache, normalize=normalize, doVerbose=doVerbose, doMarkov1=doMarkov1,
74                         maxpeakdist=maxpeakdist, fullOnly=fullOnly, motifDir=motifDir)
75
76     return parser
77
78
79 def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
80                minFraction=-2., plotname="", doCache=False, normalize=True,
81                doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
82                motifDir="./"):
83
84     doPlot = False
85     if plotname:
86         matplotlib.use("Agg")
87         doPlot = True
88
89     if motifDir[-1] != "/":
90         motifDir += "/"
91
92     doDataset = False
93     normalizeBy = 1
94     if chipfilename:
95         hitRDS = ReadDataset.ReadDataset(chipfilename, verbose=doVerbose, cache=doCache)
96         doDataset = True
97         if normalize:
98             normalizeBy = len(hitRDS) / 1000000.
99
100     if minFraction > 1.:
101         minFraction /= 100.
102         print "scaling minFraction to %.2f" % minFraction
103
104     if maxpeakdist is not None:
105         enforcePeakDist = True
106     else:
107         enforcePeakDist = False
108         maxpeakdist = 101
109
110     mot = Motif("", motifFile="%sNRSE3.mot" % motifDir)
111     motL = Motif("", motifFile="%sNRSE3left.mot" % motifDir)
112     motR = Motif("", motifFile="%sNRSE3right.mot" % motifDir)
113     bestScore = mot.bestConsensusScore()
114     bestLeft = motL.bestConsensusScore()
115     bestRight = motR.bestConsensusScore()
116
117     hg = Genome(genome)
118
119     regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False)
120
121     outfile = open(outfilename, "w")
122     outfile.write("#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n" % (chipfilename, infilename, normalize, doMarkov1))
123     outfile.write("#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n" % (enforcePeakDist, maxpeakdist, fullOnly))
124     outfile.write("#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n")
125
126     index = 0
127     regionList = []
128
129     for rchrom in regions:
130         if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
131             continue
132
133         for region in regions[rchrom]:
134             regionList.append((rchrom, region.start, region.length))
135
136     notFoundIndex = 0
137     currentChrom = ""
138     for (rchrom, start, length) in regionList:
139         seq = hg.sequence(rchrom, start, length)
140         if doDataset:
141             if rchrom != currentChrom:
142                 fullchrom = "chr" + rchrom
143                 hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
144                 currentChrom = rchrom
145
146             peak = findPeak(hitDict[rchrom], start, length, doWeight=True)
147             topPos = peak.topPos
148             numHits = peak.numHits
149             if len(topPos) == 0:
150                 print "topPos error"
151
152             peakpos = topPos[0]
153             peakscore = peak.smoothArray[peakpos]
154             if peakscore == 0.:
155                 peakscore = -1.
156
157             if normalize:
158                 numHits /= normalizeBy
159                 peakscore /= normalizeBy
160         else:
161             peakpos = length
162             peakscore = -1
163             numHits = 0
164             smoothArray = [0.] * length
165
166         found = []
167         if doMarkov1:
168             lefts = motL.locateMarkov1(seq, 3.)
169             rights = motR.locateMarkov1(seq, 3.)
170         else:
171             lefts = motL.locateMotif(seq, 70)
172             rights = motR.locateMotif(seq, 70)
173
174         allhalfs = [(v0, v1, "L") for (v0, v1) in lefts] + [(v0, v1, "R") for (v0, v1) in rights]
175         allhalfs.sort()
176
177         # look for canonicals and non-canonicals
178         if len(allhalfs) > 1:
179             (firstpos, firstsense, firsttype) = allhalfs[0]
180             for (secondpos, secondsense, secondtype) in allhalfs[1:]:
181                 if enforcePeakDist:
182                     withinDistance = False
183                     for aPos in topPos:
184                         if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
185                             withinDistance = True
186                     if not withinDistance:
187                         firstpos = secondpos
188                         firstsense = secondsense
189                         firsttype = secondtype
190                         continue
191
192                 if firsttype == "L":
193                     dist = secondpos - firstpos + 2
194                 else:
195                     dist = secondpos - firstpos -1
196
197                 if firstsense == secondsense and dist in [9, 10, 11, 16, 17, 18, 19]:
198                     if (firsttype == "L" and secondtype == "R" and secondsense == "F"):
199                         found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
200
201                     if (firsttype == "R" and secondtype == "L" and secondsense == "R"):
202                         found.append((start + firstpos, firstpos  - peakpos + (dist + 10)/2, dist))
203
204                 firstpos = secondpos
205                 firstsense = secondsense
206                 firsttype = secondtype
207
208         # did we miss any 70%+ matches ?
209         if doMarkov1:
210             matches = mot.locateMarkov1(seq, 3.5)
211         else:
212             matches = mot.locateMotif(seq, 70)
213
214         for (pos, sense) in matches:
215             alreadyFound = False
216             for (fpos, fpeakdist, fdist) in found:
217                 if pos + start == fpos:
218                     alreadyFound = True
219
220             if not alreadyFound:
221                 if enforcePeakDist:
222                     withinDistance = False
223                     for aPos in topPos:
224                         if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
225                             withinDistance = True
226                             thePos = aPos
227
228                     if withinDistance:
229                         found.append((start + pos, pos - thePos + 10, 11))
230
231                 else:
232                     found.append((start + pos, pos - peakpos + 10, 11))
233
234         # we'll now accept half-sites within maxpeakdist bp of peak if using a dataset, else all
235         if len(found) == 0 and not fullOnly:
236             bestone = -1
237             if not doDataset:
238                 bestdist = maxpeakdist
239             else:
240                 bestdist = length
241
242             index = 0
243             for (pos, sense, type) in allhalfs:
244                 if doDataset:
245                     for aPos in topPos:
246                         if abs(pos - aPos) < bestdist:
247                             bestdist = abs(pos - aPos)
248                             bestone = index
249                             peakpos = aPos
250                 else:
251                     found.append((start + allhalfs[index][0], allhalfs[index][0] + 5 - peakpos, 0))
252
253                 index += 1
254
255             if (doDataset and bestdist < 101):
256                 try:
257                     found.append((start + allhalfs[bestone][0], allhalfs[bestone][0] + 5 - peakpos, 0))
258                 except:
259                     continue
260
261         # see if we found an acceptable match
262         foundValue = False
263         for (foundpos, posdist, dist) in found:
264             # get a score for 21-mer, report
265             seq = hg.sequence(rchrom, foundpos, 21)
266             # height will be measured from the center of the motif
267             height = -2.
268             for pos in range(10 + dist):
269                 try:
270                     currentHeight = smoothArray[int(peakpos + posdist + pos)]
271                 except: 
272                     pass
273
274                 if currentHeight > height:
275                     height = currentHeight
276
277             if normalize:
278                 height /= normalizeBy
279
280             fractionHeight = height / peakscore
281             if height < minHeight or fractionHeight < minFraction:
282                 continue
283
284             foundValue = True
285             (front, back) = mot.scoreMotif(seq)
286             sense = "+"
287             if front > back:
288                 score = int(100 * front / bestScore)
289                 theseq = hg.sequence(rchrom, foundpos, 10 + dist)
290             else:
291                 score = int(100 * back / bestScore)
292                 theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
293                 sense = "-"
294                 foundpos + 1
295
296             leftScore = -1.
297             rightScore = -1.
298             leftseq = ""
299             rightseq = ""
300             if dist > 0:
301                 testseq = hg.sequence(rchrom, foundpos, 10 + dist)
302                 if sense == "-":
303                     testseq = complement(testseq)
304
305                 leftseq = testseq[:9]
306                 rightseq = testseq[dist-2:]
307             elif dist == 0:
308                 testseq = hg.sequence(rchrom, foundpos, 12)
309                 if sense == "-":
310                     testseq = complement(testseq)
311                     leftseq = testseq[3:]
312                 else:
313                     leftseq = testseq[:9]
314
315                 rightseq = testseq
316
317             (lfront, lback) = motL.scoreMotif(leftseq)
318             (rfront, rback) = motR.scoreMotif(rightseq)
319             if lfront > lback:
320                 leftScore = int(100 * lfront) / bestLeft
321                 leftSense = "+"
322             else:
323                 leftScore = int(100 * lback) / bestLeft
324                 leftSense = "-"
325
326             if rfront > rback:
327                 rightScore = int(100 * rfront) / bestRight
328                 rightSense = "+"
329             else:
330                 rightScore = int(100 * rback) / bestRight
331                 rightSense = "-"
332
333             if dist != 11:
334                 if rightScore > leftScore:
335                     sense = rightSense
336                 else:
337                     sense = leftSense
338
339                 if sense == "-" and dist > 0:
340                     theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
341
342             outline = "chr%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%.2f\tchr%s:%d-%d\t%s\t%s" % (rchrom, foundpos, foundpos + 9 + dist, score, leftScore, rightScore, numHits, posdist, dist, height, fractionHeight, rchrom, start, start + length, sense, theseq)
343             if doVerbose:
344                 print outline
345
346             outfile.write(outline + "/n")
347
348         # we didn't find a site - draw region
349         if not foundValue and doVerbose:
350             outline = "#no predictions for %s:%d-%d %d %.2f" % (rchrom, start, start + length, numHits, peakscore)
351             print outline
352             outfile.write(outline + "\n")
353
354         if not foundValue and doPlot:
355             drawarray = [val + notFoundIndex for val in smoothArray]
356             drawpos = [drawarray[val] for val in topPos]
357             plot(drawarray, "b")
358             plot(topPos, drawpos, "r.")
359             goodmatches = mot.locateMotif(seq, 75)
360             if len(goodmatches) > 0:
361                 print topPos
362                 print goodmatches
363                 drawgood = []
364                 drawgoody = []
365                 for (mstart, sense) in goodmatches:
366                     drawgood.append(mstart)
367                     drawgoody.append(drawarray[mstart])
368
369                 plot(drawgood, drawgoody, "g.")
370
371             notFoundIndex -= 30
372
373     outfile.close()
374     if doPlot:
375         savefig(plotname)
376
377
378 if __name__ == "__main__":
379     main(sys.argv)