snapshot of 4.0a development. initial git repo commit
[erange.git] / getallNRSE.py
1 import sys, optparse
2
3 try:
4     import psyco
5     psyco.full()
6 except:
7     print 'psyco not running'
8
9 from cistematic.core import complement
10 from cistematic.core.motif import Motif
11 from cistematic.genomes import Genome
12 from commoncode import readDataset, getMergedRegions, findPeak
13 from pylab import *
14 import matplotlib
15
16 print '%s: version 3.4' % sys.argv[0]
17
18 def main(argv=None):
19     if not argv:
20         argv = sys.argv
21
22     usage = "usage: python %s genome regionfile siteOutfile [options]"
23
24     parser = optparse.OptionParser(usage=usage)
25     parser.add_option("--dataset", dest="chipfilename")
26     parser.add_option("--min", type="float", dest="minHeight")
27     parser.add_option("--minfraction", type="float", dest="minFraction")
28     parser.add_option("--plot", dest="plotname")
29     parser.add_option("--cache", action="store_true", dest="doCache")
30     parser.add_option("--raw", action="store_false", dest="normalize")
31     parser.add_option("--verbose", action="store_true", dest="doVerbose")
32     parser.add_option("--markov1", action="store_true", dest="doMarkov1")
33     parser.add_option("--peakdist", type="int", dest="maxpeakdist")
34     parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
35     parser.add_option("--motifdir", dest="motifDir")
36     parser.set_defaults(chipfilename="", minHeight=-2., minFraction=-2., plotname="",
37                         doCache=False, normalize=True, doVerbose=False, doMarkov1=False,
38                         maxpeakdist=None, fullOnly=False, motifDir="./")
39     (options, args) = parser.parse_args(argv[1:])
40
41     if len(args) < 3:
42         print usage
43         sys.exit(1)
44
45     genome = argv[0]
46     infilename = args[1]
47     outfilename = args[2]
48
49     getallNRSE(genome, infilename, outfilename, options.chipfilename,
50                options.minHeight, options.minFraction, options.plotname,
51                options.doCache, options.normalize, options.doVerbose,
52                options.doMarkov1, options.maxpeakdist, options.fullOnly,
53                options.motifDir)
54
55
56 def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
57                minFraction=-2., plotname="", doCache=False, normalize=True,
58                doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
59                motifDir="./"):
60
61     doPlot = False
62     if plotname:
63         matplotlib.use("Agg")
64         doPlot = True
65
66     if motifDir[-1] != "/":
67         motifDir += "/"
68
69     doDataset = False
70     normalizeBy = 1
71     if chipfilename:
72         hitRDS = readDataset(chipfilename, verbose=doVerbose, cache=doCache)
73         doDataset = True
74         if normalize:
75             normalizeBy = len(hitRDS) / 1000000.
76
77     if minFraction > 1.:
78         minFraction /= 100.
79         print "scaling minFraction to %.2f" % minFraction
80
81     if maxpeakdist is not None:
82         enforcePeakDist = True
83     else:
84         enforcePeakDist = False
85         maxpeakdist = 101
86
87     mot = Motif("", motifFile="%sNRSE3.mot" % motifDir)
88     motL = Motif("", motifFile="%sNRSE3left.mot" % motifDir)
89     motR = Motif("", motifFile="%sNRSE3right.mot" % motifDir)
90     bestScore = mot.bestConsensusScore()
91     bestLeft = motL.bestConsensusScore()
92     bestRight = motR.bestConsensusScore()
93
94     hg = Genome(genome)
95
96     regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False)
97
98     outfile = open(outfilename, "w")
99     outfile.write("#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n" % (chipfilename, infilename, normalize, doMarkov1))
100     outfile.write("#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n" % (enforcePeakDist, maxpeakdist, fullOnly))
101     outfile.write("#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n")
102
103     index = 0
104     regionList = []
105
106     for rchrom in regions:
107         if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
108             continue
109
110         for (start, stop, length) in regions[rchrom]:
111             regionList.append((rchrom, start, length))
112
113     notFoundIndex = 0
114     currentChrom = ""
115     for (rchrom, start, length) in regionList:
116         seq = hg.sequence(rchrom, start, length)
117         if doDataset:
118             if rchrom != currentChrom:
119                 fullchrom = "chr" + rchrom
120                 hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
121                 currentChrom = rchrom
122
123             (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True)
124             if len(topPos) == 0:
125                 print "topPos error"
126
127             peakpos = topPos[0]
128             peakscore = smoothArray[peakpos]
129             if peakscore == 0.:
130                 peakscore = -1.
131
132             if normalize:
133                 numHits /= normalizeBy
134                 peakscore /= normalizeBy
135         else:
136             peakpos = length
137             peakscore = -1
138             numHits = 0
139             smoothArray = [0.] * length
140
141         found = []
142         if doMarkov1:
143             lefts = motL.locateMarkov1(seq, 3.)
144             rights = motR.locateMarkov1(seq, 3.)
145         else:
146             lefts = motL.locateMotif(seq, 70)
147             rights = motR.locateMotif(seq, 70)
148
149         allhalfs = [(v0, v1, "L") for (v0, v1) in lefts] + [(v0, v1, "R") for (v0, v1) in rights]
150         allhalfs.sort()
151
152         # look for canonicals and non-canonicals
153         if len(allhalfs) > 1:
154             (firstpos, firstsense, firsttype) = allhalfs[0]
155             for (secondpos, secondsense, secondtype) in allhalfs[1:]:
156                 if enforcePeakDist:
157                     withinDistance = False
158                     for aPos in topPos:
159                         if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
160                             withinDistance = True
161                     if not withinDistance:
162                         firstpos = secondpos
163                         firstsense = secondsense
164                         firsttype = secondtype
165                         continue
166
167                 if firsttype == "L":
168                     dist = secondpos - firstpos + 2
169                 else:
170                     dist = secondpos - firstpos -1
171
172                 if firstsense == secondsense and dist in [9, 10, 11, 16, 17, 18, 19]:
173                     if (firsttype == "L" and secondtype == "R" and secondsense == "F"):
174                         found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
175
176                     if (firsttype == "R" and secondtype == "L" and secondsense == "R"):
177                         found.append((start + firstpos, firstpos  - peakpos + (dist + 10)/2, dist))
178
179                 firstpos = secondpos
180                 firstsense = secondsense
181                 firsttype = secondtype
182
183         # did we miss any 70%+ matches ?
184         if doMarkov1:
185             matches = mot.locateMarkov1(seq, 3.5)
186         else:
187             matches = mot.locateMotif(seq, 70)
188
189         for (pos, sense) in matches:
190             alreadyFound = False
191             for (fpos, fpeakdist, fdist) in found:
192                 if pos + start == fpos:
193                     alreadyFound = True
194
195             if not alreadyFound:
196                 if enforcePeakDist:
197                     withinDistance = False
198                     for aPos in topPos:
199                         if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
200                             withinDistance = True
201                             thePos = aPos
202
203                     if withinDistance:
204                         found.append((start + pos, pos - thePos + 10, 11))
205
206                 else:
207                     found.append((start + pos, pos - peakpos + 10, 11))
208
209         # we'll now accept half-sites within maxpeakdist bp of peak if using a dataset, else all
210         if len(found) == 0 and not fullOnly:
211             bestone = -1
212             if not doDataset:
213                 bestdist = maxpeakdist
214             else:
215                 bestdist = length
216
217             index = 0
218             for (pos, sense, type) in allhalfs:
219                 if doDataset:
220                     for aPos in topPos:
221                         if abs(pos - aPos) < bestdist:
222                             bestdist = abs(pos - aPos)
223                             bestone = index
224                             peakpos = aPos
225                 else:
226                     found.append((start + allhalfs[index][0], allhalfs[index][0] + 5 - peakpos, 0))
227
228                 index += 1
229
230             if (doDataset and bestdist < 101):
231                 try:
232                     found.append((start + allhalfs[bestone][0], allhalfs[bestone][0] + 5 - peakpos, 0))
233                 except:
234                     continue
235
236         # see if we found an acceptable match
237         foundValue = False
238         for (foundpos, posdist, dist) in found:
239             # get a score for 21-mer, report
240             seq = hg.sequence(rchrom, foundpos, 21)
241             # height will be measured from the center of the motif
242             height = -2.
243             for pos in range(10 + dist):
244                 try:
245                     currentHeight = smoothArray[int(peakpos + posdist + pos)]
246                 except: 
247                     pass
248
249                 if currentHeight > height:
250                     height = currentHeight
251
252             if normalize:
253                 height /= normalizeBy
254
255             fractionHeight = height / peakscore
256             if height < minHeight or fractionHeight < minFraction:
257                 continue
258
259             foundValue = True
260             (front, back) = mot.scoreMotif(seq)
261             sense = "+"
262             if front > back:
263                 score = int(100 * front / bestScore)
264                 theseq = hg.sequence(rchrom, foundpos, 10 + dist)
265             else:
266                 score = int(100 * back / bestScore)
267                 theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
268                 sense = "-"
269                 foundpos + 1
270
271             leftScore = -1.
272             rightScore = -1.
273             leftseq = ""
274             rightseq = ""
275             if dist > 0:
276                 testseq = hg.sequence(rchrom, foundpos, 10 + dist)
277                 if sense == "-":
278                     testseq = complement(testseq)
279
280                 leftseq = testseq[:9]
281                 rightseq = testseq[dist-2:]
282             elif dist == 0:
283                 testseq = hg.sequence(rchrom, foundpos, 12)
284                 if sense == "-":
285                     testseq = complement(testseq)
286                     leftseq = testseq[3:]
287                 else:
288                     leftseq = testseq[:9]
289
290                 rightseq = testseq
291
292             (lfront, lback) = motL.scoreMotif(leftseq)
293             (rfront, rback) = motR.scoreMotif(rightseq)
294             if lfront > lback:
295                 leftScore = int(100 * lfront) / bestLeft
296                 leftSense = "+"
297             else:
298                 leftScore = int(100 * lback) / bestLeft
299                 leftSense = "-"
300
301             if rfront > rback:
302                 rightScore = int(100 * rfront) / bestRight
303                 rightSense = "+"
304             else:
305                 rightScore = int(100 * rback) / bestRight
306                 rightSense = "-"
307
308             if dist != 11:
309                 if rightScore > leftScore:
310                     sense = rightSense
311                 else:
312                     sense = leftSense
313
314                 if sense == "-" and dist > 0:
315                     theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
316
317             outline = "chr%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%.2f\tchr%s:%d-%d\t%s\t%s" % (rchrom, foundpos, foundpos + 9 + dist, score, leftScore, rightScore, numHits, posdist, dist, height, fractionHeight, rchrom, start, start + length, sense, theseq)
318             if doVerbose:
319                 print outline
320
321             outfile.write(outline + "/n")
322
323         # we didn't find a site - draw region
324         if not foundValue and doVerbose:
325             outline = "#no predictions for %s:%d-%d %d %.2f" % (rchrom, start, start + length, numHits, peakscore)
326             print outline
327             outfile.write(outline + "\n")
328
329         if not foundValue and doPlot:
330             drawarray = [val + notFoundIndex for val in smoothArray]
331             drawpos = [drawarray[val] for val in topPos]
332             plot(drawarray, "b")
333             plot(topPos, drawpos, "r.")
334             goodmatches = mot.locateMotif(seq, 75)
335             if len(goodmatches) > 0:
336                 print topPos
337                 print goodmatches
338                 drawgood = []
339                 drawgoody = []
340                 for (mstart, sense) in goodmatches:
341                     drawgood.append(mstart)
342                     drawgoody.append(drawarray[mstart])
343
344                 plot(drawgood, drawgoody, "g.")
345
346             notFoundIndex -= 30
347
348     outfile.close()
349     if doPlot:
350         savefig(plotname)
351
352
353 if __name__ == "__main__":
354     main(sys.argv)