7 print 'psyco not running'
9 from cistematic.core import complement
10 from cistematic.core.motif import Motif
11 from cistematic.genomes import Genome
12 from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption, getConfigFloatOption
17 print 'getallNRSE: version 3.5'
23 usage = "usage: python %s genome regionfile siteOutfile [options]"
25 parser = getParser(usage)
26 (options, args) = parser.parse_args(argv[1:])
36 getallNRSE(genome, infilename, outfilename, options.chipfilename,
37 options.minHeight, options.minFraction, options.plotname,
38 options.doCache, options.normalize, options.doVerbose,
39 options.doMarkov1, options.maxpeakdist, options.fullOnly,
45 parser = optparse.OptionParser(usage=usage)
46 parser.add_option("--dataset", dest="chipfilename")
47 parser.add_option("--min", type="float", dest="minHeight")
48 parser.add_option("--minfraction", type="float", dest="minFraction")
49 parser.add_option("--plot", dest="plotname")
50 parser.add_option("--cache", action="store_true", dest="doCache")
51 parser.add_option("--raw", action="store_false", dest="normalize")
52 parser.add_option("--verbose", action="store_true", dest="doVerbose")
53 parser.add_option("--markov1", action="store_true", dest="doMarkov1")
54 parser.add_option("--peakdist", type="int", dest="maxpeakdist")
55 parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
56 parser.add_option("--motifdir", dest="motifDir")
58 configParser = getConfigParser()
59 section = "getallNRSE"
60 chipfilename = getConfigOption(configParser, section, "chipfilename", "")
61 minHeight = getConfigFloatOption(configParser, section, "minHeight", -2.)
62 minFraction = getConfigFloatOption(configParser, section, "minFraction", -2.)
63 plotname = getConfigOption(configParser, section, "plotname", "")
64 doCache = getConfigBoolOption(configParser, section, "doCache", False)
65 normalize = getConfigBoolOption(configParser, section, "normalize", True)
66 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
67 doMarkov1 = getConfigBoolOption(configParser, section, "doMarkov1", False)
68 maxpeakdist = getConfigOption(configParser, section, "maxpeakdist", None)
69 fullOnly = getConfigBoolOption(configParser, section, "fullOnly", False)
70 motifDir = getConfigOption(configParser, section, "motifDir", "./")
72 parser.set_defaults(chipfilename=chipfilename, minHeight=minHeight, minFraction=minFraction, plotname=plotname,
73 doCache=doCache, normalize=normalize, doVerbose=doVerbose, doMarkov1=doMarkov1,
74 maxpeakdist=maxpeakdist, fullOnly=fullOnly, motifDir=motifDir)
79 def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
80 minFraction=-2., plotname="", doCache=False, normalize=True,
81 doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
89 if motifDir[-1] != "/":
95 hitRDS = ReadDataset.ReadDataset(chipfilename, verbose=doVerbose, cache=doCache)
98 normalizeBy = len(hitRDS) / 1000000.
102 print "scaling minFraction to %.2f" % minFraction
104 if maxpeakdist is not None:
105 enforcePeakDist = True
107 enforcePeakDist = False
110 mot = Motif("", motifFile="%sNRSE3.mot" % motifDir)
111 motL = Motif("", motifFile="%sNRSE3left.mot" % motifDir)
112 motR = Motif("", motifFile="%sNRSE3right.mot" % motifDir)
113 bestScore = mot.bestConsensusScore()
114 bestLeft = motL.bestConsensusScore()
115 bestRight = motR.bestConsensusScore()
119 regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False)
121 outfile = open(outfilename, "w")
122 outfile.write("#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n" % (chipfilename, infilename, normalize, doMarkov1))
123 outfile.write("#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n" % (enforcePeakDist, maxpeakdist, fullOnly))
124 outfile.write("#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n")
129 for rchrom in regions:
130 if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
133 for region in regions[rchrom]:
134 regionList.append((rchrom, region.start, region.length))
138 for (rchrom, start, length) in regionList:
139 seq = hg.sequence(rchrom, start, length)
141 if rchrom != currentChrom:
142 fullchrom = "chr" + rchrom
143 hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
144 currentChrom = rchrom
146 peak = findPeak(hitDict[rchrom], start, length, doWeight=True)
148 numHits = peak.numHits
153 peakscore = peak.smoothArray[peakpos]
158 numHits /= normalizeBy
159 peakscore /= normalizeBy
164 smoothArray = [0.] * length
168 lefts = motL.locateMarkov1(seq, 3.)
169 rights = motR.locateMarkov1(seq, 3.)
171 lefts = motL.locateMotif(seq, 70)
172 rights = motR.locateMotif(seq, 70)
174 allhalfs = [(v0, v1, "L") for (v0, v1) in lefts] + [(v0, v1, "R") for (v0, v1) in rights]
177 # look for canonicals and non-canonicals
178 if len(allhalfs) > 1:
179 (firstpos, firstsense, firsttype) = allhalfs[0]
180 for (secondpos, secondsense, secondtype) in allhalfs[1:]:
182 withinDistance = False
184 if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
185 withinDistance = True
186 if not withinDistance:
188 firstsense = secondsense
189 firsttype = secondtype
193 dist = secondpos - firstpos + 2
195 dist = secondpos - firstpos -1
197 if firstsense == secondsense and dist in [9, 10, 11, 16, 17, 18, 19]:
198 if (firsttype == "L" and secondtype == "R" and secondsense == "F"):
199 found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
201 if (firsttype == "R" and secondtype == "L" and secondsense == "R"):
202 found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
205 firstsense = secondsense
206 firsttype = secondtype
208 # did we miss any 70%+ matches ?
210 matches = mot.locateMarkov1(seq, 3.5)
212 matches = mot.locateMotif(seq, 70)
214 for (pos, sense) in matches:
216 for (fpos, fpeakdist, fdist) in found:
217 if pos + start == fpos:
222 withinDistance = False
224 if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
225 withinDistance = True
229 found.append((start + pos, pos - thePos + 10, 11))
232 found.append((start + pos, pos - peakpos + 10, 11))
234 # we'll now accept half-sites within maxpeakdist bp of peak if using a dataset, else all
235 if len(found) == 0 and not fullOnly:
238 bestdist = maxpeakdist
243 for (pos, sense, type) in allhalfs:
246 if abs(pos - aPos) < bestdist:
247 bestdist = abs(pos - aPos)
251 found.append((start + allhalfs[index][0], allhalfs[index][0] + 5 - peakpos, 0))
255 if (doDataset and bestdist < 101):
257 found.append((start + allhalfs[bestone][0], allhalfs[bestone][0] + 5 - peakpos, 0))
261 # see if we found an acceptable match
263 for (foundpos, posdist, dist) in found:
264 # get a score for 21-mer, report
265 seq = hg.sequence(rchrom, foundpos, 21)
266 # height will be measured from the center of the motif
268 for pos in range(10 + dist):
270 currentHeight = smoothArray[int(peakpos + posdist + pos)]
274 if currentHeight > height:
275 height = currentHeight
278 height /= normalizeBy
280 fractionHeight = height / peakscore
281 if height < minHeight or fractionHeight < minFraction:
285 (front, back) = mot.scoreMotif(seq)
288 score = int(100 * front / bestScore)
289 theseq = hg.sequence(rchrom, foundpos, 10 + dist)
291 score = int(100 * back / bestScore)
292 theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
301 testseq = hg.sequence(rchrom, foundpos, 10 + dist)
303 testseq = complement(testseq)
305 leftseq = testseq[:9]
306 rightseq = testseq[dist-2:]
308 testseq = hg.sequence(rchrom, foundpos, 12)
310 testseq = complement(testseq)
311 leftseq = testseq[3:]
313 leftseq = testseq[:9]
317 (lfront, lback) = motL.scoreMotif(leftseq)
318 (rfront, rback) = motR.scoreMotif(rightseq)
320 leftScore = int(100 * lfront) / bestLeft
323 leftScore = int(100 * lback) / bestLeft
327 rightScore = int(100 * rfront) / bestRight
330 rightScore = int(100 * rback) / bestRight
334 if rightScore > leftScore:
339 if sense == "-" and dist > 0:
340 theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
342 outline = "chr%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%.2f\tchr%s:%d-%d\t%s\t%s" % (rchrom, foundpos, foundpos + 9 + dist, score, leftScore, rightScore, numHits, posdist, dist, height, fractionHeight, rchrom, start, start + length, sense, theseq)
346 outfile.write(outline + "/n")
348 # we didn't find a site - draw region
349 if not foundValue and doVerbose:
350 outline = "#no predictions for %s:%d-%d %d %.2f" % (rchrom, start, start + length, numHits, peakscore)
352 outfile.write(outline + "\n")
354 if not foundValue and doPlot:
355 drawarray = [val + notFoundIndex for val in smoothArray]
356 drawpos = [drawarray[val] for val in topPos]
358 plot(topPos, drawpos, "r.")
359 goodmatches = mot.locateMotif(seq, 75)
360 if len(goodmatches) > 0:
365 for (mstart, sense) in goodmatches:
366 drawgood.append(mstart)
367 drawgoody.append(drawarray[mstart])
369 plot(drawgood, drawgoody, "g.")
378 if __name__ == "__main__":