7 print 'psyco not running'
9 from cistematic.core import complement
10 from cistematic.core.motif import Motif
11 from cistematic.genomes import Genome
12 from commoncode import readDataset, getMergedRegions, findPeak
16 print '%s: version 3.4' % sys.argv[0]
22 usage = "usage: python %s genome regionfile siteOutfile [options]"
24 parser = optparse.OptionParser(usage=usage)
25 parser.add_option("--dataset", dest="chipfilename")
26 parser.add_option("--min", type="float", dest="minHeight")
27 parser.add_option("--minfraction", type="float", dest="minFraction")
28 parser.add_option("--plot", dest="plotname")
29 parser.add_option("--cache", action="store_true", dest="doCache")
30 parser.add_option("--raw", action="store_false", dest="normalize")
31 parser.add_option("--verbose", action="store_true", dest="doVerbose")
32 parser.add_option("--markov1", action="store_true", dest="doMarkov1")
33 parser.add_option("--peakdist", type="int", dest="maxpeakdist")
34 parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
35 parser.add_option("--motifdir", dest="motifDir")
36 parser.set_defaults(chipfilename="", minHeight=-2., minFraction=-2., plotname="",
37 doCache=False, normalize=True, doVerbose=False, doMarkov1=False,
38 maxpeakdist=None, fullOnly=False, motifDir="./")
39 (options, args) = parser.parse_args(argv[1:])
49 getallNRSE(genome, infilename, outfilename, options.chipfilename,
50 options.minHeight, options.minFraction, options.plotname,
51 options.doCache, options.normalize, options.doVerbose,
52 options.doMarkov1, options.maxpeakdist, options.fullOnly,
56 def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
57 minFraction=-2., plotname="", doCache=False, normalize=True,
58 doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
66 if motifDir[-1] != "/":
72 hitRDS = readDataset(chipfilename, verbose=doVerbose, cache=doCache)
75 normalizeBy = len(hitRDS) / 1000000.
79 print "scaling minFraction to %.2f" % minFraction
81 if maxpeakdist is not None:
82 enforcePeakDist = True
84 enforcePeakDist = False
87 mot = Motif("", motifFile="%sNRSE3.mot" % motifDir)
88 motL = Motif("", motifFile="%sNRSE3left.mot" % motifDir)
89 motR = Motif("", motifFile="%sNRSE3right.mot" % motifDir)
90 bestScore = mot.bestConsensusScore()
91 bestLeft = motL.bestConsensusScore()
92 bestRight = motR.bestConsensusScore()
96 regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False)
98 outfile = open(outfilename, "w")
99 outfile.write("#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n" % (chipfilename, infilename, normalize, doMarkov1))
100 outfile.write("#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n" % (enforcePeakDist, maxpeakdist, fullOnly))
101 outfile.write("#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n")
106 for rchrom in regions:
107 if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
110 for (start, stop, length) in regions[rchrom]:
111 regionList.append((rchrom, start, length))
115 for (rchrom, start, length) in regionList:
116 seq = hg.sequence(rchrom, start, length)
118 if rchrom != currentChrom:
119 fullchrom = "chr" + rchrom
120 hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
121 currentChrom = rchrom
123 (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True)
128 peakscore = smoothArray[peakpos]
133 numHits /= normalizeBy
134 peakscore /= normalizeBy
139 smoothArray = [0.] * length
143 lefts = motL.locateMarkov1(seq, 3.)
144 rights = motR.locateMarkov1(seq, 3.)
146 lefts = motL.locateMotif(seq, 70)
147 rights = motR.locateMotif(seq, 70)
149 allhalfs = [(v0, v1, "L") for (v0, v1) in lefts] + [(v0, v1, "R") for (v0, v1) in rights]
152 # look for canonicals and non-canonicals
153 if len(allhalfs) > 1:
154 (firstpos, firstsense, firsttype) = allhalfs[0]
155 for (secondpos, secondsense, secondtype) in allhalfs[1:]:
157 withinDistance = False
159 if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
160 withinDistance = True
161 if not withinDistance:
163 firstsense = secondsense
164 firsttype = secondtype
168 dist = secondpos - firstpos + 2
170 dist = secondpos - firstpos -1
172 if firstsense == secondsense and dist in [9, 10, 11, 16, 17, 18, 19]:
173 if (firsttype == "L" and secondtype == "R" and secondsense == "F"):
174 found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
176 if (firsttype == "R" and secondtype == "L" and secondsense == "R"):
177 found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
180 firstsense = secondsense
181 firsttype = secondtype
183 # did we miss any 70%+ matches ?
185 matches = mot.locateMarkov1(seq, 3.5)
187 matches = mot.locateMotif(seq, 70)
189 for (pos, sense) in matches:
191 for (fpos, fpeakdist, fdist) in found:
192 if pos + start == fpos:
197 withinDistance = False
199 if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
200 withinDistance = True
204 found.append((start + pos, pos - thePos + 10, 11))
207 found.append((start + pos, pos - peakpos + 10, 11))
209 # we'll now accept half-sites within maxpeakdist bp of peak if using a dataset, else all
210 if len(found) == 0 and not fullOnly:
213 bestdist = maxpeakdist
218 for (pos, sense, type) in allhalfs:
221 if abs(pos - aPos) < bestdist:
222 bestdist = abs(pos - aPos)
226 found.append((start + allhalfs[index][0], allhalfs[index][0] + 5 - peakpos, 0))
230 if (doDataset and bestdist < 101):
232 found.append((start + allhalfs[bestone][0], allhalfs[bestone][0] + 5 - peakpos, 0))
236 # see if we found an acceptable match
238 for (foundpos, posdist, dist) in found:
239 # get a score for 21-mer, report
240 seq = hg.sequence(rchrom, foundpos, 21)
241 # height will be measured from the center of the motif
243 for pos in range(10 + dist):
245 currentHeight = smoothArray[int(peakpos + posdist + pos)]
249 if currentHeight > height:
250 height = currentHeight
253 height /= normalizeBy
255 fractionHeight = height / peakscore
256 if height < minHeight or fractionHeight < minFraction:
260 (front, back) = mot.scoreMotif(seq)
263 score = int(100 * front / bestScore)
264 theseq = hg.sequence(rchrom, foundpos, 10 + dist)
266 score = int(100 * back / bestScore)
267 theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
276 testseq = hg.sequence(rchrom, foundpos, 10 + dist)
278 testseq = complement(testseq)
280 leftseq = testseq[:9]
281 rightseq = testseq[dist-2:]
283 testseq = hg.sequence(rchrom, foundpos, 12)
285 testseq = complement(testseq)
286 leftseq = testseq[3:]
288 leftseq = testseq[:9]
292 (lfront, lback) = motL.scoreMotif(leftseq)
293 (rfront, rback) = motR.scoreMotif(rightseq)
295 leftScore = int(100 * lfront) / bestLeft
298 leftScore = int(100 * lback) / bestLeft
302 rightScore = int(100 * rfront) / bestRight
305 rightScore = int(100 * rback) / bestRight
309 if rightScore > leftScore:
314 if sense == "-" and dist > 0:
315 theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
317 outline = "chr%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%.2f\tchr%s:%d-%d\t%s\t%s" % (rchrom, foundpos, foundpos + 9 + dist, score, leftScore, rightScore, numHits, posdist, dist, height, fractionHeight, rchrom, start, start + length, sense, theseq)
321 outfile.write(outline + "/n")
323 # we didn't find a site - draw region
324 if not foundValue and doVerbose:
325 outline = "#no predictions for %s:%d-%d %d %.2f" % (rchrom, start, start + length, numHits, peakscore)
327 outfile.write(outline + "\n")
329 if not foundValue and doPlot:
330 drawarray = [val + notFoundIndex for val in smoothArray]
331 drawpos = [drawarray[val] for val in topPos]
333 plot(topPos, drawpos, "r.")
334 goodmatches = mot.locateMotif(seq, 75)
335 if len(goodmatches) > 0:
340 for (mstart, sense) in goodmatches:
341 drawgood.append(mstart)
342 drawgoody.append(drawarray[mstart])
344 plot(drawgood, drawgoody, "g.")
353 if __name__ == "__main__":