6 print 'psyco not running'
8 from cistematic.core.motif import Motif, hasMotifExtension
9 from cistematic.core import complement
10 from cistematic.genomes import Genome
11 from commoncode import readDataset, getMergedRegions, findPeak
13 print "%prog: version 2.4"
20 usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
22 parser = optparse.OptionParser(usage=usage)
23 parser.add_option("--dataset", dest="chipfilename")
24 parser.add_option("--cache", action="store_true", dest="doCache")
25 parser.add_option("--best", action="store_true", dest="bestOnly",
26 help="only report the best position for each region")
27 parser.add_option("--usepeak", action="store_true", dest="usePeak",
28 help="use peak position and height from regions file")
29 parser.add_option("--printseq", action="store_true", dest="printSeq")
30 parser.add_option("--nomerge", action="store_true", dest="noMerge")
31 parser.add_option("--markov1", action="store_true", dest="doMarkov1")
32 parser.add_option("--rank", type="int", dest="useRank",
33 help="return region ranking based on peak height ranking [requires --usepeak]")
34 parser.set_defaults(chipfilename="", doCache=False, bestOnly=False, usePeak=False,
35 printSeq=False, doMarkov1=False, useRank=False, noMerge=False)
37 (options, args) = parser.parse_args(argv[1:])
45 motThreshold = float(args[2])
49 getallsites(genome, motfilename, motThreshold, infilename, outfilename, options.chipfilename,
50 options.doCache, options.bestOnly, options.usePeak, options.printSeq, options.doMarkov1,
51 options.useRank, options.noMerge)
54 def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
55 doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
56 useRank=False, noMerge=False):
58 if motThreshold < 1.0 and doMarkov1:
59 print "motThreshold should be between 1.0 and 10.0 for markov1"
61 elif motThreshold < 55.0 and not doMarkov1:
62 print "motThreshold should be between 55 and 99 for a regular PSFM"
66 print "will use cistematic.core.motif C-extension to speed up motif search"
68 if useRank and usePeak:
69 print "will return region ranking based on peak height ranking"
72 print "ignoring '-rank': can only use ranking when using a region file with peak position and height"
75 mot = Motif("", motifFile=motfilename)
77 bestScore = mot.bestConsensusScore()
81 # minHits=-1 will force regions to be used regardless
82 # maxDist= 0 prevents merging of non-overlapping regions
84 regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak)
86 regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, keepPeak=usePeak)
93 hitRDS = readDataset(chipfilename, verbose = True, cache=doCache)
95 outfile = open(outfilename, "w")
100 if "rand" in chrom or "M" in chrom:
104 for (start, stop, length, peakPos, peakHeight) in regions[chrom]:
105 regionList.append((peakHeight, chrom, start, length, peakPos))
107 for (start, stop, length) in regions[chrom]:
108 regionList.append((chrom, start, length))
117 for tuple in regionList:
119 (rpeakheight, rchrom, start, length, rpeakpos) = tuple
121 (rchrom, start, length) = tuple
124 seq = hg.sequence(rchrom, start, length)
126 print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length)
136 numHits = rpeakheight
138 if rchrom != currentChrom:
139 fullchrom = "chr" + rchrom
140 hitDict = hitRDS.getReadsDict(chrom=fullchrom)
141 currentChrom = rchrom
143 (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length)
151 matches = mot.locateMarkov1(seq, motThreshold)
153 matches = mot.locateMotif(seq, motThreshold)
155 for (pos, sense) in matches:
157 for (fpos, fdist) in found:
158 if pos + start == fpos:
163 found.append((start + pos, start + pos + motLen/2 - peakpos))
165 found.append((start + pos, pos + motLen/2 - peakpos))
167 found.append((start + pos, -1))
171 for (foundpos, peakdist) in found:
172 seq = hg.sequence(rchrom, foundpos, motLen)
174 (front, back) = mot.scoreMotif(seq)
177 score = int(100 * front / bestScore)
179 score = int(100 * back / bestScore)
181 seq = complement(seq)
186 outline = "chr%s:%d-%d\t%d\t%d\t%d\tchr%s:%d-%d\t%s\n" % (rchrom, foundpos, foundpos + motLen - 1, score, numHits, peakdist, rchrom, start, start + length, sense)
188 bestList.append((abs(peakdist), outline))
190 outfile.write(outline)
192 if bestOnly and foundValue:
194 outfile.write(bestList[0][1])
198 print "could not find a %s site for %s:%d-%d" % (mot.tagID, rchrom, start, start+ length)
201 if (count % 10000) == 0 and not printSeq:
205 print "did not find motif in %d regions" % notFoundIndex
208 if __name__ == "__main__":