6 print 'psyco not running'
8 from cistematic.core.motif import Motif, hasMotifExtension
9 from cistematic.core import complement
10 from cistematic.genomes import Genome
11 from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
14 print "getallsites: version 2.5"
21 usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
23 parser = getParser(usage)
24 (options, args) = parser.parse_args(argv[1:])
32 motThreshold = float(args[2])
36 getallsites(genome, motfilename, motThreshold, infilename, outfilename, options.chipfilename,
37 options.doCache, options.bestOnly, options.usePeak, options.printSeq, options.doMarkov1,
38 options.useRank, options.noMerge)
42 parser = optparse.OptionParser(usage=usage)
43 parser.add_option("--dataset", dest="chipfilename")
44 parser.add_option("--cache", action="store_true", dest="doCache")
45 parser.add_option("--best", action="store_true", dest="bestOnly",
46 help="only report the best position for each region")
47 parser.add_option("--usepeak", action="store_true", dest="usePeak",
48 help="use peak position and height from regions file")
49 parser.add_option("--printseq", action="store_true", dest="printSeq")
50 parser.add_option("--nomerge", action="store_true", dest="noMerge")
51 parser.add_option("--markov1", action="store_true", dest="doMarkov1")
52 parser.add_option("--rank", type="int", dest="useRank",
53 help="return region ranking based on peak height ranking [requires --usepeak]")
55 configParser = getConfigParser()
56 section = "getallsites"
57 chipfilename = getConfigOption(configParser, section, "chipfilename", "")
58 doCache = getConfigBoolOption(configParser, section, "doCache", False)
59 bestOnly = getConfigBoolOption(configParser, section, "bestOnly", False)
60 usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
61 printSeq = getConfigBoolOption(configParser, section, "printSeq", False)
62 doMarkov1 = getConfigBoolOption(configParser, section, "doMarkov1", False)
63 useRank = getConfigBoolOption(configParser, section, "useRank", False)
64 noMerge = getConfigBoolOption(configParser, section, "noMerge", False)
66 parser.set_defaults(chipfilename=chipfilename, doCache=doCache, bestOnly=bestOnly, usePeak=usePeak,
67 printSeq=printSeq, doMarkov1=doMarkov1, useRank=useRank, noMerge=noMerge)
72 def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
73 doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
74 useRank=False, noMerge=False):
76 if motThreshold < 1.0 and doMarkov1:
77 print "motThreshold should be between 1.0 and 10.0 for markov1"
79 elif motThreshold < 55.0 and not doMarkov1:
80 print "motThreshold should be between 55 and 99 for a regular PSFM"
84 print "will use cistematic.core.motif C-extension to speed up motif search"
86 if useRank and usePeak:
87 print "will return region ranking based on peak height ranking"
90 print "ignoring '-rank': can only use ranking when using a region file with peak position and height"
93 mot = Motif("", motifFile=motfilename)
95 bestScore = mot.bestConsensusScore()
99 # minHits=-1 will force regions to be used regardless
100 # maxDist= 0 prevents merging of non-overlapping regions
102 regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak)
104 regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, keepPeak=usePeak)
111 hitRDS = ReadDataset.ReadDataset(chipfilename, verbose = True, cache=doCache)
113 outfile = open(outfilename, "w")
117 for chrom in regions:
118 if "rand" in chrom or "M" in chrom:
122 for region in regions[chrom]:
123 regionList.append((region.peakHeight, chrom, region.start, region.length, region.peakPos))
125 for region in regions[chrom]:
126 regionList.append((chrom, region.start, region.length))
135 for tuple in regionList:
137 (rpeakheight, rchrom, start, length, rpeakpos) = tuple
139 (rchrom, start, length) = tuple
142 seq = hg.sequence(rchrom, start, length)
144 print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length)
154 numHits = rpeakheight
156 if rchrom != currentChrom:
157 fullchrom = "chr" + rchrom
158 hitDict = hitRDS.getReadsDict(chrom=fullchrom)
159 currentChrom = rchrom
161 (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length)
169 matches = mot.locateMarkov1(seq, motThreshold)
171 matches = mot.locateMotif(seq, motThreshold)
173 for (pos, sense) in matches:
175 for (fpos, fdist) in found:
176 if pos + start == fpos:
181 found.append((start + pos, start + pos + motLen/2 - peakpos))
183 found.append((start + pos, pos + motLen/2 - peakpos))
185 found.append((start + pos, -1))
189 for (foundpos, peakdist) in found:
190 seq = hg.sequence(rchrom, foundpos, motLen)
192 (front, back) = mot.scoreMotif(seq)
195 score = int(100 * front / bestScore)
197 score = int(100 * back / bestScore)
199 seq = complement(seq)
204 outline = "chr%s:%d-%d\t%d\t%d\t%d\tchr%s:%d-%d\t%s\n" % (rchrom, foundpos, foundpos + motLen - 1, score, numHits, peakdist, rchrom, start, start + length, sense)
206 bestList.append((abs(peakdist), outline))
208 outfile.write(outline)
210 if bestOnly and foundValue:
212 outfile.write(bestList[0][1])
216 print "could not find a %s site for %s:%d-%d" % (mot.tagID, rchrom, start, start+ length)
219 if (count % 10000) == 0 and not printSeq:
223 print "did not find motif in %d regions" % notFoundIndex
226 if __name__ == "__main__":