erange version 4.0a dev release
[erange.git] / getallsites.py
1 import sys, optparse
2 try:
3     import psyco
4     psyco.full()
5 except:
6     print 'psyco not running'
7
8 from cistematic.core.motif import Motif, hasMotifExtension
9 from cistematic.core import complement
10 from cistematic.genomes import Genome
11 from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
12 import ReadDataset
13
14 print "getallsites: version 2.5"
15
16
17 def main(argv=None):
18     if not argv:
19         argv = sys.argv
20
21     usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
22
23     parser = getParser(usage)
24     (options, args) = parser.parse_args(argv[1:])
25
26     if len(args) < 5:
27         print usage
28         sys.exit(1)
29
30     genome = args[0]
31     motfilename = args[1]
32     motThreshold = float(args[2])
33     infilename = args[3]
34     outfilename = args[4]
35
36     getallsites(genome, motfilename, motThreshold, infilename, outfilename, options.chipfilename,
37                 options.doCache, options.bestOnly, options.usePeak, options.printSeq, options.doMarkov1,
38                 options.useRank, options.noMerge)
39
40
41 def getParser(usage):
42     parser = optparse.OptionParser(usage=usage)
43     parser.add_option("--dataset", dest="chipfilename")
44     parser.add_option("--cache", action="store_true", dest="doCache")
45     parser.add_option("--best", action="store_true", dest="bestOnly",
46                       help="only report the best position for each region")
47     parser.add_option("--usepeak", action="store_true", dest="usePeak",
48                       help="use peak position and height from regions file")
49     parser.add_option("--printseq", action="store_true", dest="printSeq")
50     parser.add_option("--nomerge", action="store_true", dest="noMerge")
51     parser.add_option("--markov1", action="store_true", dest="doMarkov1")
52     parser.add_option("--rank", type="int", dest="useRank",
53                       help="return region ranking based on peak height ranking [requires --usepeak]")
54
55     configParser = getConfigParser()
56     section = "getallsites"
57     chipfilename = getConfigOption(configParser, section, "chipfilename", "")
58     doCache = getConfigBoolOption(configParser, section, "doCache", False)
59     bestOnly = getConfigBoolOption(configParser, section, "bestOnly", False)
60     usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
61     printSeq = getConfigBoolOption(configParser, section, "printSeq", False)
62     doMarkov1 = getConfigBoolOption(configParser, section, "doMarkov1", False)
63     useRank = getConfigBoolOption(configParser, section, "useRank", False)
64     noMerge = getConfigBoolOption(configParser, section, "noMerge", False)
65
66     parser.set_defaults(chipfilename=chipfilename, doCache=doCache, bestOnly=bestOnly, usePeak=usePeak,
67                         printSeq=printSeq, doMarkov1=doMarkov1, useRank=useRank, noMerge=noMerge)
68
69     return parser
70
71
72 def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
73                 doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
74                 useRank=False, noMerge=False):
75
76     if motThreshold < 1.0 and doMarkov1:
77         print "motThreshold should be between 1.0 and 10.0 for markov1"
78         sys.exit(1)
79     elif motThreshold < 55.0 and not doMarkov1:
80         print "motThreshold should be between 55 and 99 for a regular PSFM"
81         sys.exit(1)
82
83     if hasMotifExtension:
84         print "will use cistematic.core.motif C-extension to speed up motif search"
85
86     if useRank and usePeak:
87         print "will return region ranking based on peak height ranking"
88         useRank = True
89     else:
90         print "ignoring '-rank': can only use ranking when using a region file with peak position and height"
91         useRank = False
92
93     mot = Motif("", motifFile=motfilename)
94     motLen = len(mot)
95     bestScore = mot.bestConsensusScore()
96
97     hg = Genome(genome)
98
99     # minHits=-1 will force regions to be used regardless
100     # maxDist= 0 prevents merging of non-overlapping regions
101     if noMerge:
102         regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak)
103     else:
104         regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, keepPeak=usePeak)
105
106     doRDS = False
107     if chipfilename:
108         doRDS = True
109
110     if doRDS:
111         hitRDS = ReadDataset.ReadDataset(chipfilename, verbose = True, cache=doCache)
112
113     outfile = open(outfilename, "w")
114
115     regionList = []
116
117     for chrom in regions:
118         if "rand" in chrom or "M" in chrom:
119             continue
120
121         if usePeak:
122             for region in regions[chrom]:
123                 regionList.append((region.peakHeight, chrom, region.start, region.length, region.peakPos))
124         else:
125             for region in regions[chrom]:
126                 regionList.append((chrom, region.start, region.length))
127
128     if usePeak:
129         regionList.sort()
130         regionList.reverse()
131
132     notFoundIndex = 0
133     currentChrom = ""
134     count = 0
135     for tuple in regionList:
136         if usePeak:
137             (rpeakheight, rchrom, start, length, rpeakpos) = tuple
138         else:
139             (rchrom, start, length) = tuple
140
141         try:
142             seq = hg.sequence(rchrom, start, length)
143         except:
144             print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length)
145             continue
146
147         count += 1
148         numHits = -1
149         if usePeak:
150             peakpos = rpeakpos
151             if useRank:
152                 numHits = count
153             else:
154                 numHits = rpeakheight
155         elif doRDS:
156             if rchrom != currentChrom:
157                 fullchrom = "chr" + rchrom
158                 hitDict = hitRDS.getReadsDict(chrom=fullchrom)
159                 currentChrom = rchrom
160
161             (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length)
162             if len(topPos) == 0:
163                 print "topPos error"
164
165             peakpos = topPos[0]
166
167         found = []
168         if doMarkov1:
169             matches = mot.locateMarkov1(seq, motThreshold)
170         else:
171             matches = mot.locateMotif(seq, motThreshold)
172
173         for (pos, sense) in matches:
174             alreadyFound = False
175             for (fpos, fdist) in found:
176                 if pos + start == fpos:
177                     alreadyFound = True
178
179             if not alreadyFound:
180                 if usePeak:
181                     found.append((start + pos, start + pos  + motLen/2 - peakpos))
182                 elif doRDS:
183                     found.append((start + pos, pos  + motLen/2 - peakpos))
184                 else:
185                     found.append((start + pos, -1))
186
187         foundValue = False
188         bestList = []
189         for (foundpos, peakdist) in found:
190             seq = hg.sequence(rchrom, foundpos, motLen)
191             foundValue = True
192             (front, back) = mot.scoreMotif(seq)
193             sense = "+"
194             if front >= back:
195                 score = int(100 * front / bestScore)
196             else:
197                 score = int(100 * back / bestScore)
198                 sense = "-"
199                 seq = complement(seq)
200
201             if printSeq:
202                 print seq
203
204             outline = "chr%s:%d-%d\t%d\t%d\t%d\tchr%s:%d-%d\t%s\n" % (rchrom, foundpos, foundpos + motLen - 1, score, numHits, peakdist, rchrom, start, start + length, sense)
205             if bestOnly:
206                 bestList.append((abs(peakdist), outline))
207             else:
208                 outfile.write(outline)
209
210         if bestOnly and foundValue:
211             bestList.sort()
212             outfile.write(bestList[0][1])
213
214         if not foundValue:
215             if printSeq:
216                 print "could not find a %s site for %s:%d-%d" % (mot.tagID, rchrom, start, start+ length)
217
218             notFoundIndex += 1
219         if (count % 10000) == 0 and not printSeq:
220             print count
221
222     outfile.close()
223     print "did not find motif in %d regions" % notFoundIndex
224
225
226 if __name__ == "__main__":
227     main(sys.argv)