snapshot of 4.0a development. initial git repo commit
[erange.git] / getallsites.py
1 import sys, optparse
2 try:
3     import psyco
4     psyco.full()
5 except:
6     print 'psyco not running'
7
8 from cistematic.core.motif import Motif, hasMotifExtension
9 from cistematic.core import complement
10 from cistematic.genomes import Genome
11 from commoncode import readDataset, getMergedRegions, findPeak
12
13 print "%prog: version 2.4"
14
15
16 def main(argv=None):
17     if not argv:
18         argv = sys.argv
19
20     usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
21
22     parser = optparse.OptionParser(usage=usage)
23     parser.add_option("--dataset", dest="chipfilename")
24     parser.add_option("--cache", action="store_true", dest="doCache")
25     parser.add_option("--best", action="store_true", dest="bestOnly",
26                       help="only report the best position for each region")
27     parser.add_option("--usepeak", action="store_true", dest="usePeak",
28                       help="use peak position and height from regions file")
29     parser.add_option("--printseq", action="store_true", dest="printSeq")
30     parser.add_option("--nomerge", action="store_true", dest="noMerge")
31     parser.add_option("--markov1", action="store_true", dest="doMarkov1")
32     parser.add_option("--rank", type="int", dest="useRank",
33                       help="return region ranking based on peak height ranking [requires --usepeak]")
34     parser.set_defaults(chipfilename="", doCache=False, bestOnly=False, usePeak=False,
35                         printSeq=False, doMarkov1=False, useRank=False, noMerge=False)
36
37     (options, args) = parser.parse_args(argv[1:])
38
39     if len(args) < 5:
40         print usage
41         sys.exit(1)
42
43     genome = args[0]
44     motfilename = args[1]
45     motThreshold = float(args[2])
46     infilename = args[3]
47     outfilename = args[4]
48
49     getallsites(genome, motfilename, motThreshold, infilename, outfilename, options.chipfilename,
50                 options.doCache, options.bestOnly, options.usePeak, options.printSeq, options.doMarkov1,
51                 options.useRank, options.noMerge)
52
53
54 def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
55                 doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
56                 useRank=False, noMerge=False):
57
58     if motThreshold < 1.0 and doMarkov1:
59         print "motThreshold should be between 1.0 and 10.0 for markov1"
60         sys.exit(1)
61     elif motThreshold < 55.0 and not doMarkov1:
62         print "motThreshold should be between 55 and 99 for a regular PSFM"
63         sys.exit(1)
64
65     if hasMotifExtension:
66         print "will use cistematic.core.motif C-extension to speed up motif search"
67
68     if useRank and usePeak:
69         print "will return region ranking based on peak height ranking"
70         useRank = True
71     else:
72         print "ignoring '-rank': can only use ranking when using a region file with peak position and height"
73         useRank = False
74
75     mot = Motif("", motifFile=motfilename)
76     motLen = len(mot)
77     bestScore = mot.bestConsensusScore()
78
79     hg = Genome(genome)
80
81     # minHits=-1 will force regions to be used regardless
82     # maxDist= 0 prevents merging of non-overlapping regions
83     if noMerge:
84         regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak)
85     else:
86         regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, keepPeak=usePeak)
87
88     doRDS = False
89     if chipfilename:
90         doRDS = True
91
92     if doRDS:
93         hitRDS = readDataset(chipfilename, verbose = True, cache=doCache)
94
95     outfile = open(outfilename, "w")
96
97     regionList = []
98
99     for chrom in regions:
100         if "rand" in chrom or "M" in chrom:
101             continue
102
103         if usePeak:
104             for (start, stop, length, peakPos, peakHeight) in regions[chrom]:
105                 regionList.append((peakHeight, chrom, start, length, peakPos))
106         else:
107             for (start, stop, length) in regions[chrom]:
108                 regionList.append((chrom, start, length))
109
110     if usePeak:
111         regionList.sort()
112         regionList.reverse()
113
114     notFoundIndex = 0
115     currentChrom = ""
116     count = 0
117     for tuple in regionList:
118         if usePeak:
119             (rpeakheight, rchrom, start, length, rpeakpos) = tuple
120         else:
121             (rchrom, start, length) = tuple
122
123         try:
124             seq = hg.sequence(rchrom, start, length)
125         except:
126             print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length)
127             continue
128
129         count += 1
130         numHits = -1
131         if usePeak:
132             peakpos = rpeakpos
133             if useRank:
134                 numHits = count
135             else:
136                 numHits = rpeakheight
137         elif doRDS:
138             if rchrom != currentChrom:
139                 fullchrom = "chr" + rchrom
140                 hitDict = hitRDS.getReadsDict(chrom=fullchrom)
141                 currentChrom = rchrom
142
143             (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length)
144             if len(topPos) == 0:
145                 print "topPos error"
146
147             peakpos = topPos[0]
148
149         found = []
150         if doMarkov1:
151             matches = mot.locateMarkov1(seq, motThreshold)
152         else:
153             matches = mot.locateMotif(seq, motThreshold)
154
155         for (pos, sense) in matches:
156             alreadyFound = False
157             for (fpos, fdist) in found:
158                 if pos + start == fpos:
159                     alreadyFound = True
160
161             if not alreadyFound:
162                 if usePeak:
163                     found.append((start + pos, start + pos  + motLen/2 - peakpos))
164                 elif doRDS:
165                     found.append((start + pos, pos  + motLen/2 - peakpos))
166                 else:
167                     found.append((start + pos, -1))
168
169         foundValue = False
170         bestList = []
171         for (foundpos, peakdist) in found:
172             seq = hg.sequence(rchrom, foundpos, motLen)
173             foundValue = True
174             (front, back) = mot.scoreMotif(seq)
175             sense = "+"
176             if front >= back:
177                 score = int(100 * front / bestScore)
178             else:
179                 score = int(100 * back / bestScore)
180                 sense = "-"
181                 seq = complement(seq)
182
183             if printSeq:
184                 print seq
185
186             outline = "chr%s:%d-%d\t%d\t%d\t%d\tchr%s:%d-%d\t%s\n" % (rchrom, foundpos, foundpos + motLen - 1, score, numHits, peakdist, rchrom, start, start + length, sense)
187             if bestOnly:
188                 bestList.append((abs(peakdist), outline))
189             else:
190                 outfile.write(outline)
191
192         if bestOnly and foundValue:
193             bestList.sort()
194             outfile.write(bestList[0][1])
195
196         if not foundValue:
197             if printSeq:
198                 print "could not find a %s site for %s:%d-%d" % (mot.tagID, rchrom, start, start+ length)
199
200             notFoundIndex += 1
201         if (count % 10000) == 0 and not printSeq:
202             print count
203
204     outfile.close()
205     print "did not find motif in %d regions" % notFoundIndex
206
207
208 if __name__ == "__main__":
209     main(sys.argv)