snapshot of 4.0a development. initial git repo commit
[erange.git] / getSNPs.py
1 #
2 #  getSNPs.py
3 #  ENRAGE
4 #
5 # Originally written by: Wendy Lee
6 # Last modified: May 11th, 2009 by Ali Mortazavi
7
8 """
9     Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and and totalRatio (Cl). 
10     For each mismatch, choose the base change that occur most frequently (ie: has the highest number
11     of independent reads)
12     Threshold of Sl and Cl are from user input
13     Sl = # of independent reads supporting a base change at position S 
14     Cl = total # of all reads supporting a base change at position S / # of all # reads that pass through position S
15
16     usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where
17
18     uniqStartMin = # of independent reads supporting a base change at position S
19     totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
20 """
21
22 import sys, optparse
23 from commoncode import readDataset, writeLog
24
25 print "%prog: version 3.5"
26
27 try:
28     import psyco
29     psyco.full()
30 except:
31     print "psyco is not running"
32     pass
33
34 def usage():
35     print __doc__
36
37
38 def main(argv=None):
39     if not argv:
40         argv = sys.argv
41
42     usage = __doc__
43
44     parser = optparse.OptionParser(usage=usage)
45     parser.add_option("--nosplices", action="store_false", dest="doSplices")
46     parser.add_option("--enforceChr", action="store_true", dest="forceChr")
47     parser.add_option("--cache", type="int", dest="cachePages")
48     parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
49     (options, args) = parser.parse_args(argv[1:])
50
51     if len(args) < 4:
52         usage()
53         sys.exit(2)
54
55     hitfile = args[0]
56     uniqStartMin = float(args[1])
57     totalRatioMin = float(args[2])
58     outfilename = args[3]
59
60     if options.cachePages > 0:
61         doCache = True
62     else:
63         doCache = False
64
65     writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
66
67
68 def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
69     writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
70
71     outfile  = open(outfilename, "w")
72     header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange" 
73     outfile.write(header + "\n")
74
75     snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
76     for snpEntry in snpPropertiesList:
77         outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
78         print outline
79         outfile.write(outline + "\n")
80         outfile.flush() 
81
82     outfile.close()
83
84     writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList))
85
86
87 def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
88
89     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
90     if cachePages > 20000:
91         hitRDS.setDBcache(cachePages)
92
93     snpPropertiesList = []
94     readLength = hitRDS.getReadSize() 
95     chromList = hitRDS.getChromosomes()
96
97     for chrom in chromList:
98         if doNotProcessChromosome(forceChr, chrom):
99             continue
100
101         matchDict = getMatchDict(hitRDS, chrom, doSplices)
102         print "got match dict for %s " % chrom
103         mismatchDict = getMismatchDict(hitRDS, chrom, doSplices)
104         print "got mismatch dict for %s " % chrom
105         mismatchPositions = mismatchDict.keys()
106         mismatchPositions.sort()
107         for position in mismatchPositions:
108             totalCount = mismatchDict[position]["totalCount"]
109             uniqBaseDict = mismatchDict[position]["uniqBaseDict"]
110             totalBaseDict = mismatchDict[position]["totalBaseDict"]
111             highestCount = 0
112             highestBaseChange = "N-N"
113             highestTotalCount = 0
114             for baseChange in uniqBaseDict:
115                 if totalBaseDict[baseChange] > highestTotalCount:
116                     highestBaseChange = baseChange
117                     highestCount = uniqBaseDict[baseChange]
118                     highestTotalCount = totalBaseDict[baseChange]
119
120             Cl = 0.
121             matchCount = 0
122             if highestCount >= uniqStartMin:
123                 for matchpos in xrange(position - readLength + 1, position + 1):
124                     try:
125                         matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop])
126                     except:
127                         pass
128
129                 matchCount -= totalCount
130                 if matchCount < 0:
131                     matchCount = 0
132
133                 Sl = highestCount/float(readLength)
134                 Cl = highestTotalCount/float(highestTotalCount + matchCount)
135                 if Cl >= totalRatioMin:
136                     snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange)
137                     snpPropertiesList.append(snpProperties)
138
139     return snpPropertiesList
140
141
142 def doNotProcessChromosome(forceChr, chromosome):
143     if forceChr:
144         if chromosome[:3] != "chr":
145             return True
146     else:
147         return False
148
149
150 def getMatchDict(rds, chrom, withSplices=True):
151     spliceDict = {}
152     readDict = {}
153     finalDict = {}
154
155     try:
156         readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom)
157     except:
158         readDict[chrom] = []
159
160     for (start, stop) in readDict[chrom]:
161         if finalDict.has_key(start):
162             finalDict[start].append(stop)
163         else:
164             finalDict[start] = [stop]
165
166     if withSplices:
167         try:
168             spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True)
169         except:
170             spliceDict[chrom] = []
171
172         for (start, stop) in spliceDict[chrom]:
173             if finalDict.has_key(start):
174                 finalDict[start].append(stop)
175             else:
176                 finalDict[start] = [stop]
177
178     return finalDict
179
180
181 def getMismatchDict(rds, chrom, withSplices=True):
182     mismatchDict = {}
183     spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices)
184     for (start, change_at, change_base, change_from) in spliceDict[chrom]:
185         change = "%s-%s" % (change_base, change_from)
186         uniqueReadCount = 1
187         totalCount = 1
188         back = "%s:%s" % (str(start), change)
189         uniqBaseDict = {change: 1}
190         totalBaseDict = {change: 1}
191         if mismatchDict.has_key(change_at):
192             (uniqueReadCount, totalCount, back, uniqBaseDict, totalBaseDict) = mismatchDict[change_at]
193             pos = "%s:%s" % (str(start), change)
194             totalCount += 1
195             if totalBaseDict.has_key(change): 
196                 totalBaseDict[change] += 1
197
198             if pos not in back:
199                 uniqueReadCount += 1
200                 if uniqBaseDict.has_key(change):
201                     uniqBaseDict[change] += 1 # dict contains total unique read counts
202
203                 back = "%s,%s" % (back, pos)
204
205         mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount,
206                                    "totalCount": totalCount, 
207                                    "back": back,
208                                    "uniqBaseDict": uniqBaseDict,
209                                    "totalBaseDict": totalBaseDict
210         }
211
212     return mismatchDict
213
214
215 if __name__ == "__main__":
216     main(sys.argv)