first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / getSNPs.py
1 #
2 #  getSNPs.py
3 #  ENRAGE
4 #
5 # Originally written by: Wendy Lee
6 # Last modified: May 11th, 2009 by Ali Mortazavi
7
8 """
9     Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and and totalRatio (Cl). 
10     For each mismatch, choose the base change that occur most frequently (ie: has the highest number
11     of independent reads)
12     Threshold of Sl and Cl are from user input
13     Sl = # of independent reads supporting a base change at position S 
14     Cl = total # of all reads supporting a base change at position S / # of all # reads that pass through position S
15
16     usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where
17
18     uniqStartMin = # of independent reads supporting a base change at position S
19     totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
20 """
21
22 import sys
23 import optparse
24 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption
25 import ReadDataset
26
27 print "getSNPs: version 3.6"
28
29 try:
30     import psyco
31     psyco.full()
32 except:
33     print "psyco is not running"
34     pass
35
36 def usage():
37     print __doc__
38
39
40 def main(argv=None):
41     if not argv:
42         argv = sys.argv
43
44     usage = __doc__
45
46     parser = makeParser(usage)
47     (options, args) = parser.parse_args(argv[1:])
48
49     if len(args) < 4:
50         usage()
51         sys.exit(2)
52
53     hitfile = args[0]
54     uniqStartMin = float(args[1])
55     totalRatioMin = float(args[2])
56     outfilename = args[3]
57
58     if options.cachePages > 0:
59         doCache = True
60     else:
61         doCache = False
62
63     writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
64
65
66 def makeParser(usage=""):
67     parser = optparse.OptionParser(usage=usage)
68     parser.add_option("--nosplices", action="store_false", dest="doSplices")
69     parser.add_option("--enforceChr", action="store_true", dest="forceChr")
70     parser.add_option("--cache", type="int", dest="cachePages")
71
72     configParser = getConfigParser()
73     section = "getSNPs"
74     doSplices = getConfigBoolOption(configParser, section, "doSplices", True)
75     forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
76     cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
77
78     parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
79
80     return parser
81
82
83 def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
84     writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
85
86     outfile  = open(outfilename, "w")
87     header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange" 
88     outfile.write(header + "\n")
89
90     snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
91     for snpEntry in snpPropertiesList:
92         outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
93         outfile.write(outline)
94         outfile.flush() 
95
96     outfile.close()
97
98     writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList))
99
100
101 def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
102
103     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
104     if cachePages > 20000:
105         hitRDS.setDBcache(cachePages)
106
107     snpPropertiesList = []
108     readLength = hitRDS.getReadSize() 
109     chromList = hitRDS.getChromosomes()
110
111     for chrom in chromList:
112         if doNotProcessChromosome(forceChr, chrom):
113             continue
114
115         matchDict = getMatchDict(hitRDS, chrom, doSplices)
116         print "got match dict for %s " % chrom
117         mismatchDict = getMismatchDict(hitRDS, chrom, doSplices)
118         print "got mismatch dict for %s " % chrom
119         mismatchPositions = mismatchDict.keys()
120         mismatchPositions.sort()
121         for position in mismatchPositions:
122             totalCount = mismatchDict[position]["totalCount"]
123             uniqBaseDict = mismatchDict[position]["uniqBaseDict"]
124             totalBaseDict = mismatchDict[position]["totalBaseDict"]
125             highestCount = 0
126             highestBaseChange = "N-N"
127             highestTotalCount = 0
128             for baseChange in uniqBaseDict:
129                 if totalBaseDict[baseChange] > highestTotalCount:
130                     highestBaseChange = baseChange
131                     highestCount = uniqBaseDict[baseChange]
132                     highestTotalCount = totalBaseDict[baseChange]
133
134             Cl = 0.
135             matchCount = 0
136             if highestCount >= uniqStartMin:
137                 for matchpos in xrange(position - readLength + 1, position + 1):
138                     try:
139                         matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop])
140                     except:
141                         pass
142
143                 matchCount -= totalCount
144                 if matchCount < 0:
145                     matchCount = 0
146
147                 Sl = highestCount/float(readLength)
148                 Cl = highestTotalCount/float(highestTotalCount + matchCount)
149                 if Cl >= totalRatioMin:
150                     snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange)
151                     snpPropertiesList.append(snpProperties)
152
153     return snpPropertiesList
154
155
156 def doNotProcessChromosome(forceChr, chromosome):
157     if forceChr:
158         if chromosome[:3] != "chr":
159             return True
160     else:
161         return False
162
163
164 def getMatchDict(rds, chrom, withSplices=True):
165     spliceDict = {}
166     readDict = {}
167     finalDict = {}
168
169     try:
170         readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom)
171     except:
172         readDict[chrom] = []
173
174     for read in readDict[chrom]:
175         start = read["start"]
176         stop = read["stop"]
177         if start in finalDict:
178             finalDict[start].append(stop)
179         else:
180             finalDict[start] = [stop]
181
182     if withSplices:
183         try:
184             spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True)
185         except:
186             spliceDict[chrom] = []
187
188         for read in spliceDict[chrom]:
189             try:
190                 start = read["startL"]
191                 stop = read["stopL"]
192             except KeyError:
193                 start = read["startR"]
194                 stop = read["stopR"]
195
196             if start in finalDict:
197                 finalDict[start].append(stop)
198             else:
199                 finalDict[start] = [stop]
200
201     return finalDict
202
203
204 def getMismatchDict(rds, chrom, withSplices=True):
205     mismatchDict = {}
206     spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices)
207     for (start, change_at, change_base, change_from) in spliceDict[chrom]:
208         change = "%s-%s" % (change_base, change_from)
209         uniqueReadCount = 1
210         totalCount = 1
211         back = "%s:%s" % (str(start), change)
212         uniqBaseDict = {change: 1}
213         totalBaseDict = {change: 1}
214         if change_at in mismatchDict:
215             pos = "%s:%s" % (str(start), change)
216             totalCount = mismatchDict[change_at]["totalCount"]
217             totalCount += 1
218             totalBaseDict = mismatchDict[change_at]["totalBaseDict"]
219             if change in totalBaseDict: 
220                 totalBaseDict[change] += 1
221
222             uniqBaseDict = mismatchDict[change_at]["uniqBaseDict"]
223             uniqueReadCount = mismatchDict[change_at]["uniqueReadCount"]
224             back = mismatchDict[change_at]["back"]
225             if pos not in back:
226                 uniqueReadCount += 1
227                 if change in uniqBaseDict:
228                     uniqBaseDict[change] += 1 # dict contains total unique read counts
229
230                 back = "%s,%s" % (back, pos)
231
232         mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount,
233                                    "totalCount": totalCount, 
234                                    "back": back,
235                                    "uniqBaseDict": uniqBaseDict,
236                                    "totalBaseDict": totalBaseDict
237         }
238
239     return mismatchDict
240
241
242 if __name__ == "__main__":
243     main(sys.argv)