5 # Originally written by: Wendy Lee
6 # Last modified: May 11th, 2009 by Ali Mortazavi
9 Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and and totalRatio (Cl).
10 For each mismatch, choose the base change that occur most frequently (ie: has the highest number
12 Threshold of Sl and Cl are from user input
13 Sl = # of independent reads supporting a base change at position S
14 Cl = total # of all reads supporting a base change at position S / # of all # reads that pass through position S
16 usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where
18 uniqStartMin = # of independent reads supporting a base change at position S
19 totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
24 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption
27 print "getSNPs: version 3.6"
33 print "psyco is not running"
46 parser = makeParser(usage)
47 (options, args) = parser.parse_args(argv[1:])
54 uniqStartMin = float(args[1])
55 totalRatioMin = float(args[2])
58 if options.cachePages > 0:
63 writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
66 def makeParser(usage=""):
67 parser = optparse.OptionParser(usage=usage)
68 parser.add_option("--nosplices", action="store_false", dest="doSplices")
69 parser.add_option("--enforceChr", action="store_true", dest="forceChr")
70 parser.add_option("--cache", type="int", dest="cachePages")
72 configParser = getConfigParser()
74 doSplices = getConfigBoolOption(configParser, section, "doSplices", True)
75 forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
76 cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
78 parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
83 def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
84 writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
86 outfile = open(outfilename, "w")
87 header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange"
88 outfile.write(header + "\n")
90 snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
91 for snpEntry in snpPropertiesList:
92 outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
93 outfile.write(outline)
98 writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList))
101 def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
103 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
104 if cachePages > 20000:
105 hitRDS.setDBcache(cachePages)
107 snpPropertiesList = []
108 readLength = hitRDS.getReadSize()
109 chromList = hitRDS.getChromosomes()
111 for chrom in chromList:
112 if doNotProcessChromosome(forceChr, chrom):
115 matchDict = getMatchDict(hitRDS, chrom, doSplices)
116 print "got match dict for %s " % chrom
117 mismatchDict = getMismatchDict(hitRDS, chrom, doSplices)
118 print "got mismatch dict for %s " % chrom
119 mismatchPositions = mismatchDict.keys()
120 mismatchPositions.sort()
121 for position in mismatchPositions:
122 totalCount = mismatchDict[position]["totalCount"]
123 uniqBaseDict = mismatchDict[position]["uniqBaseDict"]
124 totalBaseDict = mismatchDict[position]["totalBaseDict"]
126 highestBaseChange = "N-N"
127 highestTotalCount = 0
128 for baseChange in uniqBaseDict:
129 if totalBaseDict[baseChange] > highestTotalCount:
130 highestBaseChange = baseChange
131 highestCount = uniqBaseDict[baseChange]
132 highestTotalCount = totalBaseDict[baseChange]
136 if highestCount >= uniqStartMin:
137 for matchpos in xrange(position - readLength + 1, position + 1):
139 matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop])
143 matchCount -= totalCount
147 Sl = highestCount/float(readLength)
148 Cl = highestTotalCount/float(highestTotalCount + matchCount)
149 if Cl >= totalRatioMin:
150 snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange)
151 snpPropertiesList.append(snpProperties)
153 return snpPropertiesList
156 def doNotProcessChromosome(forceChr, chromosome):
158 if chromosome[:3] != "chr":
164 def getMatchDict(rds, chrom, withSplices=True):
170 readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom)
174 for read in readDict[chrom]:
175 start = read["start"]
177 if start in finalDict:
178 finalDict[start].append(stop)
180 finalDict[start] = [stop]
184 spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True)
186 spliceDict[chrom] = []
188 for read in spliceDict[chrom]:
190 start = read["startL"]
193 start = read["startR"]
196 if start in finalDict:
197 finalDict[start].append(stop)
199 finalDict[start] = [stop]
204 def getMismatchDict(rds, chrom, withSplices=True):
206 spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices)
207 for (start, change_at, change_base, change_from) in spliceDict[chrom]:
208 change = "%s-%s" % (change_base, change_from)
211 back = "%s:%s" % (str(start), change)
212 uniqBaseDict = {change: 1}
213 totalBaseDict = {change: 1}
214 if change_at in mismatchDict:
215 pos = "%s:%s" % (str(start), change)
216 totalCount = mismatchDict[change_at]["totalCount"]
218 totalBaseDict = mismatchDict[change_at]["totalBaseDict"]
219 if change in totalBaseDict:
220 totalBaseDict[change] += 1
222 uniqBaseDict = mismatchDict[change_at]["uniqBaseDict"]
223 uniqueReadCount = mismatchDict[change_at]["uniqueReadCount"]
224 back = mismatchDict[change_at]["back"]
227 if change in uniqBaseDict:
228 uniqBaseDict[change] += 1 # dict contains total unique read counts
230 back = "%s,%s" % (back, pos)
232 mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount,
233 "totalCount": totalCount,
235 "uniqBaseDict": uniqBaseDict,
236 "totalBaseDict": totalBaseDict
242 if __name__ == "__main__":