getSNPs.py

   1 #
   2 #  getSNPs.py
   3 #  ENRAGE
   4 #
   5 # Originally written by: Wendy Lee
   6 # Last modified: May 11th, 2009 by Ali Mortazavi
   7
   8 """
   9     Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and and totalRatio (Cl).
  10     For each mismatch, choose the base change that occur most frequently (ie: has the highest number
  11     of independent reads)
  12     Threshold of Sl and Cl are from user input
  13     Sl = # of independent reads supporting a base change at position S
  14     Cl = total # of all reads supporting a base change at position S / # of all # reads that pass through position S
  15
  16     usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where
  17
  18     uniqStartMin = # of independent reads supporting a base change at position S
  19     totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
  20 """
  21
  22 import sys
  23 import optparse
  24 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption
  25 import ReadDataset
  26
  27 print "getSNPs: version 3.6"
  28
  29 try:
  30     import psyco
  31     psyco.full()
  32 except:
  33     print "psyco is not running"
  34     pass
  35
  36 def usage():
  37     print __doc__
  38
  39
  40 def main(argv=None):
  41     if not argv:
  42         argv = sys.argv
  43
  44     usage = __doc__
  45
  46     parser = makeParser(usage)
  47     (options, args) = parser.parse_args(argv[1:])
  48
  49     if len(args) < 4:
  50         usage()
  51         sys.exit(2)
  52
  53     hitfile = args[0]
  54     uniqStartMin = float(args[1])
  55     totalRatioMin = float(args[2])
  56     outfilename = args[3]
  57
  58     if options.cachePages > 0:
  59         doCache = True
  60     else:
  61         doCache = False
  62
  63     writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
  64
  65
  66 def makeParser(usage=""):
  67     parser = optparse.OptionParser(usage=usage)
  68     parser.add_option("--nosplices", action="store_false", dest="doSplices")
  69     parser.add_option("--enforceChr", action="store_true", dest="forceChr")
  70     parser.add_option("--cache", type="int", dest="cachePages")
  71
  72     configParser = getConfigParser()
  73     section = "getSNPs"
  74     doSplices = getConfigBoolOption(configParser, section, "doSplices", True)
  75     forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
  76     cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
  77
  78     parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
  79
  80     return parser
  81
  82
  83 def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
  84     writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
  85
  86     outfile  = open(outfilename, "w")
  87     header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange"
  88     outfile.write(header + "\n")
  89
  90     snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
  91     for snpEntry in snpPropertiesList:
  92         outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
  93         outfile.write(outline)
  94         outfile.flush()
  95
  96     outfile.close()
  97
  98     writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList))
  99
 100
 101 def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
 102
 103     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
 104     if cachePages > 20000:
 105         hitRDS.setDBcache(cachePages)
 106
 107     snpPropertiesList = []
 108     readLength = hitRDS.getReadSize()
 109     chromList = hitRDS.getChromosomes()
 110
 111     for chrom in chromList:
 112         if doNotProcessChromosome(forceChr, chrom):
 113             continue
 114
 115         matchDict = getMatchDict(hitRDS, chrom, doSplices)
 116         print "got match dict for %s " % chrom
 117         mismatchDict = getMismatchDict(hitRDS, chrom, doSplices)
 118         print "got mismatch dict for %s " % chrom
 119         mismatchPositions = mismatchDict.keys()
 120         mismatchPositions.sort()
 121         for position in mismatchPositions:
 122             totalCount = mismatchDict[position]["totalCount"]
 123             uniqBaseDict = mismatchDict[position]["uniqBaseDict"]
 124             totalBaseDict = mismatchDict[position]["totalBaseDict"]
 125             highestCount = 0
 126             highestBaseChange = "N-N"
 127             highestTotalCount = 0
 128             for baseChange in uniqBaseDict:
 129                 if totalBaseDict[baseChange] > highestTotalCount:
 130                     highestBaseChange = baseChange
 131                     highestCount = uniqBaseDict[baseChange]
 132                     highestTotalCount = totalBaseDict[baseChange]
 133
 134             Cl = 0.
 135             matchCount = 0
 136             if highestCount >= uniqStartMin:
 137                 for matchpos in xrange(position - readLength + 1, position + 1):
 138                     try:
 139                         matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop])
 140                     except:
 141                         pass
 142
 143                 matchCount -= totalCount
 144                 if matchCount < 0:
 145                     matchCount = 0
 146
 147                 Sl = highestCount/float(readLength)
 148                 Cl = highestTotalCount/float(highestTotalCount + matchCount)
 149                 if Cl >= totalRatioMin:
 150                     snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange)
 151                     snpPropertiesList.append(snpProperties)
 152
 153     return snpPropertiesList
 154
 155
 156 def doNotProcessChromosome(forceChr, chromosome):
 157     if forceChr:
 158         if chromosome[:3] != "chr":
 159             return True
 160     else:
 161         return False
 162
 163
 164 def getMatchDict(rds, chrom, withSplices=True):
 165     spliceDict = {}
 166     readDict = {}
 167     finalDict = {}
 168
 169     try:
 170         readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom)
 171     except:
 172         readDict[chrom] = []
 173
 174     for read in readDict[chrom]:
 175         start = read["start"]
 176         stop = read["stop"]
 177         if start in finalDict:
 178             finalDict[start].append(stop)
 179         else:
 180             finalDict[start] = [stop]
 181
 182     if withSplices:
 183         try:
 184             spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True)
 185         except:
 186             spliceDict[chrom] = []
 187
 188         for read in spliceDict[chrom]:
 189             try:
 190                 start = read["startL"]
 191                 stop = read["stopL"]
 192             except KeyError:
 193                 start = read["startR"]
 194                 stop = read["stopR"]
 195
 196             if start in finalDict:
 197                 finalDict[start].append(stop)
 198             else:
 199                 finalDict[start] = [stop]
 200
 201     return finalDict
 202
 203
 204 def getMismatchDict(rds, chrom, withSplices=True):
 205     mismatchDict = {}
 206     spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices)
 207     for (start, change_at, change_base, change_from) in spliceDict[chrom]:
 208         change = "%s-%s" % (change_base, change_from)
 209         uniqueReadCount = 1
 210         totalCount = 1
 211         back = "%s:%s" % (str(start), change)
 212         uniqBaseDict = {change: 1}
 213         totalBaseDict = {change: 1}
 214         if change_at in mismatchDict:
 215             pos = "%s:%s" % (str(start), change)
 216             totalCount = mismatchDict[change_at]["totalCount"]
 217             totalCount += 1
 218             totalBaseDict = mismatchDict[change_at]["totalBaseDict"]
 219             if change in totalBaseDict:
 220                 totalBaseDict[change] += 1
 221
 222             uniqBaseDict = mismatchDict[change_at]["uniqBaseDict"]
 223             uniqueReadCount = mismatchDict[change_at]["uniqueReadCount"]
 224             back = mismatchDict[change_at]["back"]
 225             if pos not in back:
 226                 uniqueReadCount += 1
 227                 if change in uniqBaseDict:
 228                     uniqBaseDict[change] += 1 # dict contains total unique read counts
 229
 230                 back = "%s,%s" % (back, pos)
 231
 232         mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount,
 233                                    "totalCount": totalCount,
 234                                    "back": back,
 235                                    "uniqBaseDict": uniqBaseDict,
 236                                    "totalBaseDict": totalBaseDict
 237         }
 238
 239     return mismatchDict
 240
 241
 242 if __name__ == "__main__":
 243     main(sys.argv)