getSNPs.py

   1 #
   2 #  getSNPs.py
   3 #  ENRAGE
   4 #
   5 # Originally written by: Wendy Lee
   6 # Last modified: May 11th, 2009 by Ali Mortazavi
   7
   8 """
   9     Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and and totalRatio (Cl).
  10     For each mismatch, choose the base change that occur most frequently (ie: has the highest number
  11     of independent reads)
  12     Threshold of Sl and Cl are from user input
  13     Sl = # of independent reads supporting a base change at position S
  14     Cl = total # of all reads supporting a base change at position S / # of all # reads that pass through position S
  15
  16     usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where
  17
  18     uniqStartMin = # of independent reads supporting a base change at position S
  19     totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
  20 """
  21
  22 import sys, optparse
  23 from commoncode import readDataset, writeLog
  24
  25 print "%prog: version 3.5"
  26
  27 try:
  28     import psyco
  29     psyco.full()
  30 except:
  31     print "psyco is not running"
  32     pass
  33
  34 def usage():
  35     print __doc__
  36
  37
  38 def main(argv=None):
  39     if not argv:
  40         argv = sys.argv
  41
  42     usage = __doc__
  43
  44     parser = optparse.OptionParser(usage=usage)
  45     parser.add_option("--nosplices", action="store_false", dest="doSplices")
  46     parser.add_option("--enforceChr", action="store_true", dest="forceChr")
  47     parser.add_option("--cache", type="int", dest="cachePages")
  48     parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
  49     (options, args) = parser.parse_args(argv[1:])
  50
  51     if len(args) < 4:
  52         usage()
  53         sys.exit(2)
  54
  55     hitfile = args[0]
  56     uniqStartMin = float(args[1])
  57     totalRatioMin = float(args[2])
  58     outfilename = args[3]
  59
  60     if options.cachePages > 0:
  61         doCache = True
  62     else:
  63         doCache = False
  64
  65     writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
  66
  67
  68 def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
  69     writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
  70
  71     outfile  = open(outfilename, "w")
  72     header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange"
  73     outfile.write(header + "\n")
  74
  75     snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
  76     for snpEntry in snpPropertiesList:
  77         outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
  78         print outline
  79         outfile.write(outline + "\n")
  80         outfile.flush()
  81
  82     outfile.close()
  83
  84     writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList))
  85
  86
  87 def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
  88
  89     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
  90     if cachePages > 20000:
  91         hitRDS.setDBcache(cachePages)
  92
  93     snpPropertiesList = []
  94     readLength = hitRDS.getReadSize()
  95     chromList = hitRDS.getChromosomes()
  96
  97     for chrom in chromList:
  98         if doNotProcessChromosome(forceChr, chrom):
  99             continue
 100
 101         matchDict = getMatchDict(hitRDS, chrom, doSplices)
 102         print "got match dict for %s " % chrom
 103         mismatchDict = getMismatchDict(hitRDS, chrom, doSplices)
 104         print "got mismatch dict for %s " % chrom
 105         mismatchPositions = mismatchDict.keys()
 106         mismatchPositions.sort()
 107         for position in mismatchPositions:
 108             totalCount = mismatchDict[position]["totalCount"]
 109             uniqBaseDict = mismatchDict[position]["uniqBaseDict"]
 110             totalBaseDict = mismatchDict[position]["totalBaseDict"]
 111             highestCount = 0
 112             highestBaseChange = "N-N"
 113             highestTotalCount = 0
 114             for baseChange in uniqBaseDict:
 115                 if totalBaseDict[baseChange] > highestTotalCount:
 116                     highestBaseChange = baseChange
 117                     highestCount = uniqBaseDict[baseChange]
 118                     highestTotalCount = totalBaseDict[baseChange]
 119
 120             Cl = 0.
 121             matchCount = 0
 122             if highestCount >= uniqStartMin:
 123                 for matchpos in xrange(position - readLength + 1, position + 1):
 124                     try:
 125                         matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop])
 126                     except:
 127                         pass
 128
 129                 matchCount -= totalCount
 130                 if matchCount < 0:
 131                     matchCount = 0
 132
 133                 Sl = highestCount/float(readLength)
 134                 Cl = highestTotalCount/float(highestTotalCount + matchCount)
 135                 if Cl >= totalRatioMin:
 136                     snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange)
 137                     snpPropertiesList.append(snpProperties)
 138
 139     return snpPropertiesList
 140
 141
 142 def doNotProcessChromosome(forceChr, chromosome):
 143     if forceChr:
 144         if chromosome[:3] != "chr":
 145             return True
 146     else:
 147         return False
 148
 149
 150 def getMatchDict(rds, chrom, withSplices=True):
 151     spliceDict = {}
 152     readDict = {}
 153     finalDict = {}
 154
 155     try:
 156         readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom)
 157     except:
 158         readDict[chrom] = []
 159
 160     for (start, stop) in readDict[chrom]:
 161         if finalDict.has_key(start):
 162             finalDict[start].append(stop)
 163         else:
 164             finalDict[start] = [stop]
 165
 166     if withSplices:
 167         try:
 168             spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True)
 169         except:
 170             spliceDict[chrom] = []
 171
 172         for (start, stop) in spliceDict[chrom]:
 173             if finalDict.has_key(start):
 174                 finalDict[start].append(stop)
 175             else:
 176                 finalDict[start] = [stop]
 177
 178     return finalDict
 179
 180
 181 def getMismatchDict(rds, chrom, withSplices=True):
 182     mismatchDict = {}
 183     spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices)
 184     for (start, change_at, change_base, change_from) in spliceDict[chrom]:
 185         change = "%s-%s" % (change_base, change_from)
 186         uniqueReadCount = 1
 187         totalCount = 1
 188         back = "%s:%s" % (str(start), change)
 189         uniqBaseDict = {change: 1}
 190         totalBaseDict = {change: 1}
 191         if mismatchDict.has_key(change_at):
 192             (uniqueReadCount, totalCount, back, uniqBaseDict, totalBaseDict) = mismatchDict[change_at]
 193             pos = "%s:%s" % (str(start), change)
 194             totalCount += 1
 195             if totalBaseDict.has_key(change):
 196                 totalBaseDict[change] += 1
 197
 198             if pos not in back:
 199                 uniqueReadCount += 1
 200                 if uniqBaseDict.has_key(change):
 201                     uniqBaseDict[change] += 1 # dict contains total unique read counts
 202
 203                 back = "%s,%s" % (back, pos)
 204
 205         mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount,
 206                                    "totalCount": totalCount,
 207                                    "back": back,
 208                                    "uniqBaseDict": uniqBaseDict,
 209                                    "totalBaseDict": totalBaseDict
 210         }
 211
 212     return mismatchDict
 213
 214
 215 if __name__ == "__main__":
 216     main(sys.argv)