first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / getSNPs.py
diff --git a/getSNPs.py b/getSNPs.py

index 0adde42c0b09a921f45c9edf01070077e68bf805..64948d405c523146e378c00fa4e204ca3ec75a4b 100755 (executable)
--- a/getSNPs.py
+++ b/getSNPs.py
@@ -19,10 +19,12 @@
      totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
  """
  
-import sys, optparse
-from commoncode import readDataset, writeLog
+import sys
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption
+import ReadDataset
  
-print "%prog: version 3.5"
+print "getSNPs: version 3.6"
  
  try:
      import psyco
@@ -41,11 +43,7 @@ def main(argv=None):
  
      usage = __doc__
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--nosplices", action="store_false", dest="doSplices")
-    parser.add_option("--enforceChr", action="store_true", dest="forceChr")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -65,6 +63,23 @@ def main(argv=None):
      writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nosplices", action="store_false", dest="doSplices")
+    parser.add_option("--enforceChr", action="store_true", dest="forceChr")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "getSNPs"
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", True)
+    forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
+
+    parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
+
+    return parser
+
+
  def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
      writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
  
@@ -75,8 +90,7 @@ def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache,
      snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
      for snpEntry in snpPropertiesList:
          outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
-        print outline
-        outfile.write(outline + "\n")
+        outfile.write(outline)
          outfile.flush() 
  
      outfile.close()
@@ -86,7 +100,7 @@ def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache,
  
  def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
  
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
      if cachePages > 20000:
          hitRDS.setDBcache(cachePages)
  
@@ -157,8 +171,10 @@ def getMatchDict(rds, chrom, withSplices=True):
      except:
          readDict[chrom] = []
  
-    for (start, stop) in readDict[chrom]:
-        if finalDict.has_key(start):
+    for read in readDict[chrom]:
+        start = read["start"]
+        stop = read["stop"]
+        if start in finalDict:
              finalDict[start].append(stop)
          else:
              finalDict[start] = [stop]
@@ -169,8 +185,15 @@ def getMatchDict(rds, chrom, withSplices=True):
          except:
              spliceDict[chrom] = []
  
-        for (start, stop) in spliceDict[chrom]:
-            if finalDict.has_key(start):
+        for read in spliceDict[chrom]:
+            try:
+                start = read["startL"]
+                stop = read["stopL"]
+            except KeyError:
+                start = read["startR"]
+                stop = read["stopR"]
+
+            if start in finalDict:
                  finalDict[start].append(stop)
              else:
                  finalDict[start] = [stop]
@@ -188,16 +211,20 @@ def getMismatchDict(rds, chrom, withSplices=True):
          back = "%s:%s" % (str(start), change)
          uniqBaseDict = {change: 1}
          totalBaseDict = {change: 1}
-        if mismatchDict.has_key(change_at):
-            (uniqueReadCount, totalCount, back, uniqBaseDict, totalBaseDict) = mismatchDict[change_at]
+        if change_at in mismatchDict:
              pos = "%s:%s" % (str(start), change)
+            totalCount = mismatchDict[change_at]["totalCount"]
              totalCount += 1
-            if totalBaseDict.has_key(change): 
+            totalBaseDict = mismatchDict[change_at]["totalBaseDict"]
+            if change in totalBaseDict: 
                  totalBaseDict[change] += 1
  
+            uniqBaseDict = mismatchDict[change_at]["uniqBaseDict"]
+            uniqueReadCount = mismatchDict[change_at]["uniqueReadCount"]
+            back = mismatchDict[change_at]["back"]
              if pos not in back:
                  uniqueReadCount += 1
-                if uniqBaseDict.has_key(change):
+                if change in uniqBaseDict:
                      uniqBaseDict[change] += 1 # dict contains total unique read counts
  
                  back = "%s,%s" % (back, pos)