first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / normalizeExpandedExonic.py
diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py

index 4d174bf1cd1c35b6a4e6e84c95e86e50bc4f3b95..e4eb12e7d1e1b35a6e09a104346e6d22d2d92a6d 100644 (file)
--- a/normalizeExpandedExonic.py
+++ b/normalizeExpandedExonic.py
@@ -4,29 +4,23 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import pysam
  from cistematic.genomes import Genome
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
  
-print "%prog: version 5.6"
+print "normalizeExpandedExonic: version 5.7"
  
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
-
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--gidField", type="int", dest="fieldID")
-    parser.add_option("--maxLength", type="float", dest="maxLength")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--models", dest="extendGenome")
-    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
-    parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
-                        replaceModels=False)
+    usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
  
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(sys.argv) < 6:
@@ -40,121 +34,62 @@ def main(argv=None):
      splicecountfile = args[3]
      outfile = args[4]
  
-    candidateLines = []
+    candidatefilename = ""
      acceptedfilename = ""
-    if len(args) > 5:
-        try:
-            candidatefile = open(args[5])
-            candidateLines = candidatefile.readlines()
-            candidatefile.close()
-            acceptedfilename = args[6]
-        except IndexError:
-            pass
-
-    normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
-                            candidateLines, acceptedfilename, options.fieldID,
+    try:
+        candidatefilename = args[5]
+        acceptedfilename = args[6]
+    except IndexError:
+        pass
+
+    bamfile = pysam.Samfile(hitfile, "rb")    
+    uniqcount = getHeaderComment(bamfile.header, "Unique")
+    
+    normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
+                            candidatefilename, acceptedfilename, options.fieldID,
                              options.maxLength, options.doCache, options.extendGenome,
                              options.replaceModels)
  
  
-def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
-                            outfilename, candidateLines=[], acceptedfilename="",
-                            fieldID=0, maxLength=1000000000., doCache=False,
-                            extendGenome="", replaceModels=False):
-
-    uniquecountfile = open(uniquecountfilename)
-
-    if acceptedfilename:
-        acceptedfile = open(acceptedfilename, "w")
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--gidField", type="int", dest="fieldID")
+    parser.add_option("--maxLength", type="float", dest="maxLength")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  
-    dosplicecount = False
-    if splicecountfilename != "none":
-        dosplicecount = True
-        splicecountfile = open(splicecountfilename)
+    configParser = getConfigParser()
+    section = "normalizeExpandedExonic"
+    fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
+    maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
  
-    if extendGenome:
-        if replaceModels:
-            print "will replace gene models with %s" % extendGenome
-        else:
-            print "will extend gene models with %s" % extendGenome
+    parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
+                        replaceModels=replaceModels)
  
-    if doCache:
-        cacheGeneDB(genome)
-        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        print "%s cached" % genome
-    else:
-        hg = Genome(genome, inRAM=True)
+    return parser
  
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
  
-    RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)    
-    uniqcount = RDS.getUniqsCount()
+def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
+                            outfilename, candidatefilename="", acceptedfilename="",
+                            fieldID=0, maxLength=1000000000., doCache=False,
+                            extendGenome="", replaceModels=False):
+  
      print "%d unique reads" % uniqcount
-
-    splicecount = 0
-    countDict = {}
-    gidList = []
-    farList = []
-    candidateDict = {}
-
-    gidToGeneDict = {}
-
-    featuresDict = hg.getallGeneFeatures()
-    print "got featuresDict"
-
-    outfile = open(outfilename, "w")
-
-    for line in uniquecountfile:
-        fields = line.strip().split()
-        gid = fields[fieldID]
-        gene = fields[1]
-        countDict[gid] = float(fields[-1])
-        gidList.append(gid)
-        gidToGeneDict[gid] = gene
-
-    uniquecountfile.close()
-
-    if dosplicecount:
-        for line in splicecountfile:
-            fields = line.strip().split()
-            gid = fields[fieldID]
-            try:
-                countDict[gid] += float(fields[-1])
-            except:
-                print fields
-                continue
-
-            splicecount += float(fields[-1])
-
-        splicecountfile.close()
-
-    for line in candidateLines:
-        if "#" in line:
-            continue
-
-        fields = line.strip().split()
-        gid = fields[1]
-        gene = fields[0]
-        if gid not in gidList:
-            if gid not in farList:
-                farList.append(gid)
-                gidToGeneDict[gid] = gene
-
-            if gid not in countDict:
-                countDict[gid] = 0
-
-            countDict[gid] += float(fields[6])
-
-        if gid not in candidateDict:
-            candidateDict[gid] = []
-
-        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
+    gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="")
      totalCount = (uniqcount + splicecount) / 1000000.
      uniqScale = uniqcount / 1000000.
+    if acceptedfilename:
+        acceptedfile = open(acceptedfilename, "w")
+
+    outfile = open(outfilename, "w")
+    featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+    print "got featuresDict"
      for gid in gidList:
-        gene = gidToGeneDict[gid]
+        gene = gidToGeneDict[gid]["name"]
          featureList = []
          try:
              featureList = featuresDict[gid]
@@ -176,38 +111,41 @@ def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfil
          elif geneLength > maxLength:
              geneLength = maxLength
  
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
          rpkm = rpm / geneLength
          if gid in candidateDict:
              for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-                cratio = cCount / (cLength / 1000.)
-                cratio = (uniqScale * cratio) / totalCount
+                kilobaseLength = cLength / 1000.
+                cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
                  if 10. * cratio < rpkm:
                      continue
  
-                countDict[gid] += cCount
-                geneLength += cLength / 1000.
-                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                gidToGeneDict[gid]["count"] += cCount
+                geneLength += kilobaseLength
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
  
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
          rpkm = rpm / geneLength
          outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
  
-    for gid in farList:
-        gene = gidToGeneDict[gid]
-        geneLength = 0
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            geneLength += cLength / 1000.
-
+    for (gid, geneLength) in farList:
          if geneLength < 0.1:
              continue
  
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            cratio = cCount / (cLength / 1000.)
-            cratio = cratio / totalCount
-            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
-        rpm = countDict[gid] / totalCount
+        gene = gidToGeneDict[gid]["name"]
+        if acceptedfilename:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                kilobaseLength = cLength / 1000.
+                cratio = (cCount / kilobaseLength) / totalCount
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
+
+        rpm = gidToGeneDict[gid]["count"] / totalCount
          rpkm = rpm / geneLength
          outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
  
@@ -217,9 +155,115 @@ def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfil
      except:
          pass
  
+
+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
+
+    gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+    uniqueGidList = gidToGeneDict.keys()
+    if candidatefilename:
+        candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+    else:
+        candidateDict = {}
+        geneDictUpdates = {}
+
+    farList = []
+    for gid in geneDictUpdates:
+        gidToGeneDict[gid] = geneDictUpdates[gid]
+        geneLength = 0.
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        farList.append((gid, geneLength))
+
+    return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+    gidToGeneDict = {}
+    uniquecountfile = open(uniquecountfilename)
+    for line in uniquecountfile:
+        fields = line.strip().split()
+        gid = fields[fieldID]
+        gene = fields[1]
+        gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+    uniquecountfile.close()
+    splicecount = 0.
+    if splicecountfilename != "none":
+        splicecountfile = open(splicecountfilename)
+        for line in splicecountfile:
+            fields = line.strip().split()
+            gid = fields[fieldID]
+            try:
+                gidToGeneDict[gid]["count"] += float(fields[-1])
+            except:
+                print fields
+                continue
+
+            splicecount += float(fields[-1])
+
+        splicecountfile.close()
+
+    return gidToGeneDict, splicecount
+
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+    """ Reads entries from the candiates file
+        gid entries not in gidList are returned for inclusion
+        creates and returns a dictionary keyed off gid with a single list of
+        tuples (count, length, chrom, start, stop) for each gid
+    """
+    geneDictUpdates = {}
+    candidateDict = {}
+    candidatefile = open(candidatefilename)
+    for line in candidatefile.readlines():
+        if "#" in line:
+            continue
+
+        fields = line.strip().split()
+        gid = fields[1]
+        gene = fields[0]
+        if gid not in gidList:
+            try:
+                geneDictUpdates[gid]["count"] += float(fields[6])
+            except KeyError:
+                geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+        try:
+            candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+        except KeyError:
+            candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+    candidatefile.close()
+
+    return candidateDict, geneDictUpdates
+
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+
+    if doCache:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        print "%s cached" % genome
+    else:
+        hg = Genome(genome, inRAM=True)
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    featuresDict = hg.getallGeneFeatures()
+
      if doCache:
          uncacheGeneDB(genome)
  
+    return featuresDict
+
  
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file