convert standard analysis pipelines to use bam format natively

[erange.git] / normalizeExpandedExonic.py
diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py

index cf02cb3af4e26d987a971817057c9e790dae874d..e4eb12e7d1e1b35a6e09a104346e6d22d2d92a6d 100644 (file)
--- a/normalizeExpandedExonic.py
+++ b/normalizeExpandedExonic.py
@@ -6,10 +6,10 @@ except:
  
  import sys
  import optparse
-import ReadDataset
+import pysam
  from cistematic.genomes import Genome
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
  
  print "normalizeExpandedExonic: version 5.7"
  
@@ -18,7 +18,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+    usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
  
      parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
@@ -34,22 +34,19 @@ def main(argv=None):
      splicecountfile = args[3]
      outfile = args[4]
  
-    candidateLines = []
+    candidatefilename = ""
      acceptedfilename = ""
-    if len(args) > 5:
-        try:
-            candidatefile = open(args[5])
-            candidateLines = candidatefile.readlines()
-            candidatefile.close()
-            acceptedfilename = args[6]
-        except IndexError:
-            pass
-
-    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)    
-    uniqcount = RDS.getUniqsCount()
+    try:
+        candidatefilename = args[5]
+        acceptedfilename = args[6]
+    except IndexError:
+        pass
+
+    bamfile = pysam.Samfile(hitfile, "rb")    
+    uniqcount = getHeaderComment(bamfile.header, "Unique")
      
      normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
-                            candidateLines, acceptedfilename, options.fieldID,
+                            candidatefilename, acceptedfilename, options.fieldID,
                              options.maxLength, options.doCache, options.extendGenome,
                              options.replaceModels)
  
@@ -77,102 +74,22 @@ def makeParser(usage=""):
  
  
  def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
-                            outfilename, candidateLines=[], acceptedfilename="",
+                            outfilename, candidatefilename="", acceptedfilename="",
                              fieldID=0, maxLength=1000000000., doCache=False,
                              extendGenome="", replaceModels=False):
-
-    uniquecountfile = open(uniquecountfilename)
-
+  
+    print "%d unique reads" % uniqcount
+    gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="")
+    totalCount = (uniqcount + splicecount) / 1000000.
+    uniqScale = uniqcount / 1000000.
      if acceptedfilename:
          acceptedfile = open(acceptedfilename, "w")
  
-    dosplicecount = False
-    if splicecountfilename != "none":
-        dosplicecount = True
-        splicecountfile = open(splicecountfilename)
-
-    if extendGenome:
-        if replaceModels:
-            print "will replace gene models with %s" % extendGenome
-        else:
-            print "will extend gene models with %s" % extendGenome
-
-    if doCache:
-        cacheGeneDB(genome)
-        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        print "%s cached" % genome
-    else:
-        hg = Genome(genome, inRAM=True)
-
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
-
-    
-    print "%d unique reads" % uniqcount
-
-    splicecount = 0
-    countDict = {}
-    gidList = []
-    farList = []
-    candidateDict = {}
-
-    gidToGeneDict = {}
-
-    featuresDict = hg.getallGeneFeatures()
-    print "got featuresDict"
-
      outfile = open(outfilename, "w")
-
-    for line in uniquecountfile:
-        fields = line.strip().split()
-        gid = fields[fieldID]
-        gene = fields[1]
-        countDict[gid] = float(fields[-1])
-        gidList.append(gid)
-        gidToGeneDict[gid] = gene
-
-    uniquecountfile.close()
-
-    if dosplicecount:
-        for line in splicecountfile:
-            fields = line.strip().split()
-            gid = fields[fieldID]
-            try:
-                countDict[gid] += float(fields[-1])
-            except:
-                print fields
-                continue
-
-            splicecount += float(fields[-1])
-
-        splicecountfile.close()
-
-    for line in candidateLines:
-        if "#" in line:
-            continue
-
-        fields = line.strip().split()
-        gid = fields[1]
-        gene = fields[0]
-        if gid not in gidList:
-            if gid not in farList:
-                farList.append(gid)
-                gidToGeneDict[gid] = gene
-
-            if gid not in countDict:
-                countDict[gid] = 0
-
-            countDict[gid] += float(fields[6])
-
-        if gid not in candidateDict:
-            candidateDict[gid] = []
-
-        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
-    totalCount = (uniqcount + splicecount) / 1000000.
-    uniqScale = uniqcount / 1000000.
+    featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+    print "got featuresDict"
      for gid in gidList:
-        gene = gidToGeneDict[gid]
+        gene = gidToGeneDict[gid]["name"]
          featureList = []
          try:
              featureList = featuresDict[gid]
@@ -194,38 +111,41 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
          elif geneLength > maxLength:
              geneLength = maxLength
  
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
          rpkm = rpm / geneLength
          if gid in candidateDict:
              for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-                cratio = cCount / (cLength / 1000.)
-                cratio = (uniqScale * cratio) / totalCount
+                kilobaseLength = cLength / 1000.
+                cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
                  if 10. * cratio < rpkm:
                      continue
  
-                countDict[gid] += cCount
-                geneLength += cLength / 1000.
-                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                gidToGeneDict[gid]["count"] += cCount
+                geneLength += kilobaseLength
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
  
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
          rpkm = rpm / geneLength
          outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
  
-    for gid in farList:
-        gene = gidToGeneDict[gid]
-        geneLength = 0
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            geneLength += cLength / 1000.
-
+    for (gid, geneLength) in farList:
          if geneLength < 0.1:
              continue
  
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            cratio = cCount / (cLength / 1000.)
-            cratio = cratio / totalCount
-            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
-        rpm = countDict[gid] / totalCount
+        gene = gidToGeneDict[gid]["name"]
+        if acceptedfilename:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                kilobaseLength = cLength / 1000.
+                cratio = (cCount / kilobaseLength) / totalCount
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
+
+        rpm = gidToGeneDict[gid]["count"] / totalCount
          rpkm = rpm / geneLength
          outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
  
@@ -235,9 +155,115 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
      except:
          pass
  
+
+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
+
+    gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+    uniqueGidList = gidToGeneDict.keys()
+    if candidatefilename:
+        candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+    else:
+        candidateDict = {}
+        geneDictUpdates = {}
+
+    farList = []
+    for gid in geneDictUpdates:
+        gidToGeneDict[gid] = geneDictUpdates[gid]
+        geneLength = 0.
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        farList.append((gid, geneLength))
+
+    return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+    gidToGeneDict = {}
+    uniquecountfile = open(uniquecountfilename)
+    for line in uniquecountfile:
+        fields = line.strip().split()
+        gid = fields[fieldID]
+        gene = fields[1]
+        gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+    uniquecountfile.close()
+    splicecount = 0.
+    if splicecountfilename != "none":
+        splicecountfile = open(splicecountfilename)
+        for line in splicecountfile:
+            fields = line.strip().split()
+            gid = fields[fieldID]
+            try:
+                gidToGeneDict[gid]["count"] += float(fields[-1])
+            except:
+                print fields
+                continue
+
+            splicecount += float(fields[-1])
+
+        splicecountfile.close()
+
+    return gidToGeneDict, splicecount
+
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+    """ Reads entries from the candiates file
+        gid entries not in gidList are returned for inclusion
+        creates and returns a dictionary keyed off gid with a single list of
+        tuples (count, length, chrom, start, stop) for each gid
+    """
+    geneDictUpdates = {}
+    candidateDict = {}
+    candidatefile = open(candidatefilename)
+    for line in candidatefile.readlines():
+        if "#" in line:
+            continue
+
+        fields = line.strip().split()
+        gid = fields[1]
+        gene = fields[0]
+        if gid not in gidList:
+            try:
+                geneDictUpdates[gid]["count"] += float(fields[6])
+            except KeyError:
+                geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+        try:
+            candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+        except KeyError:
+            candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+    candidatefile.close()
+
+    return candidateDict, geneDictUpdates
+
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+
+    if doCache:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        print "%s cached" % genome
+    else:
+        hg = Genome(genome, inRAM=True)
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    featuresDict = hg.getallGeneFeatures()
+
      if doCache:
          uncacheGeneDB(genome)
  
+    return featuresDict
+
  
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file