first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / geneLocusBins.py
diff --git a/geneLocusBins.py b/geneLocusBins.py

index 6a1efc48e270d92a8453e8fcb6907db00185ed92..14cb2b5ee2b5617fba45f1a69ef94386784c0b11 100755 (executable)
--- a/geneLocusBins.py
+++ b/geneLocusBins.py
@@ -12,6 +12,7 @@ except:
  
  import sys
  import optparse
  
  import sys
  import optparse
+import string
  from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
  import ReadDataset
  from cistematic.genomes import Genome
  from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
  import ReadDataset
  from cistematic.genomes import Genome
@@ -92,66 +93,77 @@ def getParser(usage):
  
      return parser
  
  
      return parser
  
+
  def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                    normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                    acceptfile=None):
  
  def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                    normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                    acceptfile=None):
  
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
      if acceptfile is None:
          acceptDict = {}
      else:
          acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
      if acceptfile is None:
          acceptDict = {}
      else:
          acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
  
  
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
      readlen = hitRDS.getReadSize()
      normalizationFactor = 1.0
      if normalizeBins:
          totalCount = len(hitRDS)
          normalizationFactor = totalCount / 1000000.
  
      readlen = hitRDS.getReadSize()
      normalizationFactor = 1.0
      if normalizeBins:
          totalCount = len(hitRDS)
          normalizationFactor = totalCount / 1000000.
  
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
      (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
      (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
  
  
-    outfile = open(outfilename,'w')
  
  
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
      for gid in gidList:
      for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
              try:
                  geneinfo = geneinfoDict[gid]
                  symbol = geneinfo[0][0]
              try:
                  geneinfo = geneinfoDict[gid]
                  symbol = geneinfo[0][0]
-            except:
+            except (KeyError, IndexError):
                  pass
          else:
              symbol = gid
                  pass
          else:
              symbol = gid
+
          if gid in gidBins and gid in gidLen:
              tagCount = 0.
              for binAmount in gidBins[gid]:
                  tagCount += binAmount
          if gid in gidBins and gid in gidLen:
              tagCount = 0.
              for binAmount in gidBins[gid]:
                  tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
          for binAmount in gidBins[gid]:
              if normalizeBins:
          for binAmount in gidBins[gid]:
              if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
      outfile.close()
  
  
      outfile.close()