Release version for Erange 4.0a

[erange.git] / geneLocusBins.py
diff --git a/geneLocusBins.py b/geneLocusBins.py

index 6a1efc48e270d92a8453e8fcb6907db00185ed92..49745dc0c1361c2a8dbe4736c9c8630396fd93e4 100755 (executable)
--- a/geneLocusBins.py
+++ b/geneLocusBins.py
@@ -12,6 +12,7 @@ except:
  
  import sys
  import optparse
  
  import sys
  import optparse
+import string
  from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
  import ReadDataset
  from cistematic.genomes import Genome
  from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
  import ReadDataset
  from cistematic.genomes import Genome
@@ -92,68 +93,78 @@ def getParser(usage):
  
      return parser
  
  
      return parser
  
+
  def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                    normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                    acceptfile=None):
  
  def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                    normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                    acceptfile=None):
  
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
      if acceptfile is None:
          acceptDict = {}
      else:
          acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
      if acceptfile is None:
          acceptDict = {}
      else:
          acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
  
  
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
      readlen = hitRDS.getReadSize()
      normalizationFactor = 1.0
      if normalizeBins:
          totalCount = len(hitRDS)
          normalizationFactor = totalCount / 1000000.
  
      readlen = hitRDS.getReadSize()
      normalizationFactor = 1.0
      if normalizeBins:
          totalCount = len(hitRDS)
          normalizationFactor = totalCount / 1000000.
  
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
      (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
      (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
  
  
-    outfile = open(outfilename,'w')
  
  
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
      for gid in gidList:
      for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
              try:
                  geneinfo = geneinfoDict[gid]
                  symbol = geneinfo[0][0]
              try:
                  geneinfo = geneinfoDict[gid]
                  symbol = geneinfo[0][0]
-            except:
+            except KeyError:
                  pass
          else:
              symbol = gid
                  pass
          else:
              symbol = gid
+
          if gid in gidBins and gid in gidLen:
              tagCount = 0.
              for binAmount in gidBins[gid]:
                  tagCount += binAmount
          if gid in gidBins and gid in gidLen:
              tagCount = 0.
              for binAmount in gidBins[gid]:
                  tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
          for binAmount in gidBins[gid]:
              if normalizeBins:
          for binAmount in gidBins[gid]:
              if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
      outfile.close()
  
  
  if __name__ == "__main__":
      outfile.close()
  
  
  if __name__ == "__main__":
-    main(sys.argv)
-\ No newline at end of file
+    main(sys.argv)