convert standard analysis pipelines to use bam format natively

[erange.git] / geneStartBins.py
diff --git a/geneStartBins.py b/geneStartBins.py

index d929d41a89a2da712b1ee953770e0b01bb2f96bb..3c5183730c288f9c61e29f41fb1df69c3e0f992f 100755 (executable)
--- a/geneStartBins.py
+++ b/geneStartBins.py
@@ -1,8 +1,3 @@
-#
-#  geneStartBins.py
-#  ENRAGE
-#
-
  try:
      import psyco
      psyco.full()
@@ -10,11 +5,10 @@ except:
      pass
  
  import sys
-from commoncode import *
-import ReadDataset
+from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
+import pysam
  from cistematic.genomes import Genome
  
-
  print "geneStartBins: version 2.1"
  if len(sys.argv) < 4:
      print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
@@ -24,7 +18,6 @@ if len(sys.argv) < 4:
  genome = sys.argv[1]
  hitfile =  sys.argv[2]
  outfilename = sys.argv[3]
-
  standardMinDist = 3000
  if '-max' in sys.argv:
      standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
@@ -42,23 +35,21 @@ if '-cache' in sys.argv:
  
  bins = 10
  standardMinThresh = standardMinDist / bins
-
-hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
-readlen = hitRDS.getReadSize()
+bamfile = pysam.Samfile(hitfile, "rb")
+readlen = getHeaderComment(bamfile.header, "ReadLength")
  normalizationFactor = 1.0
  if normalize:
-    totalCount = len(hitRDS)
+    totalCount = getHeaderComment(bamfile.header, "Total")
      normalizationFactor = totalCount / 1000000.
  
  hg = Genome(genome)
  gidDict = {}
  geneinfoDict = getGeneInfoDict(genome, cache=True)
  featuresDict = hg.getallGeneFeatures()
-
  outfile = open(outfilename,'w')
-
  gidList = hg.allGIDs()
  gidList.sort()
+chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
  for gid in gidList:
      symbol = 'LOC' + gid
      geneinfo = ''
@@ -78,7 +69,7 @@ for gid in gidList:
          if (start, stop) not in newfeatureList:
              newfeatureList.append((start, stop))
  
-    if chrom not in hitDict:
+    if chrom not in chromosomeList:
          continue
  
      newfeatureList.sort()
@@ -109,15 +100,16 @@ for gid in gidList:
  
          gstart = newfeatureList[-1][1] - glen
          gstop = newfeatureList[-1][1] + glen
+
      tagCount = 0
      if glen < standardMinDist / 2:
          continue
  
      binList = [0] * bins
-    for read in hitDict[chrom]:
-        tagStart = read["start"] - gstart
-        sense = read["sense"]
-        weight = read["weight"]
+    for alignedread in bamfile.fetch(chrom):
+        tagStart = alignedread.pos - gstart
+        sense = getReadSense(alignedread)
+        weight = 1.0/alignedread.opt('NH')
          if tagStart >= 2 * glen:
              break