first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / bamPreprocessing.py
diff --git a/bamPreprocessing.py b/bamPreprocessing.py

index dd5e8612001dd31f19d49470c956205c789f016c..9f8b6fa0e607114af40d95e73707922eabcd981b 100644 (file)
--- a/bamPreprocessing.py
+++ b/bamPreprocessing.py
@@ -15,7 +15,7 @@ import sys
  import pysam
  import optparse
  from cistematic.genomes import Genome
-from commoncode import isSpliceEntry, getFeaturesByChromDict
+from commoncode import isSpliceEntry
  
  
  def main(argv=None):
@@ -35,7 +35,7 @@ def main(argv=None):
      samfile = pysam.Samfile(BAM, "rb" )
      readMultiplicityDict = getReadMultiplicity(samfile)
  
-    counts = getReadCounts(samfile, readMultiplicityDict)
+    counts, readLength = getReadCounts(samfile, readMultiplicityDict)
      outheader = buildHeader(samfile.header, counts)
      if options.markGID:
          genome = Genome(options.genomeName, inRAM=True)
@@ -43,7 +43,6 @@ def main(argv=None):
              genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
  
          print "getting gene features...."
-        featuresByChromDict = getFeaturesByChromDict(genome)    
          outheader['CO'].append(options.genomeComment)
  
      outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
@@ -52,11 +51,13 @@ def main(argv=None):
          if options.markGID:
              chrom = samfile.getrname(alignedread.tid)
              chromNum = chrom[3:]
-            geneModelFlag = "NM"
-            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
-                if start < alignedread.pos and stop > alignedread.pos:
-                    geneModelFlag = gid
-                    continue
+            #new direct query
+            geneFeatures = genome.getFeaturesIntersecting(chromNum, alignedread.pos, readLength)
+            try:
+                (name, version, chromosome, start, stop, orientation, atype) = geneFeatures[0]
+                geneModelFlag = name
+            except IndexError:
+                geneModelFlag = "NM"
  
          ID = getReadID(alignedread)
          try:
@@ -156,7 +157,7 @@ def getReadCounts(samfile, readMultiplicityDict):
                "ReadLength\t%d" % readLength
      ]
  
-    return counts
+    return counts, readLength
  
  
  def buildHeader(templateheader, commentList):