first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / bamPreprocessing.py
index dd5e8612001dd31f19d49470c956205c789f016c..9f8b6fa0e607114af40d95e73707922eabcd981b 100644 (file)
@@ -15,7 +15,7 @@ import sys
 import pysam
 import optparse
 from cistematic.genomes import Genome
-from commoncode import isSpliceEntry, getFeaturesByChromDict
+from commoncode import isSpliceEntry
 
 
 def main(argv=None):
@@ -35,7 +35,7 @@ def main(argv=None):
     samfile = pysam.Samfile(BAM, "rb" )
     readMultiplicityDict = getReadMultiplicity(samfile)
 
-    counts = getReadCounts(samfile, readMultiplicityDict)
+    counts, readLength = getReadCounts(samfile, readMultiplicityDict)
     outheader = buildHeader(samfile.header, counts)
     if options.markGID:
         genome = Genome(options.genomeName, inRAM=True)
@@ -43,7 +43,6 @@ def main(argv=None):
             genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
 
         print "getting gene features...."
-        featuresByChromDict = getFeaturesByChromDict(genome)    
         outheader['CO'].append(options.genomeComment)
 
     outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
@@ -52,11 +51,13 @@ def main(argv=None):
         if options.markGID:
             chrom = samfile.getrname(alignedread.tid)
             chromNum = chrom[3:]
-            geneModelFlag = "NM"
-            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
-                if start < alignedread.pos and stop > alignedread.pos:
-                    geneModelFlag = gid
-                    continue
+            #new direct query
+            geneFeatures = genome.getFeaturesIntersecting(chromNum, alignedread.pos, readLength)
+            try:
+                (name, version, chromosome, start, stop, orientation, atype) = geneFeatures[0]
+                geneModelFlag = name
+            except IndexError:
+                geneModelFlag = "NM"
 
         ID = getReadID(alignedread)
         try:
@@ -156,7 +157,7 @@ def getReadCounts(samfile, readMultiplicityDict):
               "ReadLength\t%d" % readLength
     ]
 
-    return counts
+    return counts, readLength
 
 
 def buildHeader(templateheader, commentList):