-#
-# geneStartBins.py
-# ENRAGE
-#
-
try:
import psyco
psyco.full()
pass
import sys
-from commoncode import *
-import ReadDataset
+from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
+import pysam
from cistematic.genomes import Genome
-
print "geneStartBins: version 2.1"
if len(sys.argv) < 4:
print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
genome = sys.argv[1]
hitfile = sys.argv[2]
outfilename = sys.argv[3]
-
standardMinDist = 3000
if '-max' in sys.argv:
standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
bins = 10
standardMinThresh = standardMinDist / bins
-
-hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
-readlen = hitRDS.getReadSize()
+bamfile = pysam.Samfile(hitfile, "rb")
+readlen = getHeaderComment(bamfile.header, "ReadLength")
normalizationFactor = 1.0
if normalize:
- totalCount = len(hitRDS)
+ totalCount = getHeaderComment(bamfile.header, "Total")
normalizationFactor = totalCount / 1000000.
hg = Genome(genome)
gidDict = {}
geneinfoDict = getGeneInfoDict(genome, cache=True)
featuresDict = hg.getallGeneFeatures()
-
outfile = open(outfilename,'w')
-
gidList = hg.allGIDs()
gidList.sort()
+chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
for gid in gidList:
symbol = 'LOC' + gid
geneinfo = ''
if (start, stop) not in newfeatureList:
newfeatureList.append((start, stop))
- if chrom not in hitDict:
+ if chrom not in chromosomeList:
continue
newfeatureList.sort()
gstart = newfeatureList[-1][1] - glen
gstop = newfeatureList[-1][1] + glen
+
tagCount = 0
if glen < standardMinDist / 2:
continue
binList = [0] * bins
- for read in hitDict[chrom]:
- tagStart = read["start"] - gstart
- sense = read["sense"]
- weight = read["weight"]
+ for alignedread in bamfile.fetch(chrom):
+ tagStart = alignedread.pos - gstart
+ sense = getReadSense(alignedread)
+ weight = 1.0/alignedread.opt('NH')
if tagStart >= 2 * glen:
break