8 from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
10 from cistematic.genomes import Genome
12 print "geneStartBins: version 2.1"
14 print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
15 print '\n\twhere regionSize is the optional maximum region in bp\n'
20 outfilename = sys.argv[3]
21 standardMinDist = 3000
22 if '-max' in sys.argv:
23 standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
25 if '-raw' in sys.argv:
33 if '-cache' in sys.argv:
37 standardMinThresh = standardMinDist / bins
38 bamfile = pysam.Samfile(hitfile, "rb")
39 readlen = getHeaderComment(bamfile.header, "ReadLength")
40 normalizationFactor = 1.0
42 totalCount = getHeaderComment(bamfile.header, "Total")
43 normalizationFactor = totalCount / 1000000.
47 geneinfoDict = getGeneInfoDict(genome, cache=True)
48 featuresDict = hg.getallGeneFeatures()
49 outfile = open(outfilename,'w')
50 gidList = hg.allGIDs()
52 chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
58 geneinfo = geneinfoDict[gid]
59 featureList = featuresDict[gid]
60 symbol = geneinfo[0][0]
65 if len(featureList) == 0:
68 for (ftype, chrom, start, stop, fsense) in featureList:
69 if (start, stop) not in newfeatureList:
70 newfeatureList.append((start, stop))
72 if chrom not in chromosomeList:
76 if len(newfeatureList) < 1:
79 glen = standardMinDist / 2
81 nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
82 if nextGene < glen * 2:
88 gstart = newfeatureList[0][0] - glen
92 gstop = newfeatureList[0][0] + glen
94 nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
95 if nextGene < glen * 2:
101 gstart = newfeatureList[-1][1] - glen
102 gstop = newfeatureList[-1][1] + glen
105 if glen < standardMinDist / 2:
109 for alignedread in bamfile.fetch(chrom):
110 tagStart = alignedread.pos - gstart
111 sense = getReadSense(alignedread)
112 weight = 1.0/alignedread.opt('NH')
113 if tagStart >= 2 * glen:
119 # we are relying on python's integer division quirk
120 binID = tagStart / standardMinThresh
121 binList[binID] += weight
123 rdist = 2 * glen - tagStart
124 binID = rdist / standardMinThresh
125 binList[binID] += weight
130 print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
131 outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
132 for binAmount in binList:
133 outfile.write('\t%d' % binAmount)