12 # originally from version 1.3 of geneDownstreamBins.py
13 from commoncode import *
14 from cistematic.genomes import Genome
15 from cistematic.core.geneinfo import geneinfoDB
18 print '%s: version 2.0' % sys.argv[0]
20 print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
21 print '\n\twhere regionSize is the optional maximum region in bp\n'
26 outfilename = sys.argv[3]
28 standardMinDist = 3000
29 if '-max' in sys.argv:
30 standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
32 if '-raw' in sys.argv:
40 if '-cache' in sys.argv:
44 standardMinThresh = standardMinDist / bins
46 hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
47 readlen = hitRDS.getReadSize()
48 normalizationFactor = 1.0
50 totalCount = len(hitRDS)
51 normalizationFactor = totalCount / 1000000.
54 idb = geneinfoDB(cache=True)
57 geneinfoDict = idb.getallGeneInfo(genome)
58 featuresDict = hg.getallGeneFeatures()
60 #infile = open(infilename)
61 outfile = open(outfilename,'w')
63 gidList = hg.allGIDs()
70 geneinfo = geneinfoDict[gid]
71 featureList = featuresDict[gid]
72 symbol = geneinfo[0][0]
76 if len(featureList) == 0:
78 for (ftype, chrom, start, stop, fsense) in featureList:
79 if (start, stop) not in newfeatureList:
80 newfeatureList.append((start, stop))
81 if chrom not in hitDict:
84 if len(newfeatureList) < 1:
85 #print '%s %s %d' % (gid, symbol, -1)
86 #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1))
88 glen = standardMinDist / 2
90 nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
91 if nextGene < glen * 2:
95 gstart = newfeatureList[0][0] - glen
98 gstop = newfeatureList[0][0] + glen
100 nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
101 if nextGene < glen * 2:
105 gstart = newfeatureList[-1][1] - glen
106 gstop = newfeatureList[-1][1] + glen
108 if glen < standardMinDist / 2:
111 for (tagStart, sense, weight) in hitDict[chrom]:
113 if tagStart >= 2 * glen:
118 # we are relying on python's integer division quirk
119 binID = tagStart / standardMinThresh
120 binList[binID] += weight
122 rdist = 2 * glen - tagStart
123 binID = rdist / standardMinThresh
124 binList[binID] += weight
127 print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
128 outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
129 for binAmount in binList:
130 outfile.write('\t%d' % binAmount)