rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial...
[erange.git] / geneStartBins.py
1 #
2 #  geneStartBins.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys
13 from commoncode import *
14 import ReadDataset
15 from cistematic.genomes import Genome
16
17
18 print "geneStartBins: version 2.1"
19 if len(sys.argv) < 4:
20     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
21     print '\n\twhere regionSize is the optional maximum region in bp\n'
22     sys.exit(1)
23
24 genome = sys.argv[1]
25 hitfile =  sys.argv[2]
26 outfilename = sys.argv[3]
27
28 standardMinDist = 3000
29 if '-max' in sys.argv:
30     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
31
32 if '-raw' in sys.argv:
33     normalize = False
34     normalizeBins = False
35 else:
36     normalize = True
37     normalizeBins = True    
38
39 doCache = False
40 if '-cache' in sys.argv:
41     doCache = True
42
43 bins = 10
44 standardMinThresh = standardMinDist / bins
45
46 hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
47 readlen = hitRDS.getReadSize()
48 normalizationFactor = 1.0
49 if normalize:
50     totalCount = len(hitRDS)
51     normalizationFactor = totalCount / 1000000.
52
53 hg = Genome(genome)
54 gidDict = {}
55 geneinfoDict = getGeneInfoDict(genome, cache=True)
56 featuresDict = hg.getallGeneFeatures()
57
58 outfile = open(outfilename,'w')
59
60 gidList = hg.allGIDs()
61 gidList.sort()
62 for gid in gidList:
63     symbol = 'LOC' + gid
64     geneinfo = ''
65     featureList = []
66     try:
67         geneinfo = geneinfoDict[gid]
68         featureList = featuresDict[gid]
69         symbol = geneinfo[0][0]
70     except:
71         print geneinfo
72
73     newfeatureList = []
74     if len(featureList) == 0:
75         continue
76
77     for (ftype, chrom, start, stop, fsense) in featureList:
78         if (start, stop) not in newfeatureList:
79             newfeatureList.append((start, stop))
80
81     if chrom not in hitDict:
82         continue
83
84     newfeatureList.sort()
85     if len(newfeatureList) < 1:
86         continue
87
88     glen = standardMinDist / 2
89     if fsense == 'F':
90         nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
91         if nextGene < glen * 2:
92             glen = nextGene / 2
93
94         if glen < 1:
95             glen = 1
96
97         gstart = newfeatureList[0][0] - glen
98         if gstart < 0:
99             gstart = 0
100
101         gstop = newfeatureList[0][0]  + glen
102     else:
103         nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
104         if nextGene < glen * 2:
105             glen = nextGene / 2
106
107         if glen < 1:
108             glen = 1
109
110         gstart = newfeatureList[-1][1] - glen
111         gstop = newfeatureList[-1][1] + glen
112     tagCount = 0
113     if glen < standardMinDist / 2:
114         continue
115
116     binList = [0] * bins
117     for read in hitDict[chrom]:
118         tagStart = read["start"] - gstart
119         sense = read["sense"]
120         weight = read["weight"]
121         if tagStart >= 2 * glen:
122             break
123
124         if tagStart > 0:
125             tagCount += weight
126             if fsense == 'R':
127                 # we are relying on python's integer division quirk
128                 binID = tagStart / standardMinThresh 
129                 binList[binID] += weight
130             else:
131                 rdist = 2 * glen - tagStart
132                 binID = rdist / standardMinThresh 
133                 binList[binID] += weight
134
135     if tagCount < 2:
136         continue
137
138     print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
139     outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
140     for binAmount in binList:
141         outfile.write('\t%d' % binAmount)
142
143     outfile.write('\n')
144
145 outfile.close()
146