snapshot of 4.0a development. initial git repo commit
[erange.git] / geneUpstreamBins.py
1 #
2 #  geneUpstreamBins.py
3 #  ENRAGE
4 #
5 # originally from version 1.3 of geneDownstreamBins.py
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys, optparse
13 from commoncode import readDataset
14 from cistematic.genomes import Genome
15 from cistematic.core.geneinfo import geneinfoDB
16
17 print "%prog: version 2.0"
18
19 def main(argv=None):
20     if not argv:
21         argv = sys.argv
22
23     usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
24
25     parser = optparse.OptionParser(usage=usage)
26     parser.add_option("--raw", action="store_false", dest="normalize",
27                        help="maximum region in bp")
28     parser.add_option("--max", type="int", dest="standardMinDist")
29     parser.add_option("--cache", action="store_true", dest="doCache")
30     parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False)
31     (options, args) = parser.parse_args(argv[1:])
32
33     if len(args) < 3:
34         print usage
35         sys.exit(1)
36
37     genome = args[0]
38     hitfile =  args[1]
39     outfilename = args[3]
40
41     geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
42
43
44 def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
45     bins = 10
46     standardMinThresh = standardMinDist / bins
47
48     hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
49     normalizationFactor = 1.0
50     if normalize:
51         totalCount = len(hitRDS)
52         normalizationFactor = totalCount / 1000000.
53
54     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
55
56     hg = Genome(genome)
57     idb = geneinfoDB(cache=True)
58
59     geneinfoDict = idb.getallGeneInfo(genome)
60     featuresDict = hg.getallGeneFeatures()
61
62     outfile = open(outfilename,"w")
63
64     gidList = hg.allGIDs()
65     gidList.sort()
66     for gid in gidList:
67         symbol = "LOC" + gid
68         geneinfo = ""
69         featureList = []
70         try:
71             geneinfo = geneinfoDict[gid]
72             featureList = featuresDict[gid]
73             symbol = geneinfo[0][0]
74         except:
75             print geneinfo
76
77         newfeatureList = []
78         if len(featureList) == 0:
79             continue
80
81         for (ftype, chrom, start, stop, fsense) in featureList:
82             if (start, stop) not in newfeatureList:
83                 newfeatureList.append((start, stop))
84
85         if chrom not in hitDict:
86             continue
87
88         newfeatureList.sort()
89         if len(newfeatureList) < 1:
90             continue
91
92         glen = standardMinDist
93         if fsense == "F":
94             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
95             if nextGene < glen * 2:
96                 glen = nextGene / 2
97
98             if glen < 1:
99                 glen = 1
100
101             gstart = newfeatureList[0][0] - glen
102             if gstart < 0:
103                 gstart = 0
104
105         else:
106             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
107             if nextGene < glen * 2:
108                 glen = nextGene / 2
109
110             if glen < 1:
111                 glen = 1
112
113             gstart = newfeatureList[-1][1]
114
115         tagCount = 0
116         if glen < standardMinDist:
117             continue
118
119         binList = [0] * bins
120         for (tagStart, sense, weight) in hitDict[chrom]:
121             tagStart -= gstart
122             if tagStart >= glen:
123                 break
124
125             if tagStart > 0:
126                 tagCount += weight
127                 if fsense == "R":
128                     # we are relying on python's integer division quirk
129                     binID = tagStart / standardMinThresh 
130                     binList[binID] += weight
131                 else:
132                     rdist = glen - tagStart
133                     binID = rdist / standardMinThresh 
134                     binList[binID] += weight
135
136         if tagCount < 2:
137             continue
138
139         print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
140         outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
141         for binAmount in binList:
142             outfile.write("\t%d" % binAmount)
143         outfile.write("\n")
144
145     outfile.close()
146
147
148 if __name__ == "__main__":
149     main(sys.argv)