erange version 4.0a dev release
[erange.git] / geneDownstreamBins.py
1 #
2 #  geneDownstreamBins.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 # originally from version 1.3 of geneDnaDownstreamCounts.py
13 import sys
14 import optparse
15 import ReadDataset
16 from cistematic.genomes import Genome
17 from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
18
19 print "geneDownstreamBins: version 2.1"
20
21 def main(argv=None):
22     if not argv:
23         argv = sys.argv
24
25     usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
26
27     parser = makeParser(usage)
28     (options, args) = parser.parse_args(argv[1:])
29
30     if len(args) < 3:
31         print usage
32         sys.exit(1)
33
34     genome = args[0]
35     hitfile =  args[1]
36     outfilename = args[2]
37
38     geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
39
40
41 def makeParser(usage=""):
42     parser = optparse.OptionParser(usage=usage)
43     parser.add_option("--max", type="int", dest="standardMinDist",
44                       help="maximum region in bp")
45
46     configParser = getConfigParser()
47     section = "geneDownstreamBins"
48     standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
49
50     parser.set_defaults(standardMinDist=standardMinDist)
51
52     return parser
53
54
55 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
56     bins = 10
57     standardMinThresh = standardMinDist / bins
58
59     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
60     normalizationFactor = 1.0
61     if normalize:
62         hitDictSize = len(hitRDS)
63         normalizationFactor = hitDictSize / 1000000.
64
65     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
66     geneinfoDict = getGeneInfoDict(genome, cache=True)
67     hg = Genome(genome)
68     featuresDict = hg.getallGeneFeatures()
69
70     outfile = open(outfilename, "w")
71
72     gidList = hg.allGIDs()
73     gidList.sort()
74     for gid in gidList:
75         symbol = "LOC" + gid
76         geneinfo = ""
77         featureList = []
78         try:
79             geneinfo = geneinfoDict[gid]
80             featureList = featuresDict[gid]
81             symbol = geneinfo[0][0]
82         except:
83             print gid
84
85         if len(featureList) == 0:
86             continue
87
88         newfeatureList = []
89         for (ftype, chrom, start, stop, fsense) in featureList:
90             if (start, stop) not in newfeatureList:
91                 newfeatureList.append((start, stop))
92
93         if chrom not in hitDict:
94             continue
95
96         newfeatureList.sort()
97         if len(newfeatureList) < 1:
98             continue
99
100         glen = standardMinDist
101         if fsense == "F":
102             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
103             if nextGene < glen * 2:
104                 glen = nextGene / 2
105
106             if glen < 1:
107                 glen = 1
108
109             gstart = newfeatureList[-1][1]
110         else:
111             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
112             if nextGene < glen * 2:
113                 glen = nextGene / 2
114
115             if glen < 1:
116                 glen = 1
117
118             gstart = newfeatureList[0][0] - glen
119             if gstart < 0:
120                 gstart = 0
121
122         tagCount = 0
123         if glen < standardMinDist:
124             continue
125
126         binList = [0.] * bins
127         for read in hitDict[chrom]:
128             tagStart = read["start"]
129             weight = read["weight"]
130             tagStart -= gstart
131             if tagStart >= glen:
132                 break
133
134             if tagStart > 0:
135                 tagCount += weight
136                 if fsense == "F":
137                     # we are relying on python's integer division quirk
138                     binID = tagStart / standardMinThresh 
139                     binList[binID] += weight
140                 else:
141                     rdist = glen - tagStart
142                     binID = rdist / standardMinThresh 
143                     binList[binID] += weight
144
145         if tagCount < 2:
146             continue
147
148         tagCount *= normalizationFactor
149         print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
150         outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
151         for binAmount in binList:
152             outfile.write("\t%.2f" % binAmount)
153
154         outfile.write("\n")
155
156     outfile.close()
157
158
159 if __name__ == "__main__":
160     main(sys.argv)