first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneUpstreamBins.py
1 #
2 #  geneUpstreamBins.py
3 #  ENRAGE
4 #
5 # originally from version 1.3 of geneDownstreamBins.py
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys
13 import optparse
14 import ReadDataset
15 from cistematic.genomes import Genome
16 from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
17
18 print "geneUpstreamBins: version 2.1"
19
20 def main(argv=None):
21     if not argv:
22         argv = sys.argv
23
24     usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
25
26     parser = makeParser(usage)
27     (options, args) = parser.parse_args(argv[1:])
28
29     if len(args) < 3:
30         print usage
31         sys.exit(1)
32
33     genome = args[0]
34     hitfile =  args[1]
35     outfilename = args[3]
36
37     geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
38
39
40 def makeParser(usage=""):
41     parser = optparse.OptionParser(usage=usage)
42     parser.add_option("--raw", action="store_false", dest="normalize",
43                        help="maximum region in bp")
44     parser.add_option("--max", type="int", dest="standardMinDist")
45     parser.add_option("--cache", action="store_true", dest="doCache")
46
47     configParser = getConfigParser()
48     section = "geneUpstreamBins"
49     standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
50     normalize = getConfigBoolOption(configParser, section, "normalize", True)
51     doCache = getConfigBoolOption(configParser, section, "doCache", False)
52
53     parser.set_defaults(standardMinDist=standardMinDist, normalize=normalize, doCache=doCache)
54
55     return parser
56
57
58 def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
59     bins = 10
60     standardMinThresh = standardMinDist / bins
61
62     hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
63     normalizationFactor = 1.0
64     if normalize:
65         totalCount = len(hitRDS)
66         normalizationFactor = totalCount / 1000000.
67
68     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
69
70     hg = Genome(genome)
71     geneinfoDict = getGeneInfoDict(genome, cache=True)
72     featuresDict = hg.getallGeneFeatures()
73
74     outfile = open(outfilename,"w")
75
76     gidList = hg.allGIDs()
77     gidList.sort()
78     for gid in gidList:
79         symbol = "LOC" + gid
80         geneinfo = ""
81         featureList = []
82         try:
83             geneinfo = geneinfoDict[gid]
84             featureList = featuresDict[gid]
85             symbol = geneinfo[0][0]
86         except:
87             print geneinfo
88
89         newfeatureList = []
90         if len(featureList) == 0:
91             continue
92
93         for (ftype, chrom, start, stop, fsense) in featureList:
94             if (start, stop) not in newfeatureList:
95                 newfeatureList.append((start, stop))
96
97         if chrom not in hitDict:
98             continue
99
100         newfeatureList.sort()
101         if len(newfeatureList) < 1:
102             continue
103
104         glen = standardMinDist
105         if fsense == "F":
106             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
107             if nextGene < glen * 2:
108                 glen = nextGene / 2
109
110             if glen < 1:
111                 glen = 1
112
113             gstart = newfeatureList[0][0] - glen
114             if gstart < 0:
115                 gstart = 0
116
117         else:
118             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
119             if nextGene < glen * 2:
120                 glen = nextGene / 2
121
122             if glen < 1:
123                 glen = 1
124
125             gstart = newfeatureList[-1][1]
126
127         tagCount = 0
128         if glen < standardMinDist:
129             continue
130
131         binList = [0] * bins
132         for read in hitDict[chrom]:
133             tagStart = read["start"]
134             weight = read["weight"]
135             tagStart -= gstart
136             if tagStart >= glen:
137                 break
138
139             if tagStart > 0:
140                 tagCount += weight
141                 if fsense == "R":
142                     # we are relying on python's integer division quirk
143                     binID = tagStart / standardMinThresh 
144                     binList[binID] += weight
145                 else:
146                     rdist = glen - tagStart
147                     binID = rdist / standardMinThresh 
148                     binList[binID] += weight
149
150         if tagCount < 2:
151             continue
152
153         print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
154         outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
155         for binAmount in binList:
156             outfile.write("\t%d" % binAmount)
157         outfile.write("\n")
158
159     outfile.close()
160
161
162 if __name__ == "__main__":
163     main(sys.argv)