first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneLocusBins.py
1 #
2 #  geneLocusBins.py
3 #  ENRAGE
4 #
5
6 # originally from version 1.3 of geneDownstreamBins.py
7 try:
8     import psyco
9     psyco.full()
10 except:
11     pass
12
13 import sys
14 import optparse
15 import string
16 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
17 import ReadDataset
18 from cistematic.genomes import Genome
19 from commoncode import getGeneInfoDict
20
21 print "geneLocusBins: version 2.2"
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
28
29     parser = getParser(usage)
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(args) < 3:
33         print usage
34         sys.exit(1)
35
36     genome = args[0]
37     hitfile =  args[1]
38     outfilename = args[2]
39    
40     upstreamBp = 0
41     downstreamBp = 0
42     doFlank = False
43     if options.flankBP is not None:
44         upstreamBp = options.flankBP
45         downstreamBp = options.flankBP
46         doFlank = True
47
48     if options.upstreamBP is not None:
49         upstreamBp = options.upstreamBP
50         doFlank = True
51
52     if options.downstreamBP is not None:
53         downstreamBp = options.downstreamBP
54         doFlank = True
55
56     geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
57
58
59 def getParser(usage):
60     parser = optparse.OptionParser(usage=usage)
61     parser.add_option("--bins", type="int", dest="bins",
62                       help="number of bins to use [default: 10]")
63     parser.add_option("--flank", type="int", dest="flankBP",
64                       help="number of flanking BP on both upstream and downstream [default: 0]")
65     parser.add_option("--upstream", type="int", dest="upstreamBP",
66                       help="number of upstream flanking BP [default: 0]")
67     parser.add_option("--downstream", type="int", dest="downstreamBP",
68                       help="number of downstream flanking BP [default: 0]")
69     parser.add_option("--nocds", action="store_false", dest="doCDS",
70                       help="do not CDS")
71     parser.add_option("--raw", action="store_false", dest="normalizeBins",
72                       help="do not normalize results")
73     parser.add_option("--force", action="store_false", dest="limitNeighbor",
74                       help="limit neighbor region")
75     parser.add_option("--regions", dest="acceptfile")
76     parser.add_option("--cache", action="store_true", dest="doCache",
77                       help="use cache")
78
79     configParser = getConfigParser()
80     section = "geneLocusBins"
81     normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True)
82     doCache = getConfigBoolOption(configParser, section, "doCache", False)
83     bins = getConfigIntOption(configParser, section, "bins", 10)
84     flankBP = getConfigOption(configParser, section, "flankBP", None)
85     upstreamBP = getConfigOption(configParser, section, "upstreamBP", None)
86     downstreamBP = getConfigOption(configParser, section, "downstreamBP", None)
87     doCDS = getConfigBoolOption(configParser, section, "doCDS", True)
88     limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True)
89
90     parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP,
91                         upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS,
92                         limitNeighbor=limitNeighbor)
93
94     return parser
95
96
97 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
98                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
99                   acceptfile=None):
100
101
102     hg = Genome(genome)
103     gidList = hg.allGIDs()
104     gidList.sort()
105     if acceptfile is None:
106         acceptDict = {}
107     else:
108         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
109         for chrom in acceptDict:
110             for region in acceptDict[chrom]:
111                 if region.label not in gidList:
112                     gidList.append(region.label)
113
114     if doFlank:
115         locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
116     else:
117         locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
118
119     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
120     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
121     readlen = hitRDS.getReadSize()
122     normalizationFactor = 1.0
123     if normalizeBins:
124         totalCount = len(hitRDS)
125         normalizationFactor = totalCount / 1000000.
126
127     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
128     geneinfoDict = getGeneInfoDict(genome, cache=doCache)
129     writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
130
131
132 def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
133     outfile = open(outfilename, "w")
134     for gid in gidList:
135         if "FAR" not in gid:
136             symbol = "LOC%s" % gid
137             geneinfo = ""
138             try:
139                 geneinfo = geneinfoDict[gid]
140                 symbol = geneinfo[0][0]
141             except (KeyError, IndexError):
142                 pass
143         else:
144             symbol = gid
145
146         if gid in gidBins and gid in gidLen:
147             tagCount = 0.
148             for binAmount in gidBins[gid]:
149                 tagCount += binAmount
150
151         outputList = [gid, symbol, tagCount, gidLen[gid]]
152         for binAmount in gidBins[gid]:
153             if normalizeBins:
154                 try:
155                     normalizedValue = 100. * binAmount / tagCount
156                 except ZeroDivisionError:
157                     #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
158                     normalizedValue = 100. * binAmount
159
160                 binAmount = normalizedValue
161
162             outputList.append("%.1f" % binAmount)
163
164         outLine = string.join(outputList, "\t")
165         outfile.write("%s\n" % outLine)
166
167     outfile.close()
168
169
170 if __name__ == "__main__":
171     main(sys.argv)