snapshot of 4.0a development. initial git repo commit
[erange.git] / geneLocusBins.py
1 #
2 #  geneLocusBins.py
3 #  ENRAGE
4 #
5
6 # originally from version 1.3 of geneDownstreamBins.py
7 try:
8     import psyco
9     psyco.full()
10 except:
11     pass
12
13 import sys, optparse
14 from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins
15 from cistematic.genomes import Genome
16 from cistematic.core.geneinfo import geneinfoDB
17
18 print '%s: version 2.1' % sys.argv[0]
19
20 def main(argv=None):
21     if not argv:
22         argv = sys.argv
23
24     usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
25
26     parser = optparse.OptionParser(usage=usage)
27     parser.add_option("--bins", type="int", dest="bins",
28                       help="number of bins to use [default: 10]")
29     parser.add_option("--flank", type="int", dest="flankBP",
30                       help="number of flanking BP on both upstream and downstream [default: 0]")
31     parser.add_option("--upstream", type="int", dest="upstreamBP",
32                       help="number of upstream flanking BP [default: 0]")
33     parser.add_option("--downstream", type="int", dest="downstreamBP",
34                       help="number of downstream flanking BP [default: 0]")
35     parser.add_option("--nocds", action="store_false", dest="doCDS",
36                       help="do not CDS")
37     parser.add_option("--raw", action="store_false", dest="normalizeBins",
38                       help="do not normalize results")
39     parser.add_option("--force", action="store_false", dest="limitNeighbor",
40                       help="limit neighbor region")
41     parser.add_option("--regions", dest="acceptfile")
42     parser.add_option("--cache", action="store_true", dest="doCache",
43                       help="use cache")
44     parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True)
45     (options, args) = parser.parse_args(argv[1:])
46
47     if len(args) < 3:
48         print usage
49         sys.exit(1)
50
51     genome = args[0]
52     hitfile =  args[1]
53     outfilename = args[2]
54    
55     upstreamBp = 0
56     downstreamBp = 0
57     doFlank = False
58     if options.flankBP is not None:
59         upstreamBp = options.flankBP
60         downstreamBp = options.flankBP
61         doFlank = True
62
63     if options.upstreamBP is not None:
64         upstreamBp = options.upstreamBP
65         doFlank = True
66
67     if options.downstreamBP is not None:
68         downstreamBp = options.downstreamBP
69         doFlank = True
70
71     geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
72
73
74 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None):
75     if acceptfile is None:
76         acceptDict = {}
77     else:
78         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
79     hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
80     readlen = hitRDS.getReadSize()
81     normalizationFactor = 1.0
82     if normalizeBins:
83         totalCount = len(hitRDS)
84         normalizationFactor = totalCount / 1000000.
85
86     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
87
88     hg = Genome(genome)
89     idb = geneinfoDB(cache=doCache)
90
91     geneinfoDict = idb.getallGeneInfo(genome)
92     if doFlank:
93         locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
94     else:
95         locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
96
97     gidList = hg.allGIDs()
98     gidList.sort()
99     for chrom in acceptDict:
100         for (label, start, stop, length) in acceptDict[chrom]:
101             if label not in gidList:
102                 gidList.append(label)
103
104     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
105
106     outfile = open(outfilename,'w')
107
108     for gid in gidList:
109         if 'FAR' not in gid:
110             symbol = 'LOC' + gid
111             geneinfo = ''
112             try:
113                 geneinfo = geneinfoDict[gid]
114                 symbol = geneinfo[0][0]
115             except:
116                 pass
117         else:
118             symbol = gid
119         if gid in gidBins and gid in gidLen:
120             tagCount = 0.
121             for binAmount in gidBins[gid]:
122                 tagCount += binAmount
123         outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
124         for binAmount in gidBins[gid]:
125             if normalizeBins:
126                 if tagCount == 0:
127                     tagCount = 1
128                 outfile.write('\t%.1f' % (100. * binAmount / tagCount))
129             else:
130                 outfile.write('\t%.1f' % binAmount)
131         outfile.write('\n')
132     outfile.close()
133
134
135 if __name__ == "__main__":
136     main(sys.argv)