rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial...
[erange.git] / geneLocusPeaks.py
1 #
2 #  geneLocusPeaks.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys
13 import optparse
14 from commoncode import getMergedRegions, findPeak, getLocusByChromDict
15 import ReadDataset
16 from cistematic.genomes import Genome
17 from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
18
19
20 print "geneLocusPeaks: version 2.1"
21
22 def main(argv=None):
23     if not argv:
24         argv = sys.argv
25
26     usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
27
28     parser = makeParser(usage)
29     (options, args) = parser.parse_args(argv[1:])
30
31     if len(args) < 3:
32         print usage
33         print "\twhere upstream and downstream are in bp and and optional"
34         sys.exit(1)
35
36     genome = args[0]
37     hitfile =  args[1]
38     outfilename = args[2]
39
40     geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
41
42
43 def makeParser(usage=""):
44     parser = optparse.OptionParser(usage=usage)
45     parser.add_option("--up", type="int", dest="upstream")
46     parser.add_option("--down", type="int", dest="downstream")
47     parser.add_option("--regions", dest="acceptfile")
48     parser.add_option("--raw", action="store_false", dest="normalize")
49     parser.add_option("--cache", action="store_true", dest="doCache")
50
51     configParser = getConfigParser()
52     section = "geneLocusPeaks"
53     upstream = getConfigIntOption(configParser, section, "upstream", 0)
54     downstream = getConfigIntOption(configParser, section, "downstream", 0)
55     acceptfile = getConfigOption(configParser, section, "acceptfile", "")
56     normalize = getConfigBoolOption(configParser, section, "normalize", True)
57     doCache = getConfigBoolOption(configParser, section, "doCache", False)
58
59     parser.set_defaults(upstream=upstream, downstream=downstream, acceptfile=acceptfile, normalize=normalize, doCache=doCache)
60
61     return parser
62
63
64 def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
65     acceptDict = {}
66
67     if acceptfile:
68         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
69
70     print "upstream = %d downstream = %d" % (upstream, downstream)
71
72     hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
73     readlen = hitRDS.getReadSize()
74     normalizationFactor = 1.0
75     if normalize:
76         totalCount = len(hitRDS)
77         normalizationFactor = totalCount / 1000000.
78
79     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
80
81     hg = Genome(genome)
82     gidCount = {}
83     gidPos = {}
84     geneinfoDict = getGeneInfoDict(genome, cache=True)
85     locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
86
87     gidList = hg.allGIDs()
88     gidList.sort()
89     for chrom in acceptDict:
90         for region in acceptDict[chrom]:
91             if region.label not in gidList:
92                 gidList.append(region.label)
93
94     for gid in gidList:
95         gidCount[gid] = 0
96
97     for chrom in hitDict:
98         if chrom not in locusByChromDict:
99             continue
100
101         print chrom
102         for (start, stop, gid, glen) in locusByChromDict[chrom]:
103             gidCount[gid] = 0.
104             peak = findPeak(hitDict[chrom], start, glen, readlen)
105             if len(peak.topPos) > 0:
106                 gidCount[gid] = peak.smoothArray[peak.topPos[0]]
107                 gidPos[gid] = (chrom, start + peak.topPos[0])
108             else:
109                 gidPos[gid] = (chrom, start)
110
111     outfile = open(outfilename, "w")
112
113     for gid in gidList:
114         if "FAR" not in gid:
115             symbol = "LOC" + gid
116             geneinfo = ""
117             try:
118                 geneinfo = geneinfoDict[gid]
119                 symbol = geneinfo[0][0]
120             except:
121                 pass
122         else:
123             symbol = gid
124
125         if gid in gidCount and gid in gidPos:
126             (chrom, pos) = gidPos[gid]
127             outfile.write("%s\t%s\tchr%s\t%d\t%.2f\n" % (gid, symbol, chrom, pos, gidCount[gid]))
128
129     outfile.close()
130
131
132 if __name__ == "__main__":
133     main(sys.argv)