snapshot of 4.0a development. initial git repo commit
[erange.git] / normalizeExpandedExonic.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys, optparse
8 from commoncode import readDataset
9 from cistematic.genomes import Genome
10 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
11
12 print "%prog: version 5.6"
13
14
15 def main(argv=None):
16     if not argv:
17         argv = sys.argv
18
19     usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
20
21     parser = optparse.OptionParser(usage=usage)
22     parser.add_option("--gidField", type="int", dest="fieldID")
23     parser.add_option("--maxLength", type="float", dest="maxLength")
24     parser.add_option("--cache", action="store_true", dest="doCache")
25     parser.add_option("--models", dest="extendGenome")
26     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
27     parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
28                         replaceModels=False)
29
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(sys.argv) < 6:
33         print usage
34         print "\twhere splicecountfile can be set to 'none' to not count splices\n"
35         sys.exit(1)
36
37     genome = args[0]
38     hitfile = args[1]
39     uniquecountfile = args[2]
40     splicecountfile = args[3]
41     outfile = args[4]
42
43     candidateLines = []
44     acceptedfilename = ""
45     if len(args) > 5:
46         try:
47             candidatefile = open(args[5])
48             candidateLines = candidatefile.readlines()
49             candidatefile.close()
50             acceptedfilename = args[6]
51         except IndexError:
52             pass
53
54     normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
55                             candidateLines, acceptedfilename, options.fieldID,
56                             options.maxLength, options.doCache, options.extendGenome,
57                             options.replaceModels)
58
59
60 def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
61                             outfilename, candidateLines=[], acceptedfilename="",
62                             fieldID=0, maxLength=1000000000., doCache=False,
63                             extendGenome="", replaceModels=False):
64
65     uniquecountfile = open(uniquecountfilename)
66
67     if acceptedfilename:
68         acceptedfile = open(acceptedfilename, "w")
69
70     dosplicecount = False
71     if splicecountfilename != "none":
72         dosplicecount = True
73         splicecountfile = open(splicecountfilename)
74
75     if extendGenome:
76         if replaceModels:
77             print "will replace gene models with %s" % extendGenome
78         else:
79             print "will extend gene models with %s" % extendGenome
80
81     if doCache:
82         cacheGeneDB(genome)
83         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
84         print "%s cached" % genome
85     else:
86         hg = Genome(genome, inRAM=True)
87
88     if extendGenome != "":
89         hg.extendFeatures(extendGenome, replace=replaceModels)
90
91     RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)    
92     uniqcount = RDS.getUniqsCount()
93     print "%d unique reads" % uniqcount
94
95     splicecount = 0
96     countDict = {}
97     gidList = []
98     farList = []
99     candidateDict = {}
100
101     gidToGeneDict = {}
102
103     featuresDict = hg.getallGeneFeatures()
104     print "got featuresDict"
105
106     outfile = open(outfilename, "w")
107
108     for line in uniquecountfile:
109         fields = line.strip().split()
110         gid = fields[fieldID]
111         gene = fields[1]
112         countDict[gid] = float(fields[-1])
113         gidList.append(gid)
114         gidToGeneDict[gid] = gene
115
116     uniquecountfile.close()
117
118     if dosplicecount:
119         for line in splicecountfile:
120             fields = line.strip().split()
121             gid = fields[fieldID]
122             try:
123                 countDict[gid] += float(fields[-1])
124             except:
125                 print fields
126                 continue
127
128             splicecount += float(fields[-1])
129
130         splicecountfile.close()
131
132     for line in candidateLines:
133         if "#" in line:
134             continue
135
136         fields = line.strip().split()
137         gid = fields[1]
138         gene = fields[0]
139         if gid not in gidList:
140             if gid not in farList:
141                 farList.append(gid)
142                 gidToGeneDict[gid] = gene
143
144             if gid not in countDict:
145                 countDict[gid] = 0
146
147             countDict[gid] += float(fields[6])
148
149         if gid not in candidateDict:
150             candidateDict[gid] = []
151
152         candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
153
154     totalCount = (uniqcount + splicecount) / 1000000.
155     uniqScale = uniqcount / 1000000.
156     for gid in gidList:
157         gene = gidToGeneDict[gid]
158         featureList = []
159         try:
160             featureList = featuresDict[gid]
161         except:
162             try:
163                 featureList = featuresDict[gene]
164             except:
165                 print gene, gid
166
167         newfeatureList = []
168         geneLength = 0.
169         for (ftype, chrom, start, stop, sense) in featureList:
170             if (start, stop) not in newfeatureList:
171                 newfeatureList.append((start, stop))
172                 geneLength += (abs(start - stop) + 1.) / 1000.
173
174         if geneLength < 0.1:
175             geneLength = 0.1
176         elif geneLength > maxLength:
177             geneLength = maxLength
178
179         rpm = countDict[gid] / totalCount
180         rpkm = rpm / geneLength
181         if gid in candidateDict:
182             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
183                 cratio = cCount / (cLength / 1000.)
184                 cratio = (uniqScale * cratio) / totalCount
185                 if 10. * cratio < rpkm:
186                     continue
187
188                 countDict[gid] += cCount
189                 geneLength += cLength / 1000.
190                 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
191
192         rpm = countDict[gid] / totalCount
193         rpkm = rpm / geneLength
194         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
195
196     for gid in farList:
197         gene = gidToGeneDict[gid]
198         geneLength = 0
199         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
200             geneLength += cLength / 1000.
201
202         if geneLength < 0.1:
203             continue
204
205         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
206             cratio = cCount / (cLength / 1000.)
207             cratio = cratio / totalCount
208             acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
209
210         rpm = countDict[gid] / totalCount
211         rpkm = rpm / geneLength
212         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
213
214     outfile.close()
215     try:
216         acceptedfile.close()
217     except:
218         pass
219
220     if doCache:
221         uncacheGeneDB(genome)
222
223
224 if __name__ == "__main__":
225     main(sys.argv)