# psyco is an optional JIT accelerator for Python 2.x; run without it if unavailable
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import sys
import optparse
import ReadDataset
from cistematic.genomes import Genome
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption

print "normalizeExpandedExonic: version 5.7"


def main(argv=None):
    if not argv:
        argv = sys.argv

    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]" % argv[0]

    parser = makeParser(usage)
    (options, args) = parser.parse_args(argv[1:])

    # five positional arguments are required; optparse has already removed the options from args
    if len(args) < 5:
        print usage
        print "\twhere splicecountfile can be set to 'none' to not count splices\n"
        sys.exit(1)
30
31     genome = args[0]
32     hitfile = args[1]
33     uniquecountfile = args[2]
34     splicecountfile = args[3]
35     outfile = args[4]
36
    candidateLines = []
    acceptedfilename = ""
    # candidatefile and acceptfile are optional, but only make sense as a pair
    if len(args) > 6:
        candidatefile = open(args[5])
        candidateLines = candidatefile.readlines()
        candidatefile.close()
        acceptedfilename = args[6]

    normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
                            candidateLines, acceptedfilename, options.fieldID,
                            options.maxLength, options.doCache, options.extendGenome,
                            options.replaceModels)


def makeParser(usage=""):
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--gidField", type="int", dest="fieldID")
    parser.add_option("--maxLength", type="float", dest="maxLength")
    parser.add_option("--cache", action="store_true", dest="doCache")
    parser.add_option("--models", dest="extendGenome")
    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")

    configParser = getConfigParser()
    section = "normalizeExpandedExonic"
    fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
    maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
    doCache = getConfigBoolOption(configParser, section, "doCache", False)
    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)

    parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
                        replaceModels=replaceModels)

    return parser


def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
                            outfilename, candidateLines=[], acceptedfilename="",
                            fieldID=0, maxLength=1000000000., doCache=False,
                            extendGenome="", replaceModels=False):
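    """Normalize per-gene read counts from an RDS dataset to RPKM.

    Unique read counts (and, unless splicecountfilename is 'none', spliced read
    counts) are summed per gene ID, divided by the total counted reads in
    millions and by the non-redundant gene-model length in kb, and written to
    outfilename. Optional candidate regions whose read density is close enough
    to the gene's RPKM are folded into the totals and logged to acceptedfilename.
    """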

    uniquecountfile = open(uniquecountfilename)

    if acceptedfilename:
        acceptedfile = open(acceptedfilename, "w")

    dosplicecount = False
    if splicecountfilename != "none":
        dosplicecount = True
        splicecountfile = open(splicecountfilename)

    if extendGenome:
        if replaceModels:
            print "will replace gene models with %s" % extendGenome
        else:
            print "will extend gene models with %s" % extendGenome

    if doCache:
        cacheGeneDB(genome)
        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
        print "%s cached" % genome
    else:
        hg = Genome(genome, inRAM=True)

    if extendGenome != "":
        hg.extendFeatures(extendGenome, replace=replaceModels)

    RDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache, reportCount=False)
    uniqcount = RDS.getUniqsCount()
    print "%d unique reads" % uniqcount

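    # per-gene read totals accumulate in countDict; gidList holds gene IDs seen
    # in the unique count file, farList holds gene IDs seen only as candidates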
    splicecount = 0
    countDict = {}
    gidList = []
    farList = []
    candidateDict = {}

    gidToGeneDict = {}

    featuresDict = hg.getallGeneFeatures()
    print "got featuresDict"

    outfile = open(outfilename, "w")

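    # each unique count line carries a gene ID in column fieldID, a gene symbol
    # in column 1 and the read count in the last column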
    for line in uniquecountfile:
        fields = line.strip().split()
        gid = fields[fieldID]
        gene = fields[1]
        countDict[gid] = float(fields[-1])
        gidList.append(gid)
        gidToGeneDict[gid] = gene

    uniquecountfile.close()

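    # spliced read counts, when present, are added to the same per-gene totals
    # and tracked separately so they contribute to the per-million denominator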
    if dosplicecount:
        for line in splicecountfile:
            fields = line.strip().split()
            gid = fields[fieldID]
            try:
                countDict[gid] += float(fields[-1])
            except (KeyError, ValueError):
                # gid missing from the unique count file or malformed count
                print fields
                continue

            splicecount += float(fields[-1])

        splicecountfile.close()

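    # candidate lines describe putative expressed regions: gene symbol, gene ID,
    # chromosome, start and stop in columns 3-5, and a read count in column 6;
    # gene IDs not present in the unique count file become novel ("far") loci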
    for line in candidateLines:
        if "#" in line:
            continue

        fields = line.strip().split()
        gid = fields[1]
        gene = fields[0]
        if gid not in gidList:
            if gid not in farList:
                farList.append(gid)
                gidToGeneDict[gid] = gene

            if gid not in countDict:
                countDict[gid] = 0

            countDict[gid] += float(fields[6])

        if gid not in candidateDict:
            candidateDict[gid] = []

        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))

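    # scale factors: totalCount is all counted reads in millions (the RPM
    # denominator); uniqScale is unique reads in millions, used to rescale
    # candidate-region densities before comparing them to a gene's RPKM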
    totalCount = (uniqcount + splicecount) / 1000000.
    uniqScale = uniqcount / 1000000.
    for gid in gidList:
        gene = gidToGeneDict[gid]
        featureList = []
        try:
            featureList = featuresDict[gid]
        except KeyError:
            try:
                featureList = featuresDict[gene]
            except KeyError:
                print gene, gid

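        # collapse duplicate (start, stop) pairs so each distinct feature
        # contributes to the gene-model length (in kb) only once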
        newfeatureList = []
        geneLength = 0.
        for (ftype, chrom, start, stop, sense) in featureList:
            if (start, stop) not in newfeatureList:
                newfeatureList.append((start, stop))
                geneLength += (abs(start - stop) + 1.) / 1000.

        if geneLength < 0.1:
            geneLength = 0.1
        elif geneLength > maxLength:
            geneLength = maxLength

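        # RPKM = reads per million counted reads per kb of gene model; candidate
        # regions are accepted only if their scaled density is at least a tenth
        # of the gene's current RPKM, then folded into the count and length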
        rpm = countDict[gid] / totalCount
        rpkm = rpm / geneLength
        if gid in candidateDict:
            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
                cratio = cCount / (cLength / 1000.)
                cratio = (uniqScale * cratio) / totalCount
                if 10. * cratio < rpkm:
                    continue

                countDict[gid] += cCount
                geneLength += cLength / 1000.
                if acceptedfilename:
                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))

        rpm = countDict[gid] / totalCount
        rpkm = rpm / geneLength
        outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))

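    # gene IDs seen only in the candidate file ("far" loci) have no curated gene
    # model, so their length and counts come entirely from candidate regions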
    for gid in farList:
        gene = gidToGeneDict[gid]
        geneLength = 0.
        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
            geneLength += cLength / 1000.

        if geneLength < 0.1:
            continue

        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
            cratio = cCount / (cLength / 1000.)
            cratio = cratio / totalCount
            if acceptedfilename:
                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))

        rpm = countDict[gid] / totalCount
        rpkm = rpm / geneLength
        outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gene, gene, geneLength, rpkm))

    outfile.close()
    if acceptedfilename:
        acceptedfile.close()

    if doCache:
        uncacheGeneDB(genome)


if __name__ == "__main__":
    main(sys.argv)
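
# Example invocation (hypothetical file names), following the usage string above:
#   python normalizeExpandedExonic.py mouse sample.rds sample.uniqs.count \
#       sample.splices.count sample.final.rpkm --cache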