development release: conversion of ReadDataset to use BAM files
[erange.git] / normalizeExpandedExonic.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import optparse
9 import ReadDataset
10 from cistematic.genomes import Genome
11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
12 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
13
14 print "normalizeExpandedExonic: version 5.7"
15
16
17 def main(argv=None):
18     if not argv:
19         argv = sys.argv
20
21     usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
22
23     parser = makeParser(usage)
24     (options, args) = parser.parse_args(argv[1:])
25
26     if len(sys.argv) < 6:
27         print usage
28         print "\twhere splicecountfile can be set to 'none' to not count splices\n"
29         sys.exit(1)
30
31     genome = args[0]
32     hitfile = args[1]
33     uniquecountfile = args[2]
34     splicecountfile = args[3]
35     outfile = args[4]
36
37     candidateLines = []
38     acceptedfilename = ""
39     if len(args) > 5:
40         try:
41             candidatefile = open(args[5])
42             candidateLines = candidatefile.readlines()
43             candidatefile.close()
44             acceptedfilename = args[6]
45         except IndexError:
46             pass
47
48     RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)    
49     uniqcount = RDS.getUniqsCount()
50     
51     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
52                             candidateLines, acceptedfilename, options.fieldID,
53                             options.maxLength, options.doCache, options.extendGenome,
54                             options.replaceModels)
55
56
57 def makeParser(usage=""):
58     parser = optparse.OptionParser(usage=usage)
59     parser.add_option("--gidField", type="int", dest="fieldID")
60     parser.add_option("--maxLength", type="float", dest="maxLength")
61     parser.add_option("--cache", action="store_true", dest="doCache")
62     parser.add_option("--models", dest="extendGenome")
63     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
64
65     configParser = getConfigParser()
66     section = "normalizeExpandedExonic"
67     fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
68     maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
69     doCache = getConfigBoolOption(configParser, section, "doCache", False)
70     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
71     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
72
73     parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
74                         replaceModels=replaceModels)
75
76     return parser
77
78
79 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
80                             outfilename, candidateLines=[], acceptedfilename="",
81                             fieldID=0, maxLength=1000000000., doCache=False,
82                             extendGenome="", replaceModels=False):
83
84     uniquecountfile = open(uniquecountfilename)
85
86     if acceptedfilename:
87         acceptedfile = open(acceptedfilename, "w")
88
89     dosplicecount = False
90     if splicecountfilename != "none":
91         dosplicecount = True
92         splicecountfile = open(splicecountfilename)
93
94     if extendGenome:
95         if replaceModels:
96             print "will replace gene models with %s" % extendGenome
97         else:
98             print "will extend gene models with %s" % extendGenome
99
100     if doCache:
101         cacheGeneDB(genome)
102         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
103         print "%s cached" % genome
104     else:
105         hg = Genome(genome, inRAM=True)
106
107     if extendGenome != "":
108         hg.extendFeatures(extendGenome, replace=replaceModels)
109
110     
111     print "%d unique reads" % uniqcount
112
113     splicecount = 0
114     countDict = {}
115     gidList = []
116     farList = []
117     candidateDict = {}
118
119     gidToGeneDict = {}
120
121     featuresDict = hg.getallGeneFeatures()
122     print "got featuresDict"
123
124     outfile = open(outfilename, "w")
125
126     for line in uniquecountfile:
127         fields = line.strip().split()
128         gid = fields[fieldID]
129         gene = fields[1]
130         countDict[gid] = float(fields[-1])
131         gidList.append(gid)
132         gidToGeneDict[gid] = gene
133
134     uniquecountfile.close()
135
136     if dosplicecount:
137         for line in splicecountfile:
138             fields = line.strip().split()
139             gid = fields[fieldID]
140             try:
141                 countDict[gid] += float(fields[-1])
142             except:
143                 print fields
144                 continue
145
146             splicecount += float(fields[-1])
147
148         splicecountfile.close()
149
150     for line in candidateLines:
151         if "#" in line:
152             continue
153
154         fields = line.strip().split()
155         gid = fields[1]
156         gene = fields[0]
157         if gid not in gidList:
158             if gid not in farList:
159                 farList.append(gid)
160                 gidToGeneDict[gid] = gene
161
162             if gid not in countDict:
163                 countDict[gid] = 0
164
165             countDict[gid] += float(fields[6])
166
167         if gid not in candidateDict:
168             candidateDict[gid] = []
169
170         candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
171
172     totalCount = (uniqcount + splicecount) / 1000000.
173     uniqScale = uniqcount / 1000000.
174     for gid in gidList:
175         gene = gidToGeneDict[gid]
176         featureList = []
177         try:
178             featureList = featuresDict[gid]
179         except:
180             try:
181                 featureList = featuresDict[gene]
182             except:
183                 print gene, gid
184
185         newfeatureList = []
186         geneLength = 0.
187         for (ftype, chrom, start, stop, sense) in featureList:
188             if (start, stop) not in newfeatureList:
189                 newfeatureList.append((start, stop))
190                 geneLength += (abs(start - stop) + 1.) / 1000.
191
192         if geneLength < 0.1:
193             geneLength = 0.1
194         elif geneLength > maxLength:
195             geneLength = maxLength
196
197         rpm = countDict[gid] / totalCount
198         rpkm = rpm / geneLength
199         if gid in candidateDict:
200             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
201                 cratio = cCount / (cLength / 1000.)
202                 cratio = (uniqScale * cratio) / totalCount
203                 if 10. * cratio < rpkm:
204                     continue
205
206                 countDict[gid] += cCount
207                 geneLength += cLength / 1000.
208                 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
209
210         rpm = countDict[gid] / totalCount
211         rpkm = rpm / geneLength
212         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
213
214     for gid in farList:
215         gene = gidToGeneDict[gid]
216         geneLength = 0
217         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
218             geneLength += cLength / 1000.
219
220         if geneLength < 0.1:
221             continue
222
223         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
224             cratio = cCount / (cLength / 1000.)
225             cratio = cratio / totalCount
226             acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
227
228         rpm = countDict[gid] / totalCount
229         rpkm = rpm / geneLength
230         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
231
232     outfile.close()
233     try:
234         acceptedfile.close()
235     except:
236         pass
237
238     if doCache:
239         uncacheGeneDB(genome)
240
241
242 if __name__ == "__main__":
243     main(sys.argv)