first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / normalizeExpandedExonic.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import optparse
9 import pysam
10 from cistematic.genomes import Genome
11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
12 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
13
14 print "normalizeExpandedExonic: version 5.7"
15
16
17 def main(argv=None):
18     if not argv:
19         argv = sys.argv
20
21     usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
22
23     parser = makeParser(usage)
24     (options, args) = parser.parse_args(argv[1:])
25
26     if len(sys.argv) < 6:
27         print usage
28         print "\twhere splicecountfile can be set to 'none' to not count splices\n"
29         sys.exit(1)
30
31     genome = args[0]
32     hitfile = args[1]
33     uniquecountfile = args[2]
34     splicecountfile = args[3]
35     outfile = args[4]
36
37     candidatefilename = ""
38     acceptedfilename = ""
39     try:
40         candidatefilename = args[5]
41         acceptedfilename = args[6]
42     except IndexError:
43         pass
44
45     bamfile = pysam.Samfile(hitfile, "rb")    
46     uniqcount = getHeaderComment(bamfile.header, "Unique")
47     
48     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
49                             candidatefilename, acceptedfilename, options.fieldID,
50                             options.maxLength, options.doCache, options.extendGenome,
51                             options.replaceModels)
52
53
54 def makeParser(usage=""):
55     parser = optparse.OptionParser(usage=usage)
56     parser.add_option("--gidField", type="int", dest="fieldID")
57     parser.add_option("--maxLength", type="float", dest="maxLength")
58     parser.add_option("--cache", action="store_true", dest="doCache")
59     parser.add_option("--models", dest="extendGenome")
60     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
61
62     configParser = getConfigParser()
63     section = "normalizeExpandedExonic"
64     fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
65     maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
66     doCache = getConfigBoolOption(configParser, section, "doCache", False)
67     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
68     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
69
70     parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
71                         replaceModels=replaceModels)
72
73     return parser
74
75
76 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
77                             outfilename, candidatefilename="", acceptedfilename="",
78                             fieldID=0, maxLength=1000000000., doCache=False,
79                             extendGenome="", replaceModels=False):
80   
81     print "%d unique reads" % uniqcount
82     gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="")
83     totalCount = (uniqcount + splicecount) / 1000000.
84     uniqScale = uniqcount / 1000000.
85     if acceptedfilename:
86         acceptedfile = open(acceptedfilename, "w")
87
88     outfile = open(outfilename, "w")
89     featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
90     print "got featuresDict"
91     for gid in gidList:
92         gene = gidToGeneDict[gid]["name"]
93         featureList = []
94         try:
95             featureList = featuresDict[gid]
96         except:
97             try:
98                 featureList = featuresDict[gene]
99             except:
100                 print gene, gid
101
102         newfeatureList = []
103         geneLength = 0.
104         for (ftype, chrom, start, stop, sense) in featureList:
105             if (start, stop) not in newfeatureList:
106                 newfeatureList.append((start, stop))
107                 geneLength += (abs(start - stop) + 1.) / 1000.
108
109         if geneLength < 0.1:
110             geneLength = 0.1
111         elif geneLength > maxLength:
112             geneLength = maxLength
113
114         rpm = gidToGeneDict[gid]["count"] / totalCount
115         rpkm = rpm / geneLength
116         if gid in candidateDict:
117             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
118                 kilobaseLength = cLength / 1000.
119                 cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
120                 if 10. * cratio < rpkm:
121                     continue
122
123                 gidToGeneDict[gid]["count"] += cCount
124                 geneLength += kilobaseLength
125                 try:
126                     acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
127                 except TypeError:
128                     pass
129
130         rpm = gidToGeneDict[gid]["count"] / totalCount
131         rpkm = rpm / geneLength
132         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
133
134     for (gid, geneLength) in farList:
135         if geneLength < 0.1:
136             continue
137
138         gene = gidToGeneDict[gid]["name"]
139         if acceptedfilename:
140             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
141                 kilobaseLength = cLength / 1000.
142                 cratio = (cCount / kilobaseLength) / totalCount
143                 try:
144                     acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
145                 except TypeError:
146                     pass
147
148         rpm = gidToGeneDict[gid]["count"] / totalCount
149         rpkm = rpm / geneLength
150         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
151
152     outfile.close()
153     try:
154         acceptedfile.close()
155     except:
156         pass
157
158
159 def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
160
161     gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
162     uniqueGidList = gidToGeneDict.keys()
163     if candidatefilename:
164         candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
165     else:
166         candidateDict = {}
167         geneDictUpdates = {}
168
169     farList = []
170     for gid in geneDictUpdates:
171         gidToGeneDict[gid] = geneDictUpdates[gid]
172         geneLength = 0.
173         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
174             geneLength += cLength / 1000.
175
176         farList.append((gid, geneLength))
177
178     return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
179
180
181 def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
182
183     gidToGeneDict = {}
184     uniquecountfile = open(uniquecountfilename)
185     for line in uniquecountfile:
186         fields = line.strip().split()
187         gid = fields[fieldID]
188         gene = fields[1]
189         gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
190
191     uniquecountfile.close()
192     splicecount = 0.
193     if splicecountfilename != "none":
194         splicecountfile = open(splicecountfilename)
195         for line in splicecountfile:
196             fields = line.strip().split()
197             gid = fields[fieldID]
198             try:
199                 gidToGeneDict[gid]["count"] += float(fields[-1])
200             except:
201                 print fields
202                 continue
203
204             splicecount += float(fields[-1])
205
206         splicecountfile.close()
207
208     return gidToGeneDict, splicecount
209
210
211 def processCandidatesFile(candidatefilename, fieldID, gidList):
212     """ Reads entries from the candiates file
213         gid entries not in gidList are returned for inclusion
214         creates and returns a dictionary keyed off gid with a single list of
215         tuples (count, length, chrom, start, stop) for each gid
216     """
217     geneDictUpdates = {}
218     candidateDict = {}
219     candidatefile = open(candidatefilename)
220     for line in candidatefile.readlines():
221         if "#" in line:
222             continue
223
224         fields = line.strip().split()
225         gid = fields[1]
226         gene = fields[0]
227         if gid not in gidList:
228             try:
229                 geneDictUpdates[gid]["count"] += float(fields[6])
230             except KeyError:
231                 geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
232
233         try:
234             candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
235         except KeyError:
236             candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
237
238     candidatefile.close()
239
240     return candidateDict, geneDictUpdates
241
242
243 def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
244     if extendGenome:
245         if replaceModels:
246             print "will replace gene models with %s" % extendGenome
247         else:
248             print "will extend gene models with %s" % extendGenome
249
250     if doCache:
251         cacheGeneDB(genome)
252         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
253         print "%s cached" % genome
254     else:
255         hg = Genome(genome, inRAM=True)
256
257     if extendGenome != "":
258         hg.extendFeatures(extendGenome, replace=replaceModels)
259
260     featuresDict = hg.getallGeneFeatures()
261
262     if doCache:
263         uncacheGeneDB(genome)
264
265     return featuresDict
266
267
268 if __name__ == "__main__":
269     main(sys.argv)