first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / getallgenes.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print 'psyco not running'
6
7 import sys
8 import optparse
9 from cistematic.core import genesIntersecting, featuresIntersecting
10 from commoncode import getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption, getGeneInfoDict, getGeneAnnotDict, getExtendedGeneAnnotDict
11
12 print "getallgenes: version 5.6"
13
14
15 def main(argv=None):
16     if not argv:
17         argv = sys.argv
18
19     usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
20
21     parser = getParser(usage)
22     (options, args) = parser.parse_args(argv[1:])
23
24     if len(args) < 3:
25         print usage
26         sys.exit(2)
27
28     genome = args[0]
29     infilename = args[1]
30     outfilename = args[2]
31
32     getallgenes(genome, infilename, outfilename, options.maxRadius,
33                 options.nomatchfilename, options.step, options.trackFar,
34                 options.trackStrand, options.compact, options.colID,
35                 options.doCache, options.extendGenome, options.replaceModels)
36
37
38 def getParser(usage):
39     parser = optparse.OptionParser(usage=usage)
40     parser.add_option("--radius", type="int", dest="maxRadius")
41     parser.add_option("--nomatch", dest="nomatchfilename")
42     parser.add_option("--trackfar", action="store_true", dest="trackFar")
43     parser.add_option("--stranded", action="store_true", dest="trackStrand")
44     parser.add_option("--cache", action="store_true", dest="cachePages")
45     parser.add_option("--compact", action="store_true", dest="compact")
46     parser.add_option("--step", type="int", dest="step")
47     parser.add_option("--startField", type="int", dest="colID")
48     parser.add_option("--models", dest="extendGenome")
49     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
50
51     configParser = getConfigParser()
52     section = "getallgenes"
53     maxRadius = getConfigIntOption(configParser, section, "maxRadius", 20002)
54     nomatchfilename = getConfigOption(configParser, section, "nomatchfilename", "")
55     step = getConfigOption(configParser, section, "step", None)
56     trackFar = getConfigBoolOption(configParser, section, "trackFar", False)
57     trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
58     compact = getConfigBoolOption(configParser, section, "compact", False)
59     colID = getConfigIntOption(configParser, section, "colID", 1)
60     doCache = getConfigBoolOption(configParser, section, "doCache", False)
61     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
62     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
63
64     parser.set_defaults(maxRadius=maxRadius, nomatchfilename=nomatchfilename, step=step, trackFar=trackFar,
65                         trackStrand=trackStrand, compact=compact, colID=colID, doCache=doCache,
66                         extendGenome=extendGenome, replaceModels=replaceModels)
67
68     return parser
69
70
71 def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
72                 step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
73                 doCache=False, extendGenome="", replaceModels=False):
74
75     if not step:
76         step = maxRadius - 2
77
78     if extendGenome and replaceModels:
79         replaceModels = True
80     else:
81         replaceModels = False
82
83     infile = open(infilename)
84     outfile = open(outfilename,"w")
85
86     geneinfoDict = getGeneInfoDict(genome, cache=doCache)
87
88     posList = []
89     altPosDict = {}
90     altPosRevDict = {}
91     posLine = {}
92     posStrand = {}
93     altPosList = []
94
95     for line in infile:
96         if line[0] == "#":
97             continue
98
99         fields = line.split("\t")
100         if compact:
101             (chrom, pos) = fields[colID].split(":")
102             chrom = chrom[3:]
103             (start, stop) = pos.split("-")
104             pos = (chrom, int(start))
105             altPos = (chrom, int(stop))
106         else:
107             try:
108                 chrom = fields[colID][3:]
109             except:
110                 print line
111                 continue
112
113             pos = (chrom, int(fields[colID + 1]))
114             altPos = (chrom, int(fields[colID + 2]))
115
116         altPosDict[pos] = altPos
117         altPosRevDict[altPos] = pos
118         posList.append(pos)
119         posList.append(altPos)
120         altPosList.append(altPos)
121         posLine[pos] = line
122         if trackStrand:
123             if "RNAFARP" in line:
124                 posStrand[pos] = "+"
125                 posStrand[altPos] = "+"
126             else:
127                 posStrand[pos] = "-"
128                 posStrand[altPos] = "-"
129
130     geneList = []
131     geneDict = {}
132     if maxRadius < step:
133         step = maxRadius - 2
134
135     if extendGenome != "":
136         geneannotDict = getExtendedGeneAnnotDict(genome, extendGenome, replace=replaceModels, inRAM=True)
137     else:
138         geneannotDict = getGeneAnnotDict(genome, inRAM=True)
139
140     for radius in range(1, maxRadius, step):
141         print "radius %d" % radius
142         print len(posList)
143         if radius == 1:
144             posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels)
145         else:
146             posDict = featuresIntersecting(genome, posList, radius, "CDS", extendGen=extendGenome, replaceMod=replaceModels) 
147             posDict2 = featuresIntersecting(genome, posList, radius, "UTR", extendGen=extendGenome, replaceMod=replaceModels)
148             for apos in posDict2:
149                 try: 
150                     posDict[apos] += posDict2[apos]
151                     posDict[apos].sort()
152                 except:
153                     posDict[apos] = posDict2[apos]
154
155         for pos in posDict:
156             geneID  = ""
157             if len(posDict[pos]) == 1:
158                 if trackStrand:
159                     if posStrand[pos] == posDict[pos][0][-1]:
160                         geneID = posDict[pos][0][0]
161                 else:
162                     geneID = posDict[pos][0][0]
163             elif len(posDict[pos]) > 1 and not trackStrand:
164                 (chrom, loc) = pos
165                 bestres = posDict[pos][0]
166                 dist1 = abs(bestres[3] - loc)
167                 dist2 = abs(bestres[4] - loc)
168                 if dist1 < dist2:
169                     bestdist = dist1
170                 else:
171                     bestdist = dist2
172
173                 for testres in posDict[pos]:
174                     testdist1 = abs(testres[3] - loc)
175                     testdist2 = abs(testres[4] - loc)
176                     if testdist1 < testdist2:
177                         testdist = testdist1
178                     else:
179                         testdist = testdist2
180
181                     if testdist < bestdist:
182                         bestdist = testdist
183                         bestres = testres
184
185                 geneID = bestres[0]
186             elif len(posDict[pos]) > 1:
187                 (chrom, loc) = pos
188                 bestres = posDict[pos][0]
189                 dist1 = abs(bestres[3] - loc)
190                 dist2 = abs(bestres[4] - loc)
191                 bestStrand = posDict[pos][-1]
192                 if dist1 < dist2:
193                     bestdist = dist1
194                 else:
195                     bestdist = dist2
196
197                 for testres in posDict[pos]:
198                     testdist1 = abs(testres[3] - loc)
199                     testdist2 = abs(testres[4] - loc)
200                     testStrand = testres[-1]
201                     if testdist1 < testdist2:
202                         testdist = testdist1
203                     else:
204                         testdist = testdist2
205
206                     if bestStrand != posStrand[pos] and testStrand == posStrand[pos]:
207                         bestdist = testdist
208                         bestres = testres
209                         bestStrand = testStrand
210                     elif testdist < bestdist:
211                         bestdist = testdist
212                         bestres = testres
213
214                 if bestStrand == posStrand[pos]:
215                     geneID = bestres[0]
216
217             if geneID != "":
218                 try:
219                     if genome == "dmelanogaster":
220                         symbol = geneinfoDict["Dmel_" + geneID][0][0]
221                     else:
222                         symbol = geneinfoDict[geneID][0][0]
223                 except:
224                     try:
225                         symbol = geneannotDict[(genome, geneID)][0]
226                     except:
227                         symbol = "LOC" + geneID
228             else:
229                 continue
230
231             if pos in altPosList and pos in posList:
232                 posList.remove(pos)
233                 if pos not in altPosRevDict:
234                     continue
235
236                 if altPosRevDict[pos] in posList:
237                     posList.remove(altPosRevDict[pos])
238
239                 pos = altPosRevDict[pos]
240             elif pos in posList:
241                 posList.remove(pos)
242                 if pos not in altPosDict:
243                     print pos
244                     continue
245
246                 if altPosDict[pos] in posList:
247                     posList.remove(altPosDict[pos])
248             else:
249                 continue
250
251             if (symbol, geneID) not in geneList:
252                 geneList.append((symbol, geneID))
253                 geneDict[(symbol, geneID)] = []
254
255             if pos not in geneDict[(symbol, geneID)]:
256                 geneDict[(symbol, geneID)].append(pos)
257
258     for (symbol, geneID) in geneList:
259         geneDict[(symbol, geneID)].sort()
260         seenLine = []
261         for pos in geneDict[(symbol, geneID)]:
262             if pos in altPosRevDict:
263                 pos = altPosRevDict[pos]
264
265             if posLine[pos] in seenLine:
266                 continue
267
268             if "\t" in symbol:
269                 symbol = symbol.replace("\t","|")
270
271             if " " in symbol:
272                 symbol = symbol.replace(" ","_")
273
274             line = "%s %s %s" % (symbol, geneID, posLine[pos])
275             seenLine.append(posLine[pos])
276             outfile.write(line)
277
278     matchIndex = 0
279     if nomatchfilename != "":
280         nomatchfile = open(nomatchfilename, "w")
281
282     prevStart = 0
283     prevChrom = ""
284     farIndex = 0
285     start = 0
286     for pos in posList:
287         if pos not in altPosList:
288             if nomatchfilename != "":
289                 nomatchfile.write(posLine[pos])
290
291             matchIndex += 1
292             # need to add strand tracking here.....
293             if trackFar:
294                 (chrom, start) = pos
295                 if chrom != prevChrom:
296                     farIndex += 1
297                     prevChrom = chrom
298                 elif abs(int(start) - prevStart) > maxRadius:
299                     farIndex += 1
300
301                 line = "FAR%d %d %s" % (farIndex, -1 * farIndex, posLine[pos])
302                 outfile.write(line)
303             prevStart = int(start)
304
305     if nomatchfilename != "":
306         nomatchfile.close()
307
308     print "%d sites without a gene within radius of %d" % (matchIndex, radius)
309
310
311 if __name__ == "__main__":
312     main(sys.argv)