snapshot of 4.0a development. initial git repo commit
[erange.git] / getallgenes.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print 'psyco not running'
6
7 import sys, optparse
8 from cistematic.core import genesIntersecting, featuresIntersecting, cacheGeneDB, uncacheGeneDB
9 from cistematic.core.geneinfo import geneinfoDB
10 from cistematic.genomes import Genome
11
12 print "%prog: version 5.5"
13
14
15 def main(argv=None):
16     if not argv:
17         argv = sys.argv
18
19     usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
20
21     parser = optparse.OptionParser(usage=usage)
22     parser.add_option("--radius", type="int", dest="maxRadius")
23     parser.add_option("--nomatch", dest="nomatchfilename")
24     parser.add_option("--trackfar", action="store_true", dest="trackFar")
25     parser.add_option("--stranded", action="store_true", dest="trackStrand")
26     parser.add_option("--cache", action="store_true", dest="cachePages")
27     parser.add_option("--compact", action="store_true", dest="compact")
28     parser.add_option("--step", type="int", dest="step")
29     parser.add_option("--startField", type="int", dest="colID")
30     parser.add_option("--models", dest="extendGenome")
31     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
32     parser.set_defaults(maxRadius=20002, nomatchfilename="", step=None, trackFar=False,
33                         trackStrand=False, compact=False, colID=1, doCache=False,
34                         extendGenome="", replaceModels=False)
35     (options, args) = parser.parse_args(argv[1:])
36
37     if len(args) < 3:
38         print usage
39         sys.exit(2)
40
41     genome = args[0]
42     infilename = args[1]
43     outfilename = args[2]
44
45     getallgenes(genome, infilename, outfilename, options.maxRadius,
46                 options.nomatchfilename, options.step, options.trackFar,
47                 options.trackStrand, options.compact, options.colID,
48                 options.doCache, options.extendgenome, options.replaceModels)
49
50
51 def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
52                 step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
53                 doCache=False, extendGenome="", replaceModels=False):
54
55     if doCache:
56         idb = geneinfoDB(cache=True)
57     else:
58         idb = geneinfoDB()
59
60     if not step:
61         step = maxRadius - 2
62
63     if extendGenome and replaceModels:
64         replaceModels = True
65     else:
66         replaceModels = False
67
68     infile = open(infilename)
69     outfile = open(outfilename,"w")
70
71     if genome == "dmelanogaster":
72         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
73     else:
74         geneinfoDict = idb.getallGeneInfo(genome)
75
76     posList = []
77     altPosDict = {}
78     altPosRevDict = {}
79     posLine = {}
80     posStrand = {}
81     altPosList = []
82
83     for line in infile:
84         if line[0] == "#":
85             continue
86
87         fields = line.split("\t")
88         if compact:
89             (chrom, pos) = fields[colID].split(":")
90             chrom = chrom[3:]
91             (start, stop) = pos.split("-")
92             pos = (chrom, int(start))
93             altPos = (chrom, int(stop))
94         else:
95             try:
96                 chrom = fields[colID][3:]
97             except:
98                 print line
99                 continue
100
101             pos = (chrom, int(fields[colID + 1]))
102             altPos = (chrom, int(fields[colID + 2]))
103
104         altPosDict[pos] = altPos
105         altPosRevDict[altPos] = pos
106         posList.append(pos)
107         posList.append(altPos)
108         altPosList.append(altPos)
109         posLine[pos] = line
110         if trackStrand:
111             if "RNAFARP" in line:
112                 posStrand[pos] = "+"
113                 posStrand[altPos] = "+"
114             else:
115                 posStrand[pos] = "-"
116                 posStrand[altPos] = "-"
117
118     geneList = []
119     geneDict = {}
120     if maxRadius < step:
121         step = maxRadius - 2
122
123     hg = Genome(genome, inRAM=True)
124     if extendGenome != "":
125         hg.extendFeatures(extendGenome, replace = replaceModels)
126
127     geneannotDict = hg.allAnnotInfo()
128
129     for radius in range(1, maxRadius, step):
130         print "radius %d" % radius
131         print len(posList)
132         if radius == 1:
133             posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels)
134         else:
135             posDict = featuresIntersecting(genome, posList, radius, "CDS", extendGen=extendGenome, replaceMod=replaceModels) 
136             posDict2 = featuresIntersecting(genome, posList, radius, "UTR", extendGen=extendGenome, replaceMod=replaceModels)
137             for apos in posDict2:
138                 try: 
139                     posDict[apos] += posDict2[apos]
140                     posDict[apos].sort()
141                 except:
142                     posDict[apos] = posDict2[apos]
143
144         for pos in posDict:
145             geneID  = ""
146             if len(posDict[pos]) == 1:
147                 if trackStrand:
148                     if posStrand[pos] == posDict[pos][0][-1]:
149                         geneID = posDict[pos][0][0]
150                 else:
151                     geneID = posDict[pos][0][0]
152             elif len(posDict[pos]) > 1 and not trackStrand:
153                 (chrom, loc) = pos
154                 bestres = posDict[pos][0]
155                 dist1 = abs(bestres[3] - loc)
156                 dist2 = abs(bestres[4] - loc)
157                 if dist1 < dist2:
158                     bestdist = dist1
159                 else:
160                     bestdist = dist2
161
162                 for testres in posDict[pos]:
163                     testdist1 = abs(testres[3] - loc)
164                     testdist2 = abs(testres[4] - loc)
165                     if testdist1 < testdist2:
166                         testdist = testdist1
167                     else:
168                         testdist = testdist2
169
170                     if testdist < bestdist:
171                         bestdist = testdist
172                         bestres = testres
173
174                 geneID = bestres[0]
175             elif len(posDict[pos]) > 1:
176                 (chrom, loc) = pos
177                 bestres = posDict[pos][0]
178                 dist1 = abs(bestres[3] - loc)
179                 dist2 = abs(bestres[4] - loc)
180                 bestStrand = posDict[pos][-1]
181                 if dist1 < dist2:
182                     bestdist = dist1
183                 else:
184                     bestdist = dist2
185
186                 for testres in posDict[pos]:
187                     testdist1 = abs(testres[3] - loc)
188                     testdist2 = abs(testres[4] - loc)
189                     testStrand = testres[-1]
190                     if testdist1 < testdist2:
191                         testdist = testdist1
192                     else:
193                         testdist = testdist2
194
195                     if bestStrand != posStrand[pos] and testStrand == posStrand[pos]:
196                         bestdist = testdist
197                         bestres = testres
198                         bestStrand = testStrand
199                     elif testdist < bestdist:
200                         bestdist = testdist
201                         bestres = testres
202
203                 if bestStrand == posStrand[pos]:
204                     geneID = bestres[0]
205
206             if geneID != "":
207                 try:
208                     if genome == "dmelanogaster":
209                         symbol = geneinfoDict["Dmel_" + geneID][0][0]
210                     else:
211                         symbol = geneinfoDict[geneID][0][0]
212                 except:
213                     try:
214                         symbol = geneannotDict[(genome, geneID)][0]
215                     except:
216                         symbol = "LOC" + geneID
217             else:
218                 continue
219
220             if pos in altPosList and pos in posList:
221                 posList.remove(pos)
222                 if pos not in altPosRevDict:
223                     continue
224
225                 if altPosRevDict[pos] in posList:
226                     posList.remove(altPosRevDict[pos])
227
228                 pos = altPosRevDict[pos]
229             elif pos in posList:
230                 posList.remove(pos)
231                 if pos not in altPosDict:
232                     print pos
233                     continue
234
235                 if altPosDict[pos] in posList:
236                     posList.remove(altPosDict[pos])
237             else:
238                 continue
239
240             if (symbol, geneID) not in geneList:
241                 geneList.append((symbol, geneID))
242                 geneDict[(symbol, geneID)] = []
243
244             if pos not in geneDict[(symbol, geneID)]:
245                 geneDict[(symbol, geneID)].append(pos)
246
247     for (symbol, geneID) in geneList:
248         geneDict[(symbol, geneID)].sort()
249         seenLine = []
250         for pos in geneDict[(symbol, geneID)]:
251             if pos in altPosRevDict:
252                 pos = altPosRevDict[pos]
253
254             if posLine[pos] in seenLine:
255                 continue
256
257             if "\t" in symbol:
258                 symbol = symbol.replace("\t","|")
259
260             if " " in symbol:
261                 symbol = symbol.replace(" ","_")
262
263             line = "%s %s %s" % (symbol, geneID, posLine[pos])
264             seenLine.append(posLine[pos])
265             outfile.write(line)
266
267     matchIndex = 0
268     if nomatchfilename != "":
269         nomatchfile = open(nomatchfilename, "w")
270
271     prevStart = 0
272     prevChrom = ""
273     farIndex = 0
274     start = 0
275     for pos in posList:
276         if pos not in altPosList:
277             if nomatchfilename != "":
278                 nomatchfile.write(posLine[pos])
279
280             matchIndex += 1
281             # need to add strand tracking here.....
282             if trackFar:
283                 (chrom, start) = pos
284                 if chrom != prevChrom:
285                     farIndex += 1
286                     prevChrom = chrom
287                 elif abs(int(start) - prevStart) > maxRadius:
288                     farIndex += 1
289
290                 line = "FAR%d %d %s" % (farIndex, -1 * farIndex, posLine[pos])
291                 outfile.write(line)
292             prevStart = int(start)
293
294     if nomatchfilename != "":
295         nomatchfile.close()
296
297     print "%d sites without a gene within radius of %d" % (matchIndex, radius)
298
299
300 if __name__ == "__main__":
301     main(sys.argv)