5 print 'psyco not running'
8 from cistematic.core import genesIntersecting, featuresIntersecting, cacheGeneDB, uncacheGeneDB
9 from cistematic.core.geneinfo import geneinfoDB
10 from cistematic.genomes import Genome
12 print "%prog: version 5.5"
19 usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
21 parser = optparse.OptionParser(usage=usage)
22 parser.add_option("--radius", type="int", dest="maxRadius")
23 parser.add_option("--nomatch", dest="nomatchfilename")
24 parser.add_option("--trackfar", action="store_true", dest="trackFar")
25 parser.add_option("--stranded", action="store_true", dest="trackStrand")
26 parser.add_option("--cache", action="store_true", dest="cachePages")
27 parser.add_option("--compact", action="store_true", dest="compact")
28 parser.add_option("--step", type="int", dest="step")
29 parser.add_option("--startField", type="int", dest="colID")
30 parser.add_option("--models", dest="extendGenome")
31 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
32 parser.set_defaults(maxRadius=20002, nomatchfilename="", step=None, trackFar=False,
33 trackStrand=False, compact=False, colID=1, doCache=False,
34 extendGenome="", replaceModels=False)
35 (options, args) = parser.parse_args(argv[1:])
45 getallgenes(genome, infilename, outfilename, options.maxRadius,
46 options.nomatchfilename, options.step, options.trackFar,
47 options.trackStrand, options.compact, options.colID,
48 options.doCache, options.extendgenome, options.replaceModels)
51 def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
52 step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
53 doCache=False, extendGenome="", replaceModels=False):
56 idb = geneinfoDB(cache=True)
63 if extendGenome and replaceModels:
68 infile = open(infilename)
69 outfile = open(outfilename,"w")
71 if genome == "dmelanogaster":
72 geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
74 geneinfoDict = idb.getallGeneInfo(genome)
87 fields = line.split("\t")
89 (chrom, pos) = fields[colID].split(":")
91 (start, stop) = pos.split("-")
92 pos = (chrom, int(start))
93 altPos = (chrom, int(stop))
96 chrom = fields[colID][3:]
101 pos = (chrom, int(fields[colID + 1]))
102 altPos = (chrom, int(fields[colID + 2]))
104 altPosDict[pos] = altPos
105 altPosRevDict[altPos] = pos
107 posList.append(altPos)
108 altPosList.append(altPos)
111 if "RNAFARP" in line:
113 posStrand[altPos] = "+"
116 posStrand[altPos] = "-"
123 hg = Genome(genome, inRAM=True)
124 if extendGenome != "":
125 hg.extendFeatures(extendGenome, replace = replaceModels)
127 geneannotDict = hg.allAnnotInfo()
129 for radius in range(1, maxRadius, step):
130 print "radius %d" % radius
133 posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels)
135 posDict = featuresIntersecting(genome, posList, radius, "CDS", extendGen=extendGenome, replaceMod=replaceModels)
136 posDict2 = featuresIntersecting(genome, posList, radius, "UTR", extendGen=extendGenome, replaceMod=replaceModels)
137 for apos in posDict2:
139 posDict[apos] += posDict2[apos]
142 posDict[apos] = posDict2[apos]
146 if len(posDict[pos]) == 1:
148 if posStrand[pos] == posDict[pos][0][-1]:
149 geneID = posDict[pos][0][0]
151 geneID = posDict[pos][0][0]
152 elif len(posDict[pos]) > 1 and not trackStrand:
154 bestres = posDict[pos][0]
155 dist1 = abs(bestres[3] - loc)
156 dist2 = abs(bestres[4] - loc)
162 for testres in posDict[pos]:
163 testdist1 = abs(testres[3] - loc)
164 testdist2 = abs(testres[4] - loc)
165 if testdist1 < testdist2:
170 if testdist < bestdist:
175 elif len(posDict[pos]) > 1:
177 bestres = posDict[pos][0]
178 dist1 = abs(bestres[3] - loc)
179 dist2 = abs(bestres[4] - loc)
180 bestStrand = posDict[pos][-1]
186 for testres in posDict[pos]:
187 testdist1 = abs(testres[3] - loc)
188 testdist2 = abs(testres[4] - loc)
189 testStrand = testres[-1]
190 if testdist1 < testdist2:
195 if bestStrand != posStrand[pos] and testStrand == posStrand[pos]:
198 bestStrand = testStrand
199 elif testdist < bestdist:
203 if bestStrand == posStrand[pos]:
208 if genome == "dmelanogaster":
209 symbol = geneinfoDict["Dmel_" + geneID][0][0]
211 symbol = geneinfoDict[geneID][0][0]
214 symbol = geneannotDict[(genome, geneID)][0]
216 symbol = "LOC" + geneID
220 if pos in altPosList and pos in posList:
222 if pos not in altPosRevDict:
225 if altPosRevDict[pos] in posList:
226 posList.remove(altPosRevDict[pos])
228 pos = altPosRevDict[pos]
231 if pos not in altPosDict:
235 if altPosDict[pos] in posList:
236 posList.remove(altPosDict[pos])
240 if (symbol, geneID) not in geneList:
241 geneList.append((symbol, geneID))
242 geneDict[(symbol, geneID)] = []
244 if pos not in geneDict[(symbol, geneID)]:
245 geneDict[(symbol, geneID)].append(pos)
247 for (symbol, geneID) in geneList:
248 geneDict[(symbol, geneID)].sort()
250 for pos in geneDict[(symbol, geneID)]:
251 if pos in altPosRevDict:
252 pos = altPosRevDict[pos]
254 if posLine[pos] in seenLine:
258 symbol = symbol.replace("\t","|")
261 symbol = symbol.replace(" ","_")
263 line = "%s %s %s" % (symbol, geneID, posLine[pos])
264 seenLine.append(posLine[pos])
268 if nomatchfilename != "":
269 nomatchfile = open(nomatchfilename, "w")
276 if pos not in altPosList:
277 if nomatchfilename != "":
278 nomatchfile.write(posLine[pos])
281 # need to add strand tracking here.....
284 if chrom != prevChrom:
287 elif abs(int(start) - prevStart) > maxRadius:
290 line = "FAR%d %d %s" % (farIndex, -1 * farIndex, posLine[pos])
292 prevStart = int(start)
294 if nomatchfilename != "":
297 print "%d sites without a gene within radius of %d" % (matchIndex, radius)
300 if __name__ == "__main__":