5 print 'psyco not running'
9 from cistematic.core import genesIntersecting, featuresIntersecting
10 from commoncode import getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption, getGeneInfoDict, getGeneAnnotDict, getExtendedGeneAnnotDict
12 print "getallgenes: version 5.6"
19 usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
21 parser = getParser(usage)
22 (options, args) = parser.parse_args(argv[1:])
32 getallgenes(genome, infilename, outfilename, options.maxRadius,
33 options.nomatchfilename, options.step, options.trackFar,
34 options.trackStrand, options.compact, options.colID,
35 options.doCache, options.extendGenome, options.replaceModels)
39 parser = optparse.OptionParser(usage=usage)
40 parser.add_option("--radius", type="int", dest="maxRadius")
41 parser.add_option("--nomatch", dest="nomatchfilename")
42 parser.add_option("--trackfar", action="store_true", dest="trackFar")
43 parser.add_option("--stranded", action="store_true", dest="trackStrand")
44 parser.add_option("--cache", action="store_true", dest="cachePages")
45 parser.add_option("--compact", action="store_true", dest="compact")
46 parser.add_option("--step", type="int", dest="step")
47 parser.add_option("--startField", type="int", dest="colID")
48 parser.add_option("--models", dest="extendGenome")
49 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
51 configParser = getConfigParser()
52 section = "getallgenes"
53 maxRadius = getConfigIntOption(configParser, section, "maxRadius", 20002)
54 nomatchfilename = getConfigOption(configParser, section, "nomatchfilename", "")
55 step = getConfigOption(configParser, section, "step", None)
56 trackFar = getConfigBoolOption(configParser, section, "trackFar", False)
57 trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
58 compact = getConfigBoolOption(configParser, section, "compact", False)
59 colID = getConfigIntOption(configParser, section, "colID", 1)
60 doCache = getConfigBoolOption(configParser, section, "doCache", False)
61 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
62 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
64 parser.set_defaults(maxRadius=maxRadius, nomatchfilename=nomatchfilename, step=step, trackFar=trackFar,
65 trackStrand=trackStrand, compact=compact, colID=colID, doCache=doCache,
66 extendGenome=extendGenome, replaceModels=replaceModels)
71 def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
72 step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
73 doCache=False, extendGenome="", replaceModels=False):
78 if extendGenome and replaceModels:
83 infile = open(infilename)
84 outfile = open(outfilename,"w")
86 geneinfoDict = getGeneInfoDict(genome, cache=doCache)
99 fields = line.split("\t")
101 (chrom, pos) = fields[colID].split(":")
103 (start, stop) = pos.split("-")
104 pos = (chrom, int(start))
105 altPos = (chrom, int(stop))
108 chrom = fields[colID][3:]
113 pos = (chrom, int(fields[colID + 1]))
114 altPos = (chrom, int(fields[colID + 2]))
116 altPosDict[pos] = altPos
117 altPosRevDict[altPos] = pos
119 posList.append(altPos)
120 altPosList.append(altPos)
123 if "RNAFARP" in line:
125 posStrand[altPos] = "+"
128 posStrand[altPos] = "-"
135 if extendGenome != "":
136 geneannotDict = getExtendedGeneAnnotDict(genome, extendGenome, replace=replaceModels, inRAM=True)
138 geneannotDict = getGeneAnnotDict(genome, inRAM=True)
140 for radius in range(1, maxRadius, step):
141 print "radius %d" % radius
144 posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels)
146 posDict = featuresIntersecting(genome, posList, radius, "CDS", extendGen=extendGenome, replaceMod=replaceModels)
147 posDict2 = featuresIntersecting(genome, posList, radius, "UTR", extendGen=extendGenome, replaceMod=replaceModels)
148 for apos in posDict2:
150 posDict[apos] += posDict2[apos]
153 posDict[apos] = posDict2[apos]
157 if len(posDict[pos]) == 1:
159 if posStrand[pos] == posDict[pos][0][-1]:
160 geneID = posDict[pos][0][0]
162 geneID = posDict[pos][0][0]
163 elif len(posDict[pos]) > 1 and not trackStrand:
165 bestres = posDict[pos][0]
166 dist1 = abs(bestres[3] - loc)
167 dist2 = abs(bestres[4] - loc)
173 for testres in posDict[pos]:
174 testdist1 = abs(testres[3] - loc)
175 testdist2 = abs(testres[4] - loc)
176 if testdist1 < testdist2:
181 if testdist < bestdist:
186 elif len(posDict[pos]) > 1:
188 bestres = posDict[pos][0]
189 dist1 = abs(bestres[3] - loc)
190 dist2 = abs(bestres[4] - loc)
191 bestStrand = posDict[pos][-1]
197 for testres in posDict[pos]:
198 testdist1 = abs(testres[3] - loc)
199 testdist2 = abs(testres[4] - loc)
200 testStrand = testres[-1]
201 if testdist1 < testdist2:
206 if bestStrand != posStrand[pos] and testStrand == posStrand[pos]:
209 bestStrand = testStrand
210 elif testdist < bestdist:
214 if bestStrand == posStrand[pos]:
219 if genome == "dmelanogaster":
220 symbol = geneinfoDict["Dmel_" + geneID][0][0]
222 symbol = geneinfoDict[geneID][0][0]
225 symbol = geneannotDict[(genome, geneID)][0]
227 symbol = "LOC" + geneID
231 if pos in altPosList and pos in posList:
233 if pos not in altPosRevDict:
236 if altPosRevDict[pos] in posList:
237 posList.remove(altPosRevDict[pos])
239 pos = altPosRevDict[pos]
242 if pos not in altPosDict:
246 if altPosDict[pos] in posList:
247 posList.remove(altPosDict[pos])
251 if (symbol, geneID) not in geneList:
252 geneList.append((symbol, geneID))
253 geneDict[(symbol, geneID)] = []
255 if pos not in geneDict[(symbol, geneID)]:
256 geneDict[(symbol, geneID)].append(pos)
258 for (symbol, geneID) in geneList:
259 geneDict[(symbol, geneID)].sort()
261 for pos in geneDict[(symbol, geneID)]:
262 if pos in altPosRevDict:
263 pos = altPosRevDict[pos]
265 if posLine[pos] in seenLine:
269 symbol = symbol.replace("\t","|")
272 symbol = symbol.replace(" ","_")
274 line = "%s %s %s" % (symbol, geneID, posLine[pos])
275 seenLine.append(posLine[pos])
279 if nomatchfilename != "":
280 nomatchfile = open(nomatchfilename, "w")
287 if pos not in altPosList:
288 if nomatchfilename != "":
289 nomatchfile.write(posLine[pos])
292 # need to add strand tracking here.....
295 if chrom != prevChrom:
298 elif abs(int(start) - prevStart) > maxRadius:
301 line = "FAR%d %d %s" % (farIndex, -1 * farIndex, posLine[pos])
303 prevStart = int(start)
305 if nomatchfilename != "":
308 print "%d sites without a gene within radius of %d" % (matchIndex, radius)
311 if __name__ == "__main__":