5 # This script look for the gene info and expression level for the snps.
6 # Written by: Wendy Lee
7 # Written on: August 7th, 2008
13 print 'psyco not running'
18 from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
19 from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
21 print "getSNPGeneInfo: version 4.6"
27 usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
29 parser = makeParser(usage)
30 (options, args) = parser.parse_args(argv[1:])
38 rpkmfilename = args[2]
39 outfilename = args [3]
41 writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
44 def makeParser(usage=""):
45 parser = optparse.OptionParser(usage=usage)
46 parser.add_option("--cache", action="store_true", dest="cachePages")
47 parser.add_option("--withoutsense", action="store_false", dest="withSense")
48 parser.add_option("--flank", type="int", dest="flankBP")
50 configParser = getConfigParser()
51 section = "getSNPGeneInfo"
52 doCache = getConfigBoolOption(configParser, section, "doCache", False)
53 withSense = getConfigBoolOption(configParser, section, "withSense", True)
54 flankBP = getConfigIntOption(configParser, section, "flankBP", 0)
56 parser.set_defaults(doCache=doCache, withSense=withSense, flankBP=flankBP)
61 def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
63 outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
64 outfile = open(outfilename, "w")
66 for outputLine in outList:
67 outfile.write("%s\n" % outputLine)
72 def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=True, flankBP=0):
76 if rpkmfilename != "NONE":
77 rpkmfile = open(rpkmfilename, "r")
79 lineFields = line.split()
80 rpkmDict[lineFields[0]] = lineFields[rpkmField]
84 infile = open(infilename)
89 if doNotProcessLine(line):
92 fields = line.split("\t")
94 start = int(fields[3])
95 chromosomePosition = (chrom, start)
96 snpPositionList.append(chromosomePosition)
97 snpDict[chromosomePosition] = line
101 print "cached %s" % genome
103 geneinfoDict = getGeneInfoDict(genome, cache=doCache)
107 matchingGenesDict = genesIntersecting(genome, snpPositionList, flank=flankBP)
109 matchingGenesDict = genesIntersecting(genome, snpPositionList)
111 for pos in matchingGenesDict:
112 geneID = matchingGenesDict[pos][0][0]
114 symbol = geneinfoDict[geneID][0][0]
116 symbol = "LOC%s" % geneID
118 geneDescriptor = (symbol, geneID)
119 if geneDescriptor in geneDict:
120 geneDict[geneDescriptor]["position"].append(pos)
122 geneDict[geneDescriptor] = {"position": [pos],
123 "sense": matchingGenesDict[pos][0][-1]}
126 uncacheGeneDB(genome)
128 return getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense)
131 def doNotProcessLine(line):
132 return line[0] == "#"
135 def getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense):
136 snpGeneOutputList = []
137 snpGeneInfoList = getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense)
139 for snpEntry in snpGeneInfoList:
140 outputItems = [snpEntry["snpDescription"], snpEntry["symbol"], snpEntry["geneID"], snpEntry["rpkm"]]
142 outputItems.append(snpEntry["sense"])
144 line = string.join(outputItems, "\t")
145 snpGeneOutputList.append(line)
147 snpGeneOutputList.sort(reverse=True)
149 return snpGeneOutputList
152 def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense):
156 for geneDescriptor in geneDict.keys():
158 (symbol, geneID) = geneDescriptor
159 genePositionList = geneDict[geneDescriptor]["position"]
160 genePositionList.sort()
162 for position in genePositionList:
163 if snpDict[position] in alreadyDoneList:
166 snpGeneInfoDict = {"symbol": symbol,
170 if geneID in rpkmDict:
171 rpkm = str(rpkmDict[geneID])
173 snpGeneInfoDict["rpkm"] = rpkm
174 snpGeneInfoDict["snpDescription"] = snpDict[position][:-1]
176 snpGeneInfoDict["sense"] = geneDict[geneDescriptor]["sense"]
178 alreadyDoneList.append(snpDict[position])
179 snpGeneInfoList.append(snpGeneInfoDict)
181 snpGeneInfoList.sort(reverse=True)
183 return snpGeneInfoList
186 if __name__ == "__main__":