5 # This script look for the gene info and expression level for the snps.
6 # Written by: Wendy Lee
7 # Written on: August 7th, 2008
13 print 'psyco not running'
18 from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
19 from cistematic.core.geneinfo import geneinfoDB
21 print "%prog: version 4.5"
27 usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
29 parser = optparse.OptionParser(usage=usage)
30 parser.add_option("--cache", action="store_true", dest="cachePages")
31 parser.add_option("--withoutsense", action="store_false", dest="withSense")
32 parser.add_option("--flank", type="int", dest="flankBP")
33 parser.set_defaults(doCache=False, withSense=True, flankBP=0)
34 (options, args) = parser.parse_args(argv[1:])
42 rpkmfilename = args[2]
43 outfilename = args [3]
45 writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
48 def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
50 outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
51 outfile = open(outfilename, "w")
53 for outputLine in outList:
54 outfile.write("%s\n" % outputLine)
59 def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=True, flankBP=0):
63 if rpkmfilename != "NONE":
64 rpkmfile = open(rpkmfilename, "r")
66 lineFields = line.split()
67 rpkmDict[lineFields[0]] = lineFields[rpkmField]
71 infile = open(infilename)
76 if doNotProcessLine(line):
79 fields = line.split("\t")
81 start = int(fields[3])
82 chromosomePosition = (chrom, start)
83 snpPositionList.append(chromosomePosition)
84 snpDict[chromosomePosition] = line
88 idb = geneinfoDB(cache=True)
89 print "cached %s" % genome
93 geneinfoDict = idb.getallGeneInfo(genome)
97 matchingGenesDict = genesIntersecting(genome, snpPositionList, flank=flankBP)
99 matchingGenesDict = genesIntersecting(genome, snpPositionList)
101 for pos in matchingGenesDict:
102 geneID = matchingGenesDict[pos][0][0]
104 symbol = geneinfoDict[geneID][0][0]
106 symbol = "LOC%s" % geneID
108 geneDescriptor = (symbol, geneID)
109 if geneDict.has_key(geneDescriptor):
110 geneDict[geneDescriptor]["position"].append(pos)
112 geneDict[geneDescriptor] = {"position": [pos],
113 "sense": matchingGenesDict[pos][0][-1]}
116 uncacheGeneDB(genome)
118 return getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense)
121 def doNotProcessLine(line):
122 return line[0] == "#"
125 def getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense):
126 snpGeneOutputList = []
127 snpGeneInfoList = getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense)
129 for snpEntry in snpGeneInfoList:
130 outputItems = [snpEntry["snpDescription"], snpEntry["symbol"], snpEntry["geneID"], snpEntry["rpkm"]]
132 outputItems.append(snpEntry["sense"])
134 line = string.join(outputItems, "\t")
135 snpGeneOutputList.append(line)
137 snpGeneOutputList.sort(reverse=True)
139 return snpGeneOutputList
142 def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense):
146 for geneDescriptor in geneDict.keys():
148 (symbol, geneID) = geneDescriptor
149 genePositionList = geneDict[geneDescriptor]["position"]
150 genePositionList.sort()
152 for position in genePositionList:
153 if snpDict[position] in alreadyDoneList:
156 snpGeneInfoDict = {"symbol": symbol,
160 if rpkmDict.has_key(geneID):
161 rpkm = str(rpkmDict[geneID])
163 snpGeneInfoDict["rpkm"] = rpkm
164 snpGeneInfoDict["snpDescription"] = snpDict[position][:-1]
166 snpGeneInfoDict["sense"] = geneDict[geneDescriptor]["sense"]
168 alreadyDoneList.append(snpDict[position])
169 snpGeneInfoList.append(snpGeneInfoDict)
171 snpGeneInfoList.sort(reverse=True)
173 return snpGeneInfoList
176 if __name__ == "__main__":