snapshot of 4.0a development. initial git repo commit
[erange.git] / getSNPGeneInfo.py
1 #
2 #  getSNPGeneInfo.py
3 #  ENRAGE
4 #
5 # This script look for the gene info and expression level for the snps.
6 # Written by: Wendy Lee
7 # Written on: August 7th, 2008
8
9 try:
10     import psyco
11     psyco.full()
12 except:
13     print 'psyco not running'
14
15 import sys
16 import optparse
17 import string
18 from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
19 from cistematic.core.geneinfo import geneinfoDB
20
21 print "%prog: version 4.5"
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
28
29     parser = optparse.OptionParser(usage=usage)
30     parser.add_option("--cache", action="store_true", dest="cachePages")
31     parser.add_option("--withoutsense", action="store_false", dest="withSense")
32     parser.add_option("--flank", type="int", dest="flankBP")
33     parser.set_defaults(doCache=False, withSense=True, flankBP=0)
34     (options, args) = parser.parse_args(argv[1:])
35
36     if len(args) < 4:
37         print usage
38         sys.exit(2)
39
40     genome = args[0]
41     infilename = args[1]
42     rpkmfilename = args[2]
43     outfilename = args [3]
44
45     writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
46
47
48 def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
49
50     outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
51     outfile = open(outfilename, "w")
52
53     for outputLine in outList:
54         outfile.write("%s\n" % outputLine)
55
56     outfile.close()
57
58
59 def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=True, flankBP=0):
60
61     rpkmDict = {}
62     rpkmField = 3
63     if rpkmfilename != "NONE":
64         rpkmfile = open(rpkmfilename, "r")
65         for line in rpkmfile:
66             lineFields = line.split()
67             rpkmDict[lineFields[0]] = lineFields[rpkmField]
68
69         rpkmfile.close()
70
71     infile = open(infilename)
72     snpPositionList = []
73     snpDict = {}
74
75     for line in infile:
76         if doNotProcessLine(line):
77             continue
78
79         fields = line.split("\t")
80         chrom = fields[2][3:]
81         start = int(fields[3])
82         chromosomePosition = (chrom, start)
83         snpPositionList.append(chromosomePosition)
84         snpDict[chromosomePosition] = line
85
86     if doCache:
87         cacheGeneDB(genome)
88         idb = geneinfoDB(cache=True)
89         print "cached %s" % genome
90     else:
91         idb = geneinfoDB()
92
93     geneinfoDict = idb.getallGeneInfo(genome)
94     geneDict = {}
95
96     if flankBP > 0:
97         matchingGenesDict = genesIntersecting(genome, snpPositionList, flank=flankBP)
98     else:
99         matchingGenesDict = genesIntersecting(genome, snpPositionList)
100
101     for pos in matchingGenesDict:
102         geneID = matchingGenesDict[pos][0][0]
103         try:
104             symbol = geneinfoDict[geneID][0][0]
105         except:
106             symbol = "LOC%s" % geneID
107
108         geneDescriptor = (symbol, geneID)
109         if geneDict.has_key(geneDescriptor):
110             geneDict[geneDescriptor]["position"].append(pos)
111         else:
112             geneDict[geneDescriptor] = {"position": [pos],
113                                         "sense": matchingGenesDict[pos][0][-1]}
114
115     if doCache:
116         uncacheGeneDB(genome)
117
118     return getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense)
119
120
121 def doNotProcessLine(line):
122     return line[0] == "#"
123
124
125 def getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense):
126     snpGeneOutputList = []
127     snpGeneInfoList = getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense)
128
129     for snpEntry in snpGeneInfoList:
130         outputItems = [snpEntry["snpDescription"], snpEntry["symbol"], snpEntry["geneID"], snpEntry["rpkm"]]
131         if withSense:
132             outputItems.append(snpEntry["sense"])
133
134         line = string.join(outputItems, "\t")
135         snpGeneOutputList.append(line)
136
137     snpGeneOutputList.sort(reverse=True)
138
139     return snpGeneOutputList
140
141
142 def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense):
143
144     snpGeneInfoList = []
145
146     for geneDescriptor in geneDict.keys():
147         alreadyDoneList = []
148         (symbol, geneID) = geneDescriptor
149         genePositionList = geneDict[geneDescriptor]["position"]
150         genePositionList.sort()
151
152         for position in genePositionList:
153             if snpDict[position] in alreadyDoneList:
154                 continue
155
156             snpGeneInfoDict = {"symbol": symbol,
157                                "geneID": geneID}
158
159             rpkm = "N\A"
160             if rpkmDict.has_key(geneID):
161                 rpkm = str(rpkmDict[geneID])
162
163             snpGeneInfoDict["rpkm"] = rpkm
164             snpGeneInfoDict["snpDescription"] = snpDict[position][:-1]
165             if withSense:
166                 snpGeneInfoDict["sense"] = geneDict[geneDescriptor]["sense"]
167
168             alreadyDoneList.append(snpDict[position])
169             snpGeneInfoList.append(snpGeneInfoDict)
170
171     snpGeneInfoList.sort(reverse=True)
172
173     return snpGeneInfoList
174
175
176 if __name__ == "__main__":
177     main(sys.argv)