first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / getSNPGeneInfo.py
1 #
2 #  getSNPGeneInfo.py
3 #  ENRAGE
4 #
5 # This script look for the gene info and expression level for the snps.
6 # Written by: Wendy Lee
7 # Written on: August 7th, 2008
8
9 try:
10     import psyco
11     psyco.full()
12 except:
13     print 'psyco not running'
14
15 import sys
16 import optparse
17 import string
18 from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
19 from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
20
21 print "getSNPGeneInfo: version 4.6"
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
28
29     parser = makeParser(usage)
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(args) < 4:
33         print usage
34         sys.exit(2)
35
36     genome = args[0]
37     infilename = args[1]
38     rpkmfilename = args[2]
39     outfilename = args [3]
40
41     writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
42
43
44 def makeParser(usage=""):
45     parser = optparse.OptionParser(usage=usage)
46     parser.add_option("--cache", action="store_true", dest="cachePages")
47     parser.add_option("--withoutsense", action="store_false", dest="withSense")
48     parser.add_option("--flank", type="int", dest="flankBP")
49
50     configParser = getConfigParser()
51     section = "getSNPGeneInfo"
52     doCache = getConfigBoolOption(configParser, section, "doCache", False)
53     withSense = getConfigBoolOption(configParser, section, "withSense", True)
54     flankBP = getConfigIntOption(configParser, section, "flankBP", 0)
55
56     parser.set_defaults(doCache=doCache, withSense=withSense, flankBP=flankBP)
57
58     return parser
59
60
61 def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
62
63     outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
64     outfile = open(outfilename, "w")
65
66     for outputLine in outList:
67         outfile.write("%s\n" % outputLine)
68
69     outfile.close()
70
71
72 def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=True, flankBP=0):
73
74     rpkmDict = {}
75     rpkmField = 3
76     if rpkmfilename != "NONE":
77         rpkmfile = open(rpkmfilename, "r")
78         for line in rpkmfile:
79             lineFields = line.split()
80             rpkmDict[lineFields[0]] = lineFields[rpkmField]
81
82         rpkmfile.close()
83
84     infile = open(infilename)
85     snpPositionList = []
86     snpDict = {}
87
88     for line in infile:
89         if doNotProcessLine(line):
90             continue
91
92         fields = line.split("\t")
93         chrom = fields[2][3:]
94         start = int(fields[3])
95         chromosomePosition = (chrom, start)
96         snpPositionList.append(chromosomePosition)
97         snpDict[chromosomePosition] = line
98
99     if doCache:
100         cacheGeneDB(genome)
101         print "cached %s" % genome
102
103     geneinfoDict = getGeneInfoDict(genome, cache=doCache)
104     geneDict = {}
105
106     if flankBP > 0:
107         matchingGenesDict = genesIntersecting(genome, snpPositionList, flank=flankBP)
108     else:
109         matchingGenesDict = genesIntersecting(genome, snpPositionList)
110
111     for pos in matchingGenesDict:
112         geneID = matchingGenesDict[pos][0][0]
113         try:
114             symbol = geneinfoDict[geneID][0][0]
115         except:
116             symbol = "LOC%s" % geneID
117
118         geneDescriptor = (symbol, geneID)
119         if geneDescriptor in geneDict:
120             geneDict[geneDescriptor]["position"].append(pos)
121         else:
122             geneDict[geneDescriptor] = {"position": [pos],
123                                         "sense": matchingGenesDict[pos][0][-1]}
124
125     if doCache:
126         uncacheGeneDB(genome)
127
128     return getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense)
129
130
131 def doNotProcessLine(line):
132     return line[0] == "#"
133
134
135 def getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense):
136     snpGeneOutputList = []
137     snpGeneInfoList = getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense)
138
139     for snpEntry in snpGeneInfoList:
140         outputItems = [snpEntry["snpDescription"], snpEntry["symbol"], snpEntry["geneID"], snpEntry["rpkm"]]
141         if withSense:
142             outputItems.append(snpEntry["sense"])
143
144         line = string.join(outputItems, "\t")
145         snpGeneOutputList.append(line)
146
147     snpGeneOutputList.sort(reverse=True)
148
149     return snpGeneOutputList
150
151
152 def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense):
153
154     snpGeneInfoList = []
155
156     for geneDescriptor in geneDict.keys():
157         alreadyDoneList = []
158         (symbol, geneID) = geneDescriptor
159         genePositionList = geneDict[geneDescriptor]["position"]
160         genePositionList.sort()
161
162         for position in genePositionList:
163             if snpDict[position] in alreadyDoneList:
164                 continue
165
166             snpGeneInfoDict = {"symbol": symbol,
167                                "geneID": geneID}
168
169             rpkm = "N\A"
170             if geneID in rpkmDict:
171                 rpkm = str(rpkmDict[geneID])
172
173             snpGeneInfoDict["rpkm"] = rpkm
174             snpGeneInfoDict["snpDescription"] = snpDict[position][:-1]
175             if withSense:
176                 snpGeneInfoDict["sense"] = geneDict[geneDescriptor]["sense"]
177
178             alreadyDoneList.append(snpDict[position])
179             snpGeneInfoList.append(snpGeneInfoDict)
180
181     snpGeneInfoList.sort(reverse=True)
182
183     return snpGeneInfoList
184
185
186 if __name__ == "__main__":
187     main(sys.argv)