snapshot of 4.0a development. initial git repo commit
[erange.git] / chksnp.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import optparse
9 import tempfile
10 import shutil
11 import os
12 import string
13 import sqlite3 as sqlite
14
15 print "version 3.6: %s" % sys.argv[0]
16
17
18 def main(argv=None):
19     if not argv:
20         argv = sys.argv
21
22     usage = "usage: python %prog dbfile snpsfile dbsnp_outfile [--cache numPages] [--snpDB dbfile]"
23
24     parser = optparse.OptionParser(usage=usage)
25     parser.add_option("--cache", type="int", dest="cachePages")
26     parser.add_option("--snpDB", action="append", dest="snpDBList",
27                       help="additional snp db files to check will be searched in order given")
28     parser.set_defaults(cachePages=None, snpDBList=[])
29     (options, args) = parser.parse_args(argv[1:])
30
31     if len(args) < 3:
32         print usage
33         sys.exit(1)
34
35     dbfile = args[0]
36     infile = args[1]
37     outfile = args[2]
38
39     chkSNPFile(dbfile, infile, outfile, options.cachePages, options.snpDBList)
40
41
42 def chkSNPFile(dbfile, inputFileName, outputFileName, cachePages=None, snpDBList=[]):
43
44     snpInputFile = open(inputFileName)
45     snpLocationList, snpDict = getSNPLocationInfo(snpInputFile)
46
47     dbList = [dbfile]
48     for dbFileName in snpDBList:
49         dbList.append(dbFileName)
50
51     annotatedSnpDict = annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages)
52
53     outputFile = open(outputFileName, "w")
54     outputLine = ""
55     outputFile.write(outputLine)
56     for key,value in annotatedSnpDict.iteritems():
57         outputLine = "%s\n" % str(value)
58         outputFile.write(outputLine)
59
60     outputFile.close()
61
62
63 def chkSNP(dbList, snpPropertiesList, cachePages=None):
64
65     snpLocationList, snpDict = getSNPLocationInfo(snpPropertiesList)
66     return annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages)
67
68
69 def getSNPLocationInfo(snpPropertiesList):
70     snpLocationList = []
71     snpDict = {}
72
73     for line in snpPropertiesList:
74         if doNotProcessLine(line):
75             continue
76
77         fields = line.strip().split("\t")
78         chromosome = fields[2][3:]
79         position = int(fields[3])
80         snpLocation = (chromosome, position)
81         snpLocationList.append(snpLocation)
82         snpDict[snpLocation] = line.strip()
83
84     snpLocationList.sort()
85
86     return snpLocationList, snpDict
87
88
89 def doNotProcessLine(line):
90     return line[0] == "#"
91
92
93 def annotateSNPFromDB(snpLocationList, snpDict, dbFileName, cachePages=None):
94     return annotateSNPFromDBList(snpLocationList, snpDict, [dbFileName], cachePages)
95
96
97 def annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=None):
98     if os.environ.get("CISTEMATIC_TEMP"):
99         cisTemp = os.environ.get("CISTEMATIC_TEMP")
100     else:
101         cisTemp = "/tmp"
102
103     tempfile.tempdir = cisTemp
104
105     for dbFileName in dbList:
106         if cachePages is not None:
107             print "caching locally..."
108             cachefile = "%s.db" % tempfile.mktemp()
109             shutil.copyfile(dbFileName, cachefile)
110             db = sqlite.connect(cachefile)
111             doCache = True
112             print "cached..."
113         else:
114             db = sqlite.connect(dbFileName)
115             doCache = False
116
117         cacheSize = max(cachePages, 500000)
118         sql = db.cursor()
119         sql.execute("PRAGMA CACHE_SIZE = %d" % cacheSize)
120         sql.execute("PRAGMA temp_store = MEMORY")
121
122         index = 0
123         foundEntries = []
124         for chromosomePosition in snpLocationList:
125             (chromosome, position) = chromosomePosition
126             found = False
127             results = []
128             index += 1
129             startPosition = position - 1
130             sql.execute("select func, name from snp where chrom = '%s' and start = %d and stop = %d" % (chromosome, startPosition, position)) 
131             results = sql.fetchall()
132             try:
133                 (func, name) = results[0]
134                 found = True
135             except IndexError:
136                 sql.execute("select func, name from snp where chrom = '%s' and start <= %d and stop >= %d" % (chromosome, startPosition, position))
137                 results = sql.fetchall()
138                 try:
139                     (func, name) = results[0]
140                     found = True
141                 except IndexError:
142                     pass
143
144             if found:
145                 snpEntry = snpDict[chromosomePosition]
146                 snpDict[chromosomePosition] = string.join([snpEntry, str(name), str(func)], "\t")
147                 foundEntries.append(chromosomePosition)
148
149             if index % 100 == 0:
150                 print ".",
151                 sys.stdout.flush()
152
153         for chromosomePosition in foundEntries:
154             del snpLocationList[snpLocationList.index(chromosomePosition)]
155
156         if doCache:
157             print "\nremoving cache"
158             del db
159             os.remove(cachefile)
160
161     for chromosomePosition in snpLocationList:
162         snpEntry = snpDict[chromosomePosition]
163         snpDict[chromosomePosition] = string.join([snpEntry, "N\A", "N\A"], "\t")
164
165     return snpDict
166
167
168 if __name__ == "__main__":
169     main(sys.argv)