first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / chksnp.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import optparse
9 import tempfile
10 import shutil
11 import os
12 import string
13 import sqlite3 as sqlite
14 from commoncode import getConfigParser, getConfigOption
15
16 print "chksnp: version 3.7"
17
18
19 def main(argv=None):
20     if not argv:
21         argv = sys.argv
22
23     usage = "usage: python %prog dbfile snpsfile dbsnp_outfile [--cache numPages] [--snpDB dbfile]"
24
25     parser = optparse.OptionParser(usage=usage)
26     parser.add_option("--cache", type="int", dest="cachePages")
27     parser.add_option("--snpDB", action="append", dest="snpDBList",
28                       help="additional snp db files to check will be searched in order given")
29     parser.set_defaults(cachePages=None, snpDBList=[])
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(args) < 3:
33         print usage
34         sys.exit(1)
35
36     dbfile = args[0]
37     infile = args[1]
38     outfile = args[2]
39
40     chkSNPFile(dbfile, infile, outfile, options.cachePages, options.snpDBList)
41
42
43 def chkSNPFile(dbfile, inputFileName, outputFileName, cachePages=None, snpDBList=[]):
44
45     snpInputFile = open(inputFileName)
46     snpLocationList, snpDict = getSNPLocationInfo(snpInputFile)
47
48     dbList = [dbfile]
49     for dbFileName in snpDBList:
50         dbList.append(dbFileName)
51
52     annotatedSnpDict = annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages)
53
54     outputFile = open(outputFileName, "w")
55     outputLine = ""
56     outputFile.write(outputLine)
57     for key,value in annotatedSnpDict.iteritems():
58         outputLine = "%s\n" % str(value)
59         outputFile.write(outputLine)
60
61     outputFile.close()
62
63
64 def chkSNP(dbList, snpPropertiesList, cachePages=None):
65
66     snpLocationList, snpDict = getSNPLocationInfo(snpPropertiesList)
67     return annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages)
68
69
70 def getSNPLocationInfo(snpPropertiesList):
71     snpLocationList = []
72     snpDict = {}
73
74     for line in snpPropertiesList:
75         if doNotProcessLine(line):
76             continue
77
78         fields = line.strip().split("\t")
79         chromosome = fields[2][3:]
80         position = int(fields[3])
81         snpLocation = (chromosome, position)
82         snpLocationList.append(snpLocation)
83         snpDict[snpLocation] = line.strip()
84
85     snpLocationList.sort()
86
87     return snpLocationList, snpDict
88
89
90 def doNotProcessLine(line):
91     return line[0] == "#"
92
93
94 def annotateSNPFromDB(snpLocationList, snpDict, dbFileName, cachePages=None):
95     return annotateSNPFromDBList(snpLocationList, snpDict, [dbFileName], cachePages)
96
97
98 def annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=None):
99
100     configParser = getConfigParser()
101     cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
102     tempfile.tempdir = cisTemp
103
104     for dbFileName in dbList:
105         if cachePages is not None:
106             print "caching locally..."
107             cachefile = "%s.db" % tempfile.mktemp()
108             shutil.copyfile(dbFileName, cachefile)
109             db = sqlite.connect(cachefile)
110             doCache = True
111             print "cached..."
112         else:
113             db = sqlite.connect(dbFileName)
114             doCache = False
115
116         cacheSize = max(cachePages, 500000)
117         sql = db.cursor()
118         sql.execute("PRAGMA CACHE_SIZE = %d" % cacheSize)
119         sql.execute("PRAGMA temp_store = MEMORY")
120
121         index = 0
122         foundEntries = []
123         for chromosomePosition in snpLocationList:
124             (chromosome, position) = chromosomePosition
125             found = False
126             results = []
127             index += 1
128             startPosition = position - 1
129             sql.execute("select func, name from snp where chrom = '%s' and start = %d and stop = %d" % (chromosome, startPosition, position)) 
130             results = sql.fetchall()
131             try:
132                 (func, name) = results[0]
133                 found = True
134             except IndexError:
135                 sql.execute("select func, name from snp where chrom = '%s' and start <= %d and stop >= %d" % (chromosome, startPosition, position))
136                 results = sql.fetchall()
137                 try:
138                     (func, name) = results[0]
139                     found = True
140                 except IndexError:
141                     pass
142
143             if found:
144                 snpEntry = snpDict[chromosomePosition]
145                 snpDict[chromosomePosition] = string.join([snpEntry, str(name), str(func)], "\t")
146                 foundEntries.append(chromosomePosition)
147
148             if index % 100 == 0:
149                 print ".",
150                 sys.stdout.flush()
151
152         for chromosomePosition in foundEntries:
153             del snpLocationList[snpLocationList.index(chromosomePosition)]
154
155         if doCache:
156             print "\nremoving cache"
157             del db
158             os.remove(cachefile)
159
160     for chromosomePosition in snpLocationList:
161         snpEntry = snpDict[chromosomePosition]
162         snpDict[chromosomePosition] = string.join([snpEntry, "N\A", "N\A"], "\t")
163
164     return snpDict
165
166
167 if __name__ == "__main__":
168     main(sys.argv)