erange version 4.0a dev release
[erange.git] / checkrmask.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import string
9 import optparse
10 import os.path
11 import sqlite3 as sqlite
12 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption
13
14 versionString = "checkrmask: version 3.6"
15 print versionString
16
17
18 def main(argv=None):
19     if not argv:
20         argv = sys.argv
21
22     usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
23
24     parser = makeParser(usage)
25     (options, args) = parser.parse_args(argv[1:])
26
27     if len(args) < 4:
28         print usage
29         sys.exit(1)
30
31     dbfile = args[0]
32     filename = args[1]
33     outfile = args[2]
34     goodfile = args[3]
35
36     checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
37
38
39 def makeParser(usage=""):
40     parser = optparse.OptionParser(usage=usage)
41     parser.add_option("--cache", type="int", dest="cachePages")
42     parser.add_option("--startField", type="int", dest="startField")
43     parser.add_option("--log", dest="logfilename")
44
45     configParser = getConfigParser()
46     section = "checkrmask"
47     cachePages = getConfigIntOption(configParser, section, "cachePages", 500000)
48     startField = getConfigIntOption(configParser, section, "startField", 0)
49     logfilename = getConfigOption(configParser, section, "logfilename", None)
50
51     parser.set_defaults(cachePages=cachePages, startField=startField, logfilename=logfilename)
52
53     return parser
54
55
56 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
57
58     outfile = open(outFileName, "w")
59     goodfile = open(goodFileName, "w")
60     if startField < 0:
61         startField = 0
62
63     if cachePages < 250000:
64         cachePages = 250000
65
66     doLog = False
67     if logfilename is not None:
68         writeLog(logfilename, versionString, string.join(sys.argv[1:]))
69         doLog = True
70
71     infile = open(filename)
72     if os.path.isfile(dbfile):
73         db = sqlite.connect(dbfile)
74         sql = db.cursor()
75         sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
76         sql.execute("PRAGMA temp_store = MEMORY")
77     else:
78         print "No database - passing through"
79         if doLog:
80             writeLog(logfilename, versionString, "No database - passing through")
81
82         for line in infile:
83             outfile.write("%s\tNR\tNR\t0.00\n" % line)
84             goodfile.write(line)
85
86         outfile.close()
87         goodfile.close()
88         sys.exit(0)
89
90     featureList = []
91     featureDict = {}
92
93     for line in infile:
94         if line[0] == "#":
95             continue
96
97         fields = line.strip().split("\t")
98         chrom = fields[startField][3:]
99         start = int(fields[startField + 1])
100         stop = int(fields[startField + 2])
101         featureList.append((chrom,start, stop))
102         featureDict[(chrom, start, stop)] = line.strip()
103
104     infile.close()
105
106     featureList.sort()
107     currentChrom = ""
108     currentMax = 0
109     increment = 20000000
110     for (chrom, start, stop) in featureList:
111         if chrom != currentChrom:
112             currentMax = 0
113
114         if start > currentMax:
115             currentChrom = chrom
116             currentMin = currentMax
117             currentMax += increment
118             print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
119             try:
120                 del con
121             except:
122                 pass
123
124             con = sqlite.connect(":memory:")
125             sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000))
126             results = sql.fetchall()
127             results2 = []
128             con.execute("create table repeats(name, family, start, stop)")
129             con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
130             con.execute("PRAGMA temp_store = MEMORY")
131             for (rstart, rstop, name, family) in results:
132                 results2.append((name, family, int(rstart), int(rstop)))
133
134             con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2)
135             con.execute("CREATE INDEX posIndex on repeats(start, stop)")
136             print chrom, len(results2)
137             sql2 = con.cursor()
138
139         featureLength = abs(stop - start)
140         results = []
141         finalresults = []
142         sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start))
143         results = sql2.fetchall()
144         for (rstart, rstop, name, family) in results:
145             overlapLength = float(abs(rstop - start))
146             if overlapLength > featureLength:
147                 overlapLength = featureLength
148
149             ratio = overlapLength / featureLength
150             if (name, family, ratio) not in finalresults:
151                 finalresults.append((name, family, ratio))
152
153         sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop))
154         results = sql2.fetchall()
155         for (rstart, rstop, name, family) in results:
156             overlapLength = float(abs(rstart - stop))
157             if overlapLength > featureLength:
158                 overlapLength = featureLength
159
160             ratio = overlapLength / featureLength
161             if (name, family, ratio) not in finalresults:
162                 finalresults.append((name, family, ratio))
163
164         sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop))
165         results = sql2.fetchall()
166         for (rstart, rstop, name, family) in results:
167             overlapLength = float(abs(rstop - rstart))
168             if overlapLength > featureLength:
169                 overlapLength = featureLength
170
171             ratio = overlapLength / featureLength
172             if (name, family, ratio) not in finalresults:
173                 finalresults.append((name, family, ratio))
174
175         sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop))
176         results = sql2.fetchall()
177         for (rstart, rstop, name, family) in results:
178             overlapLength = float(abs(rstop - rstart))
179             if overlapLength > featureLength:
180                 overlapLength = featureLength
181
182             ratio = overlapLength / featureLength
183             if (name, family, ratio) not in finalresults:
184                 finalresults.append((name, family, ratio))
185
186         line = featureDict[(chrom, start, stop)]
187         total = 0.
188         for (name, family, fraction) in finalresults:
189             outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
190             total += fraction
191             print outline
192             outfile.write(outline + "\n")
193
194         if len(finalresults) == 0:
195             outline = "%s\tNR\tNR\t%0.00" % line
196             print outline
197             outfile.write(outline + "\n")
198
199         if total < 0.2:
200             goodfile.write(line + "\n")
201
202
203 if __name__ == "__main__":
204     main(sys.argv)