first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / checkrmask.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import string
9 import optparse
10 import os.path
11 import sqlite3 as sqlite
12 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption
13
14 versionString = "checkrmask: version 3.6"
15 print versionString
16
17
18 def main(argv=None):
19     if not argv:
20         argv = sys.argv
21
22     usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
23
24     parser = makeParser(usage)
25     (options, args) = parser.parse_args(argv[1:])
26
27     if len(args) < 4:
28         print usage
29         sys.exit(1)
30
31     dbfile = args[0]
32     filename = args[1]
33     outfile = args[2]
34     goodfile = args[3]
35
36     checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
37
38
39 def makeParser(usage=""):
40     parser = optparse.OptionParser(usage=usage)
41     parser.add_option("--cache", type="int", dest="cachePages")
42     parser.add_option("--startField", type="int", dest="startField")
43     parser.add_option("--log", dest="logfilename")
44
45     configParser = getConfigParser()
46     section = "checkrmask"
47     cachePages = getConfigIntOption(configParser, section, "cachePages", 500000)
48     startField = getConfigIntOption(configParser, section, "startField", 0)
49     logfilename = getConfigOption(configParser, section, "logfilename", None)
50
51     parser.set_defaults(cachePages=cachePages, startField=startField, logfilename=logfilename)
52
53     return parser
54
55
56 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
57
58     if os.path.isfile(dbfile):
59         checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField, cachePages, logfilename)
60     else:
61         outfile = open(outFileName, "w")
62         goodfile = open(goodFileName, "w")
63         infile = open(filename)
64         print "No database - passing through"
65         if logfilename is not None:
66             writeLog(logfilename, versionString, string.join(sys.argv[1:]))
67             writeLog(logfilename, versionString, "No database - passing through")
68
69         for line in infile:
70             outfile.write("%s\tNR\tNR\t0.00\n" % line)
71             goodfile.write(line)
72
73         outfile.close()
74         goodfile.close()
75
76
77 def checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
78
79     outfile = open(outFileName, "w")
80     goodfile = open(goodFileName, "w")
81     if startField < 0:
82         startField = 0
83
84     if cachePages < 250000:
85         cachePages = 250000
86
87     if logfilename is not None:
88         writeLog(logfilename, versionString, string.join(sys.argv[1:]))
89
90     infile = open(filename)
91     db = sqlite.connect(dbfile)
92     sql = db.cursor()
93     sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
94     sql.execute("PRAGMA temp_store = MEMORY")
95
96     featureList = []
97     featureDict = {}
98
99     for line in infile:
100         if line[0] == "#":
101             continue
102
103         fields = line.strip().split("\t")
104         chrom = fields[startField][3:]
105         start = int(fields[startField + 1])
106         stop = int(fields[startField + 2])
107         featureList.append((chrom,start, stop))
108         featureDict[(chrom, start, stop)] = line.strip()
109
110     infile.close()
111
112     featureList.sort()
113     currentChrom = ""
114     currentMax = 0
115     increment = 20000000
116     for (chrom, start, stop) in featureList:
117         if chrom != currentChrom:
118             currentMax = 0
119
120         if start > currentMax:
121             currentChrom = chrom
122             currentMin = currentMax
123             currentMax += increment
124             print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
125             try:
126                 del con
127             except:
128                 pass
129
130             con = sqlite.connect(":memory:")
131             sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000))
132             results = sql.fetchall()
133             results2 = []
134             con.execute("create table repeats(name, family, start, stop)")
135             con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
136             con.execute("PRAGMA temp_store = MEMORY")
137             for (rstart, rstop, name, family) in results:
138                 results2.append((name, family, int(rstart), int(rstop)))
139
140             con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2)
141             con.execute("CREATE INDEX posIndex on repeats(start, stop)")
142             print chrom, len(results2)
143             sql2 = con.cursor()
144
145         featureLength = abs(stop - start)
146         results = []
147         finalresults = []
148         sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start))
149         results = sql2.fetchall()
150         for (rstart, rstop, name, family) in results:
151             overlapLength = float(abs(rstop - start))
152             if overlapLength > featureLength:
153                 overlapLength = featureLength
154
155             ratio = overlapLength / featureLength
156             if (name, family, ratio) not in finalresults:
157                 finalresults.append((name, family, ratio))
158
159         sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop))
160         results = sql2.fetchall()
161         for (rstart, rstop, name, family) in results:
162             overlapLength = float(abs(rstart - stop))
163             if overlapLength > featureLength:
164                 overlapLength = featureLength
165
166             ratio = overlapLength / featureLength
167             if (name, family, ratio) not in finalresults:
168                 finalresults.append((name, family, ratio))
169
170         sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop))
171         results = sql2.fetchall()
172         for (rstart, rstop, name, family) in results:
173             overlapLength = float(abs(rstop - rstart))
174             if overlapLength > featureLength:
175                 overlapLength = featureLength
176
177             ratio = overlapLength / featureLength
178             if (name, family, ratio) not in finalresults:
179                 finalresults.append((name, family, ratio))
180
181         sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop))
182         results = sql2.fetchall()
183         for (rstart, rstop, name, family) in results:
184             overlapLength = float(abs(rstop - rstart))
185             if overlapLength > featureLength:
186                 overlapLength = featureLength
187
188             ratio = overlapLength / featureLength
189             if (name, family, ratio) not in finalresults:
190                 finalresults.append((name, family, ratio))
191
192         line = featureDict[(chrom, start, stop)]
193         total = 0.
194         for (name, family, fraction) in finalresults:
195             outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
196             total += fraction
197             print outline
198             outfile.write(outline + "\n")
199
200         if len(finalresults) == 0:
201             outline = "%s\tNR\tNR\t0.00" % line
202             print outline
203             outfile.write(outline + "\n")
204
205         if total < 0.2:
206             goodfile.write(line + "\n")
207
208
209 if __name__ == "__main__":
210     main(sys.argv)