11 import sqlite3 as sqlite
12 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption
14 versionString = "checkrmask: version 3.6"
22 usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
24 parser = makeParser(usage)
25 (options, args) = parser.parse_args(argv[1:])
36 checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
39 def makeParser(usage=""):
40 parser = optparse.OptionParser(usage=usage)
41 parser.add_option("--cache", type="int", dest="cachePages")
42 parser.add_option("--startField", type="int", dest="startField")
43 parser.add_option("--log", dest="logfilename")
45 configParser = getConfigParser()
46 section = "checkrmask"
47 cachePages = getConfigIntOption(configParser, section, "cachePages", 500000)
48 startField = getConfigIntOption(configParser, section, "startField", 0)
49 logfilename = getConfigOption(configParser, section, "logfilename", None)
51 parser.set_defaults(cachePages=cachePages, startField=startField, logfilename=logfilename)
56 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
58 if os.path.isfile(dbfile):
59 checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField, cachePages, logfilename)
61 outfile = open(outFileName, "w")
62 goodfile = open(goodFileName, "w")
63 infile = open(filename)
64 print "No database - passing through"
65 if logfilename is not None:
66 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
67 writeLog(logfilename, versionString, "No database - passing through")
70 outfile.write("%s\tNR\tNR\t0.00\n" % line)
77 def checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
79 outfile = open(outFileName, "w")
80 goodfile = open(goodFileName, "w")
84 if cachePages < 250000:
87 if logfilename is not None:
88 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
90 infile = open(filename)
91 db = sqlite.connect(dbfile)
93 sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
94 sql.execute("PRAGMA temp_store = MEMORY")
103 fields = line.strip().split("\t")
104 chrom = fields[startField][3:]
105 start = int(fields[startField + 1])
106 stop = int(fields[startField + 2])
107 featureList.append((chrom,start, stop))
108 featureDict[(chrom, start, stop)] = line.strip()
116 for (chrom, start, stop) in featureList:
117 if chrom != currentChrom:
120 if start > currentMax:
122 currentMin = currentMax
123 currentMax += increment
124 print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
130 con = sqlite.connect(":memory:")
131 sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000))
132 results = sql.fetchall()
134 con.execute("create table repeats(name, family, start, stop)")
135 con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
136 con.execute("PRAGMA temp_store = MEMORY")
137 for (rstart, rstop, name, family) in results:
138 results2.append((name, family, int(rstart), int(rstop)))
140 con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2)
141 con.execute("CREATE INDEX posIndex on repeats(start, stop)")
142 print chrom, len(results2)
145 featureLength = abs(stop - start)
148 sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start))
149 results = sql2.fetchall()
150 for (rstart, rstop, name, family) in results:
151 overlapLength = float(abs(rstop - start))
152 if overlapLength > featureLength:
153 overlapLength = featureLength
155 ratio = overlapLength / featureLength
156 if (name, family, ratio) not in finalresults:
157 finalresults.append((name, family, ratio))
159 sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop))
160 results = sql2.fetchall()
161 for (rstart, rstop, name, family) in results:
162 overlapLength = float(abs(rstart - stop))
163 if overlapLength > featureLength:
164 overlapLength = featureLength
166 ratio = overlapLength / featureLength
167 if (name, family, ratio) not in finalresults:
168 finalresults.append((name, family, ratio))
170 sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop))
171 results = sql2.fetchall()
172 for (rstart, rstop, name, family) in results:
173 overlapLength = float(abs(rstop - rstart))
174 if overlapLength > featureLength:
175 overlapLength = featureLength
177 ratio = overlapLength / featureLength
178 if (name, family, ratio) not in finalresults:
179 finalresults.append((name, family, ratio))
181 sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop))
182 results = sql2.fetchall()
183 for (rstart, rstop, name, family) in results:
184 overlapLength = float(abs(rstop - rstart))
185 if overlapLength > featureLength:
186 overlapLength = featureLength
188 ratio = overlapLength / featureLength
189 if (name, family, ratio) not in finalresults:
190 finalresults.append((name, family, ratio))
192 line = featureDict[(chrom, start, stop)]
194 for (name, family, fraction) in finalresults:
195 outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
198 outfile.write(outline + "\n")
200 if len(finalresults) == 0:
201 outline = "%s\tNR\tNR\t0.00" % line
203 outfile.write(outline + "\n")
206 goodfile.write(line + "\n")
209 if __name__ == "__main__":