11 import sqlite3 as sqlite
12 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption
14 versionString = "checkrmask: version 3.6"
22 usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
24 parser = makeParser(usage)
25 (options, args) = parser.parse_args(argv[1:])
36 checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
39 def makeParser(usage=""):
40 parser = optparse.OptionParser(usage=usage)
41 parser.add_option("--cache", type="int", dest="cachePages")
42 parser.add_option("--startField", type="int", dest="startField")
43 parser.add_option("--log", dest="logfilename")
45 configParser = getConfigParser()
46 section = "checkrmask"
47 cachePages = getConfigIntOption(configParser, section, "cachePages", 500000)
48 startField = getConfigIntOption(configParser, section, "startField", 0)
49 logfilename = getConfigOption(configParser, section, "logfilename", None)
51 parser.set_defaults(cachePages=cachePages, startField=startField, logfilename=logfilename)
56 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
58 outfile = open(outFileName, "w")
59 goodfile = open(goodFileName, "w")
63 if cachePages < 250000:
67 if logfilename is not None:
68 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
71 infile = open(filename)
72 if os.path.isfile(dbfile):
73 db = sqlite.connect(dbfile)
75 sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
76 sql.execute("PRAGMA temp_store = MEMORY")
78 print "No database - passing through"
80 writeLog(logfilename, versionString, "No database - passing through")
83 outfile.write("%s\tNR\tNR\t0.00\n" % line)
97 fields = line.strip().split("\t")
98 chrom = fields[startField][3:]
99 start = int(fields[startField + 1])
100 stop = int(fields[startField + 2])
101 featureList.append((chrom,start, stop))
102 featureDict[(chrom, start, stop)] = line.strip()
110 for (chrom, start, stop) in featureList:
111 if chrom != currentChrom:
114 if start > currentMax:
116 currentMin = currentMax
117 currentMax += increment
118 print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
124 con = sqlite.connect(":memory:")
125 sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000))
126 results = sql.fetchall()
128 con.execute("create table repeats(name, family, start, stop)")
129 con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
130 con.execute("PRAGMA temp_store = MEMORY")
131 for (rstart, rstop, name, family) in results:
132 results2.append((name, family, int(rstart), int(rstop)))
134 con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2)
135 con.execute("CREATE INDEX posIndex on repeats(start, stop)")
136 print chrom, len(results2)
139 featureLength = abs(stop - start)
142 sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start))
143 results = sql2.fetchall()
144 for (rstart, rstop, name, family) in results:
145 overlapLength = float(abs(rstop - start))
146 if overlapLength > featureLength:
147 overlapLength = featureLength
149 ratio = overlapLength / featureLength
150 if (name, family, ratio) not in finalresults:
151 finalresults.append((name, family, ratio))
153 sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop))
154 results = sql2.fetchall()
155 for (rstart, rstop, name, family) in results:
156 overlapLength = float(abs(rstart - stop))
157 if overlapLength > featureLength:
158 overlapLength = featureLength
160 ratio = overlapLength / featureLength
161 if (name, family, ratio) not in finalresults:
162 finalresults.append((name, family, ratio))
164 sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop))
165 results = sql2.fetchall()
166 for (rstart, rstop, name, family) in results:
167 overlapLength = float(abs(rstop - rstart))
168 if overlapLength > featureLength:
169 overlapLength = featureLength
171 ratio = overlapLength / featureLength
172 if (name, family, ratio) not in finalresults:
173 finalresults.append((name, family, ratio))
175 sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop))
176 results = sql2.fetchall()
177 for (rstart, rstop, name, family) in results:
178 overlapLength = float(abs(rstop - rstart))
179 if overlapLength > featureLength:
180 overlapLength = featureLength
182 ratio = overlapLength / featureLength
183 if (name, family, ratio) not in finalresults:
184 finalresults.append((name, family, ratio))
186 line = featureDict[(chrom, start, stop)]
188 for (name, family, fraction) in finalresults:
189 outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
192 outfile.write(outline + "\n")
194 if len(finalresults) == 0:
195 outline = "%s\tNR\tNR\t0.00" % line
197 outfile.write(outline + "\n")
200 goodfile.write(line + "\n")
203 if __name__ == "__main__":