7 import sqlite3 as sqlite
8 import sys, string, optparse
10 from commoncode import writeLog
12 versionString = "%prog: version 3.5"
20 usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
22 parser = optparse.OptionParser(usage=usage)
23 parser.add_option("--cache", type="int", dest="cachePages")
24 parser.add_option("--startField", type="int", dest="startField")
25 parser.add_option("--log", dest="logfilename")
26 parser.set_defaults(cachePages=500000, startField=0, logfilename=None)
27 (options, args) = parser.parse_args(argv[1:])
38 checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
41 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
43 outfile = open(outFileName, "w")
44 goodfile = open(goodFileName, "w")
48 if cachePages < 250000:
52 if logfilename is not None:
53 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
56 infile = open(filename)
57 if os.path.isfile(dbfile):
58 db = sqlite.connect(dbfile)
60 sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
61 sql.execute("PRAGMA temp_store = MEMORY")
63 print "No database - passing through"
65 writeLog(logfilename, versionString, "No database - passing through")
68 outfile.write("%s\tNR\tNR\t0.00\n" % line)
82 fields = line.strip().split("\t")
83 chrom = fields[startField][3:]
84 start = int(fields[startField + 1])
85 stop = int(fields[startField + 2])
86 featureList.append((chrom,start, stop))
87 featureDict[(chrom, start, stop)] = line.strip()
95 for (chrom, start, stop) in featureList:
96 if chrom != currentChrom:
99 if start > currentMax:
101 currentMin = currentMax
102 currentMax += increment
103 print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
109 con = sqlite.connect(":memory:")
110 sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000))
111 results = sql.fetchall()
113 con.execute("create table repeats(name, family, start, stop)")
114 con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
115 con.execute("PRAGMA temp_store = MEMORY")
116 for (rstart, rstop, name, family) in results:
117 results2.append((name, family, int(rstart), int(rstop)))
119 con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2)
120 con.execute("CREATE INDEX posIndex on repeats(start, stop)")
121 print chrom, len(results2)
124 featureLength = abs(stop - start)
127 sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start))
128 results = sql2.fetchall()
129 for (rstart, rstop, name, family) in results:
130 overlapLength = float(abs(rstop - start))
131 if overlapLength > featureLength:
132 overlapLength = featureLength
134 ratio = overlapLength / featureLength
135 if (name, family, ratio) not in finalresults:
136 finalresults.append((name, family, ratio))
138 sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop))
139 results = sql2.fetchall()
140 for (rstart, rstop, name, family) in results:
141 overlapLength = float(abs(rstart - stop))
142 if overlapLength > featureLength:
143 overlapLength = featureLength
145 ratio = overlapLength / featureLength
146 if (name, family, ratio) not in finalresults:
147 finalresults.append((name, family, ratio))
149 sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop))
150 results = sql2.fetchall()
151 for (rstart, rstop, name, family) in results:
152 overlapLength = float(abs(rstop - rstart))
153 if overlapLength > featureLength:
154 overlapLength = featureLength
156 ratio = overlapLength / featureLength
157 if (name, family, ratio) not in finalresults:
158 finalresults.append((name, family, ratio))
160 sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop))
161 results = sql2.fetchall()
162 for (rstart, rstop, name, family) in results:
163 overlapLength = float(abs(rstop - rstart))
164 if overlapLength > featureLength:
165 overlapLength = featureLength
167 ratio = overlapLength / featureLength
168 if (name, family, ratio) not in finalresults:
169 finalresults.append((name, family, ratio))
171 line = featureDict[(chrom, start, stop)]
173 for (name, family, fraction) in finalresults:
174 outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
177 outfile.write(outline + "\n")
179 if len(finalresults) == 0:
180 outline = "%s\tNR\tNR\t%0.00" % line
182 outfile.write(outline + "\n")
185 goodfile.write(line + "\n")
188 if __name__ == "__main__":