snapshot of 4.0a development. initial git repo commit
[erange.git] / checkrmask.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sqlite3 as sqlite
8 import sys, string, optparse
9 import os.path
10 from commoncode import writeLog
11
12 versionString = "%prog: version 3.5"
13 print versionString
14
15
16 def main(argv=None):
17     if not argv:
18         argv = sys.argv
19
20     usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
21
22     parser = optparse.OptionParser(usage=usage)
23     parser.add_option("--cache", type="int", dest="cachePages")
24     parser.add_option("--startField", type="int", dest="startField")
25     parser.add_option("--log", dest="logfilename")
26     parser.set_defaults(cachePages=500000, startField=0, logfilename=None)
27     (options, args) = parser.parse_args(argv[1:])
28
29     if len(args) < 4:
30         print usage
31         sys.exit(1)
32
33     dbfile = args[0]
34     filename = args[1]
35     outfile = args[2]
36     goodfile = args[3]
37
38     checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
39
40
41 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
42
43     outfile = open(outFileName, "w")
44     goodfile = open(goodFileName, "w")
45     if startField < 0:
46         startField = 0
47
48     if cachePages < 250000:
49         cachePages = 250000
50
51     doLog = False
52     if logfilename is not None:
53         writeLog(logfilename, versionString, string.join(sys.argv[1:]))
54         doLog = True
55
56     infile = open(filename)
57     if os.path.isfile(dbfile):
58         db = sqlite.connect(dbfile)
59         sql = db.cursor()
60         sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
61         sql.execute("PRAGMA temp_store = MEMORY")
62     else:
63         print "No database - passing through"
64         if doLog:
65             writeLog(logfilename, versionString, "No database - passing through")
66
67         for line in infile:
68             outfile.write("%s\tNR\tNR\t0.00\n" % line)
69             goodfile.write(line)
70
71         outfile.close()
72         goodfile.close()
73         sys.exit(0)
74
75     featureList = []
76     featureDict = {}
77
78     for line in infile:
79         if line[0] == "#":
80             continue
81
82         fields = line.strip().split("\t")
83         chrom = fields[startField][3:]
84         start = int(fields[startField + 1])
85         stop = int(fields[startField + 2])
86         featureList.append((chrom,start, stop))
87         featureDict[(chrom, start, stop)] = line.strip()
88
89     infile.close()
90
91     featureList.sort()
92     currentChrom = ""
93     currentMax = 0
94     increment = 20000000
95     for (chrom, start, stop) in featureList:
96         if chrom != currentChrom:
97             currentMax = 0
98
99         if start > currentMax:
100             currentChrom = chrom
101             currentMin = currentMax
102             currentMax += increment
103             print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
104             try:
105                 del con
106             except:
107                 pass
108
109             con = sqlite.connect(":memory:")
110             sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000))
111             results = sql.fetchall()
112             results2 = []
113             con.execute("create table repeats(name, family, start, stop)")
114             con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
115             con.execute("PRAGMA temp_store = MEMORY")
116             for (rstart, rstop, name, family) in results:
117                 results2.append((name, family, int(rstart), int(rstop)))
118
119             con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2)
120             con.execute("CREATE INDEX posIndex on repeats(start, stop)")
121             print chrom, len(results2)
122             sql2 = con.cursor()
123
124         featureLength = abs(stop - start)
125         results = []
126         finalresults = []
127         sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start))
128         results = sql2.fetchall()
129         for (rstart, rstop, name, family) in results:
130             overlapLength = float(abs(rstop - start))
131             if overlapLength > featureLength:
132                 overlapLength = featureLength
133
134             ratio = overlapLength / featureLength
135             if (name, family, ratio) not in finalresults:
136                 finalresults.append((name, family, ratio))
137
138         sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop))
139         results = sql2.fetchall()
140         for (rstart, rstop, name, family) in results:
141             overlapLength = float(abs(rstart - stop))
142             if overlapLength > featureLength:
143                 overlapLength = featureLength
144
145             ratio = overlapLength / featureLength
146             if (name, family, ratio) not in finalresults:
147                 finalresults.append((name, family, ratio))
148
149         sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop))
150         results = sql2.fetchall()
151         for (rstart, rstop, name, family) in results:
152             overlapLength = float(abs(rstop - rstart))
153             if overlapLength > featureLength:
154                 overlapLength = featureLength
155
156             ratio = overlapLength / featureLength
157             if (name, family, ratio) not in finalresults:
158                 finalresults.append((name, family, ratio))
159
160         sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop))
161         results = sql2.fetchall()
162         for (rstart, rstop, name, family) in results:
163             overlapLength = float(abs(rstop - rstart))
164             if overlapLength > featureLength:
165                 overlapLength = featureLength
166
167             ratio = overlapLength / featureLength
168             if (name, family, ratio) not in finalresults:
169                 finalresults.append((name, family, ratio))
170
171         line = featureDict[(chrom, start, stop)]
172         total = 0.
173         for (name, family, fraction) in finalresults:
174             outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
175             total += fraction
176             print outline
177             outfile.write(outline + "\n")
178
179         if len(finalresults) == 0:
180             outline = "%s\tNR\tNR\t%0.00" % line
181             print outline
182             outfile.write(outline + "\n")
183
184         if total < 0.2:
185             goodfile.write(line + "\n")
186
187
188 if __name__ == "__main__":
189     main(sys.argv)