7 from commoncode import regionsOverlap
9 print "siteintersects: version 2.1"
13 def __init__(self, line, doExpanded=False):
14 fields = line.strip().split()
16 self.chrom = fields[1][3:]
17 self.start = int(fields[2])
18 self.stop = int(fields[3])
19 self.rest = fields[4:]
21 (chromosome, pos) = fields[0].split(":")
22 self.chrom = chromosome[3:]
23 (start, stop) = pos.split("-")
24 self.start = int(start)
26 self.rest = fields[1:]
35 print "usage: python %s sitefile1 sitefile2 outfile [--reject rejectfile1 rejectfile2] [--expanded]" % argv[0]
38 sitefilename1 = argv[1]
39 sitefilename2 = argv[2]
43 if "--reject" in sys.argv:
44 reject1file = open(sys.argv[sys.argv.index("-reject") + 1], "w")
45 reject2file = open(sys.argv[sys.argv.index("-reject") + 2], "w")
49 if "--expanded" in sys.argv:
52 siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
55 def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
57 siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
58 commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
60 if doReject and siteRejectFilename is not None:
61 writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
66 def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
69 processedLineCount = 0
70 infile = open(sitefilename)
73 if doNotProcessLine(line):
76 processedLineCount += 1
77 site = Site(line, doExpanded)
84 siteDict[chrom].append((start, stop, rest))
86 siteDict[chrom] = [(start, stop, rest)]
89 rejectSiteDict[str((chrom, start, stop, rest))] = line
92 print "file1: %d" % processedLineCount
94 return siteDict, rejectSiteDict
97 def doNotProcessLine(line):
101 def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
102 processedLineCount = 0
103 infile = open(siteFilename)
107 if doReject and rejectFilename is not None:
108 rejectfile = open(rejectFilename, "w")
112 outfile = open(outfilename, "w")
114 if doNotProcessLine(line):
117 processedLineCount += 1
118 site = Site(line, doExpanded)
122 if chrom not in siteDict:
124 rejectfile.write(line)
129 #TODO: as handed to me this is O(n^m) - probably could be better...
130 #TODO: if there are multiple matches will count them but only write the first match to the file
131 for (rstart, rstop, rline) in siteDict[chrom]:
132 if regionsOverlap(start, stop, rstart, rstop):
135 outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
136 siteNotCommon = False
140 del rejectSiteDict[str((chrom, rstart, rstop, rline))]
144 if doReject and siteNotCommon:
145 rejectfile.write(line)
152 print "file2: %d" % processedLineCount
157 def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
158 rejectfile = open(siteRejectFilename, "w")
160 for key in rejectSiteDict:
161 rejectfile.write(rejectSiteDict[key])
166 if __name__ == "__main__":