#
import sys
+from commoncode import regionsOverlap
print "siteintersects: version 2.1"
+class Site():
+ def __init__(self, line, doExpanded=False):
+ fields = line.strip().split()
+ if doExpanded:
+ self.chrom = fields[1][3:]
+ self.start = int(fields[2])
+ self.stop = int(fields[3])
+ self.rest = fields[4:]
+ else:
+ (chromosome, pos) = fields[0].split(":")
+ self.chrom = chromosome[3:]
+ (start, stop) = pos.split("-")
+ self.start = int(start)
+ self.stop = int(stop)
+ self.rest = fields[1:]
+
+
+
def main(argv=None):
if not argv:
argv = sys.argv
siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
- siteDict = {}
- file1Dict = {}
+ siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+ commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
- infile1count = 0
- infile = open(sitefilename1)
+ if doReject and siteRejectFilename is not None:
+ writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+ print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
+ siteDict = {}
+ rejectSiteDict = {}
+ processedLineCount = 0
+ infile = open(sitefilename)
infile.readline()
for line in infile:
- if line[0] == "#":
+ if doNotProcessLine(line):
continue
- infile1count += 1
- fields = line.strip().split()
- if doExpanded:
- chrom = fields[1][3:]
- start = int(fields[2])
- stop = int(fields[3])
- rest = fields[4:]
- else:
- (chrom, pos) = fields[0].split(":")
- chrom = chrom[3:]
- (start, stop) = pos.split("-")
- start = int(start)
- stop = int(stop)
- rest = fields[1:]
+ processedLineCount += 1
+ site = Site(line, doExpanded)
+ chrom = site.chrom
+ start = site.start
+ stop = site.stop
+ rest = site.fieldList
try:
siteDict[chrom].append((start, stop, rest))
- except:
+ except KeyError:
siteDict[chrom] = [(start, stop, rest)]
if doReject:
- file1Dict[str((chrom, start, stop, rest))] = line
+ rejectSiteDict[str((chrom, start, stop, rest))] = line
infile.close()
+ print "file1: %d" % processedLineCount
- print "file1: %d" % infile1count
+ return siteDict, rejectSiteDict
- infile2count = 0
- infile = open(sitefilename2)
+
+def doNotProcessLine(line):
+ return line[0] == "#"
+
+
+def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
+ processedLineCount = 0
+ infile = open(siteFilename)
infile.readline()
commonSites = 0
- unique2List = []
+ if doReject and rejectFilename is not None:
+ rejectfile = open(rejectFilename, "w")
+ else:
+ doReject=False
+
outfile = open(outfilename, "w")
for line in infile:
- if line[0] == "#":
+ if doNotProcessLine(line):
continue
- infile2count += 1
- fields = line.strip().split()
- if doExpanded:
- chrom = fields[1][3:]
- start = int(fields[2])
- stop = int(fields[3])
- rest = fields[4:]
- else:
- (chrom, pos) = fields[0].split(":")
- chrom = chrom[3:]
- (start, stop) = pos.split("-")
- rest = str(fields[1:])
-
- start = int(start)
- stop = int(stop)
- mid = start + abs(stop - start)/2
+ processedLineCount += 1
+ site = Site(line, doExpanded)
+ chrom = site.chrom
+ start = site.start
+ stop = site.stop
if chrom not in siteDict:
if doReject:
- unique2List.append(line)
- continue
+ rejectfile.write(line)
+
+ continue
- twoNotCommon = True
+ siteNotCommon = True
for (rstart, rstop, rline) in siteDict[chrom]:
- rsize = abs(rstart - rstop) /2
- rmid = rstart + abs(rstop - rstart)/2
- if abs(mid - rmid) < rsize:
+ if regionsOverlap(start, stop, rstart, rstop):
commonSites += 1
- if twoNotCommon:
- outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
- twoNotCommon = False
+ if siteNotCommon:
+ outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
+ siteNotCommon = False
try:
if doReject:
- del file1Dict[str((chrom, rstart, rstop, rline))]
- except:
+ del rejectSiteDict[str((chrom, rstart, rstop, rline))]
+ except KeyError:
pass
- if doReject and twoNotCommon:
- unique2List.append(line)
+ if doReject and siteNotCommon:
+ rejectfile.write(line)
- outfile.close()
+ if doReject:
+ rejectfile.close()
- print "file2: %d" % infile2count
+ outfile.close()
- if doReject:
- reject1file = open(reject1filename, "w")
- reject2file = open(reject2filename, "w")
+ print "file2: %d" % processedLineCount
- for key in file1Dict:
- reject1file.write(file1Dict[key])
+ return commonSites
- for line in unique2List:
- reject2file.write(line)
- reject1file.close()
- reject2file.close()
+def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
+ rejectfile = open(siteRejectFilename, "w")
- print commonSites
+ for key in rejectSiteDict:
+ rejectfile.write(rejectSiteDict[key])
+ rejectfile.close()
+
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)