X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=siteintersects.py;fp=siteintersects.py;h=e012bb49436c92f50a1d8722182d0cd72a754897;hp=4826a13bbb31824ffea9b187d383f42b35bbf682;hb=77dccd7c98d8cdb60caaf178b1123df71ea662c9;hpb=bc30aca13e5ec397c92e67002fbf7a103130b828 diff --git a/siteintersects.py b/siteintersects.py index 4826a13..e012bb4 100755 --- a/siteintersects.py +++ b/siteintersects.py @@ -4,10 +4,29 @@ # import sys +from commoncode import regionsOverlap print "siteintersects: version 2.1" +class Site(): + def __init__(self, line, doExpanded=False): + fields = line.strip().split() + if doExpanded: + self.chrom = fields[1][3:] + self.start = int(fields[2]) + self.stop = int(fields[3]) + self.rest = fields[4:] + else: + (chromosome, pos) = fields[0].split(":") + self.chrom = chromosome[3:] + (start, stop) = pos.split("-") + self.start = int(start) + self.stop = int(stop) + self.rest = fields[1:] + + + def main(argv=None): if not argv: argv = sys.argv @@ -33,115 +52,114 @@ def main(argv=None): siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded) -def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False): +def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False): - siteDict = {} - file1Dict = {} + siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject) + commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject) - infile1count = 0 - infile = open(sitefilename1) + if doReject and siteRejectFilename is not None: + writeRejectSiteFile(siteRejectFilename, rejectSiteDict) + + print commonSiteCount + + +def getSiteDicts(sitefilename, doExpanded=False, doReject=False): + siteDict = {} + rejectSiteDict = {} + processedLineCount = 0 + infile = open(sitefilename) infile.readline() for line in infile: - if line[0] == "#": + if doNotProcessLine(line): continue - infile1count += 1 - fields = line.strip().split() - if doExpanded: - chrom = fields[1][3:] - start = int(fields[2]) - stop = int(fields[3]) - rest = fields[4:] - else: - (chrom, pos) = fields[0].split(":") - chrom = chrom[3:] - (start, stop) = pos.split("-") - start = int(start) - stop = int(stop) - rest = fields[1:] + processedLineCount += 1 + site = Site(line, doExpanded) + chrom = site.chrom + start = site.start + stop = site.stop + rest = site.fieldList try: siteDict[chrom].append((start, stop, rest)) - except: + except KeyError: siteDict[chrom] = [(start, stop, rest)] if doReject: - file1Dict[str((chrom, start, stop, rest))] = line + rejectSiteDict[str((chrom, start, stop, rest))] = line infile.close() + print "file1: %d" % processedLineCount - print "file1: %d" % infile1count + return siteDict, rejectSiteDict - infile2count = 0 - infile = open(sitefilename2) + +def doNotProcessLine(line): + return line[0] == "#" + + +def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject): + processedLineCount = 0 + infile = open(siteFilename) infile.readline() commonSites = 0 - unique2List = [] + if doReject and rejectFilename is not None: + rejectfile = open(rejectFilename, "w") + else: + doReject=False + outfile = open(outfilename, "w") for line in infile: - if line[0] == "#": + if doNotProcessLine(line): continue - infile2count += 1 - fields = line.strip().split() - if doExpanded: - chrom = fields[1][3:] - start = int(fields[2]) - stop = int(fields[3]) - rest = fields[4:] - else: - (chrom, pos) = fields[0].split(":") - chrom = chrom[3:] - (start, stop) = pos.split("-") - rest = str(fields[1:]) - - start = int(start) - stop = int(stop) - mid = start + abs(stop - start)/2 + processedLineCount += 1 + site = Site(line, doExpanded) + chrom = site.chrom + start = site.start + stop = site.stop if chrom not in siteDict: if doReject: - unique2List.append(line) - continue + rejectfile.write(line) + + continue - twoNotCommon = True + siteNotCommon = True for (rstart, rstop, rline) in siteDict[chrom]: - rsize = abs(rstart - rstop) /2 - rmid = rstart + abs(rstop - rstart)/2 - if abs(mid - rmid) < rsize: + if regionsOverlap(start, stop, rstart, rstop): commonSites += 1 - if twoNotCommon: - outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest)) - twoNotCommon = False + if siteNotCommon: + outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList)) + siteNotCommon = False try: if doReject: - del file1Dict[str((chrom, rstart, rstop, rline))] - except: + del rejectSiteDict[str((chrom, rstart, rstop, rline))] + except KeyError: pass - if doReject and twoNotCommon: - unique2List.append(line) + if doReject and siteNotCommon: + rejectfile.write(line) - outfile.close() + if doReject: + rejectfile.close() - print "file2: %d" % infile2count + outfile.close() - if doReject: - reject1file = open(reject1filename, "w") - reject2file = open(reject2filename, "w") + print "file2: %d" % processedLineCount - for key in file1Dict: - reject1file.write(file1Dict[key]) + return commonSites - for line in unique2List: - reject2file.write(line) - reject1file.close() - reject2file.close() +def writeRejectSiteFile(siteRejectFilename, rejectSiteDict): + rejectfile = open(siteRejectFilename, "w") - print commonSites + for key in rejectSiteDict: + rejectfile.write(rejectSiteDict[key]) + rejectfile.close() + if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv)