X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=regionintersects.py;h=49a0e328af4dc8bf09bf3c7dc3a511e56b4e4a38;hp=340d2f88f0cda1177cf0b8c772d5e808c8e39f94;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/regionintersects.py b/regionintersects.py index 340d2f8..49a0e32 100755 --- a/regionintersects.py +++ b/regionintersects.py @@ -8,10 +8,12 @@ try: except: pass -import sys, optparse -from commoncode import readDataset, getMergedRegions, findPeak +import sys +import optparse +from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption +import ReadDataset -print "%prog: version 3.0" +print "regionintersects: version 3.1" def main(argv=None): if not argv: @@ -19,16 +21,7 @@ def main(argv=None): usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--reject1", dest="rejectOneName") - parser.add_option("--reject2", dest="rejectTwoName") - parser.add_option("--union", action="store_true", dest="trackReject") - parser.add_option("--cache", action="store_true", dest="doCache") - parser.add_option("--raw", action="store_false", dest="normalize") - parser.add_option("--verbose", action="store_true", dest="doVerbose") - parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False, - doCache=False, normalize=True, doVerbose=False) - + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 5: @@ -47,6 +40,31 @@ def main(argv=None): options.doVerbose) +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--reject1", dest="rejectOneName") + parser.add_option("--reject2", dest="rejectTwoName") + parser.add_option("--union", action="store_true", dest="trackReject") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--raw", action="store_false", dest="normalize") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + + configParser = getConfigParser() + section = "regionintersects" + rejectOneName = getConfigOption(configParser, section, "rejectOneName", None) + rejectTwoName = getConfigOption(configParser, section, "rejectTwoName", None) + trackReject = getConfigBoolOption(configParser, section, "trackReject", False) + doCache = getConfigBoolOption(configParser, section, "doCache", False) + normalize = getConfigBoolOption(configParser, section, "normalize", True) + doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False) + + parser.set_defaults(rejectOneName=rejectOneName, rejectTwoName=rejectTwoName, + trackReject=trackReject, doCache=doCache, normalize=normalize, + doVerbose=doVerbose) + + return parser + + def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName, outfilename, rejectOneName=None, rejectTwoName=None, trackReject=False, doCache=False, normalize=True, doVerbose=False): @@ -69,8 +87,8 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName, oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose) twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose) - oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache) - twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache) + oneRDS = ReadDataset.ReadDataset(readOneName, verbose=doVerbose, cache=doCache) + twoRDS = ReadDataset.ReadDataset(readTwoName, verbose=doVerbose, cache=doCache) if normalize: normalize1 = len(oneRDS) / 1000000. @@ -88,36 +106,37 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName, numRegionsOne = 0 numRegionsTwo = 0 + commonChromosomeList = set(oneDict.keys()) for rchrom in oneDict: numRegionsOne += len(oneDict[rchrom]) for rchrom in twoDict: + commonChromosomeList.add(rchrom) numRegionsTwo += len(twoDict[rchrom]) outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName)) - for rchrom in oneDict: - if rchrom not in twoDict: - continue - - print rchrom + for chromosome in commonChromosomeList: + print chromosome rindex = 0 rindex2 = 0 - fullchrom = "chr" + rchrom + fullchrom = "chr%s" % chromosome oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True) dictLen1 = len(oneReads[fullchrom]) twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True) dictLen2 = len(twoReads[fullchrom]) - chrom = rchrom - onePeaksDict[chrom] = [] - oneFoundDict[chrom] = [] - for (start, stop, length) in oneDict[chrom]: + onePeaksDict[chromosome] = [] + oneFoundDict[chromosome] = [] + for region in oneDict[chromosome]: + start = region.start + stop = region.stop + length = region.length readList = [] for localIndex in xrange(rindex, dictLen1): read = oneReads[fullchrom][localIndex] - if read[0] < start: + if read["start"] < start: rindex += 1 - elif start <= read[0] <= stop: + elif start <= read["start"] <= stop: readList.append(read) else: break @@ -127,17 +146,20 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName, readList.sort() - (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True) - onePeakScore = smoothArray[topPos[0]] - onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1)) + peak = findPeak(readList, start, length, doWeight=True) + onePeakScore = peak.smoothArray[peak.topPos[0]] + onePeaksDict[chromosome].append((peak.topPos[0] + start, length/2, start, stop, peak.numHits/normalize1, onePeakScore/normalize1)) - for (start, stop, length) in twoDict[chrom]: + for region in twoDict[chromosome]: + start = region.start + stop = region.stop + length = region.length readList2 = [] for localIndex in xrange(rindex2, dictLen2): read = twoReads[fullchrom][localIndex] - if read[0] < start: + if read["start"] < start: rindex2 += 1 - elif start <= read[0] <= stop: + elif start <= read["start"] <= stop: readList2.append(read) else: break @@ -146,45 +168,46 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName, continue readList2.sort() - (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True) + peak2 = findPeak(readList2, start, length, doWeight=True) + numHits = peak2.numHits numHits /= normalize2 twoIsCommon = False - twoPeak = topPos[0] + start + twoPeak = peak2.topPos[0] + start twoRadius = length/2 - twoPeakScore = smoothArray[topPos[0]] / normalize2 - for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]: + twoPeakScore = peak2.smoothArray[peak2.topPos[0]] / normalize2 + for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]: if abs(twoPeak - onePeak) < (twoRadius + oneRadius): if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict: - oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits)) + oneFoundDict[chromosome].append((onePeak, oneRadius, ostart, ostop, ohits)) twoIsCommon = True commonRegions += 1 - outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore) + outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chromosome, ostart, ostop, ohits, opeakScore, chromosome, start, stop, numHits, twoPeakScore) if doVerbose: print outline - outfile.write(outline + "\n") + print >> outfile, outline if trackReject and not twoIsCommon: twoRejectIndex += 1 - outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore) + outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chromosome, start, stop, numHits, twoPeakScore) if doReject: - rejectTwo.write(outline + "\n") + print >> rejectTwo, outline else: - outfile.write(outline + "\n") + print >> outfile, outline if doVerbose: print outline if trackReject: - for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]: - if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]: + for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]: + if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chromosome]: oneRejectIndex += 1 - outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore) + outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chromosome, ostart, ostop, ohits, opeakScore) if doReject: - rejectOne.write(outline + "\n") + print >> rejectOne, outline else: - outfile.write(outline + "\n") + print >> outfile, outline if doVerbose: print outline