X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=farPairs.py;fp=farPairs.py;h=712e60eff9815f9728b6a03983385f83d96bc6f5;hp=00cc91844b23084d712573261e996524739c9870;hb=77dccd7c98d8cdb60caaf178b1123df71ea662c9;hpb=bc30aca13e5ec397c92e67002fbf7a103130b828 diff --git a/farPairs.py b/farPairs.py index 00cc918..712e60e 100644 --- a/farPairs.py +++ b/farPairs.py @@ -14,8 +14,9 @@ except: import sys import time import optparse +import string import ReadDataset -from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption +from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList print "farPairs: version 1.4" @@ -38,14 +39,13 @@ def main(argv=None): outfilename = args[1] outbedname = args[2] - farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose, + farPairs(rdsfile, outfilename, outbedname, options.doVerbose, options.cachePages, options.minDist, options.maxDist, options.minCount, options.label) def getParser(usage): parser = optparse.OptionParser(usage=usage) - parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly") parser.add_option("--cache", type="int", dest="cachePages") parser.add_option("--verbose", action="store_true", dest="doVerbose") parser.add_option("--minDist", type="int", dest="minDist") @@ -55,7 +55,6 @@ def getParser(usage): configParser = getConfigParser section = "farPairs" - sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False) doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False) cachePages = getConfigOption(configParser, section, "cachePages", None) minDist = getConfigIntOption(configParser, section, "minDist", 1000) @@ -63,15 +62,33 @@ def getParser(usage): minCount = getConfigIntOption(configParser, section, "minCount", 2) label = getConfigOption(configParser, section, "label", None) - parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages, + parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages, minDist=minDist, maxDist=maxDist, minCount=minCount, label=label) return parser -def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False, +def farPairs(rdsfile, outfilename, outbedname, doVerbose=False, cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None): + flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose) + + outfile = open(outfilename, "w") + for region in flagDict: + regionConnections = countDuplicatesInList(flagDict[region]) + for (connectedRegion, count) in regionConnections: + if count >= minCount: + outline = "%s\t%s\t%d" % (region, connectedRegion, count) + print >> outfile, outline + if doVerbose: + print outline + + outfile.close() + if doVerbose: + print "finished: ", time.ctime() + + +def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose): doCache = False if cachePages is not None: doCache = True @@ -86,11 +103,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa if doVerbose: print time.ctime() - - total = 0 - outfile = open(outfilename, "w") + outbed = open(outbedname, "w") outbed.write('track name="%s distal pairs" color=0,255,0\n' % label) + outbed.close() readlen = RDS.getReadSize() flagDict = {} @@ -98,84 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa if doNotProcessChromosome(chromosome): continue - print chromosome - uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True) - if doVerbose: - print len(uniqDict), time.ctime() - - for readID in uniqDict: - readList = uniqDict[readID] - if len(readList) == 2: - total += 1 - start1 = readList[0]["start"] - flag1 = readList[0]["flag"] - start2 = readList[1]["start"] - flag2 = readList[1]["flag"] - - if flag1 != flag2: - dist = abs(start1 - start2) - startList = [start1, start2] - stopList = [start1 + readlen, start2 + readlen] - startList.sort() - stopList.sort() - if flag1 != "" and flag2 != "" and minDist < dist < maxDist: - outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0") - outbed.write(outputLine) - if doVerbose: - print flag1, flag2, dist - - try: - flagDict[flag1].append((flag2, start1, start2)) - except KeyError: - flagDict[flag1] = [(flag2, start1, start2)] - - try: - flagDict[flag2].append((flag1, start1, start2)) - except KeyError: - flagDict[flag2] = [(flag2, start1, start2)] + writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist) print "%d connected regions" % len(flagDict) - for region in flagDict: - flagDict[region].sort() - regionConnections = {} - for (region2, start1, start2) in flagDict[region]: + return flagDict + + +def doNotProcessChromosome(chrom): + return chrom == "chrM" + + +def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist): + outbed = open(outbedname, "a") + print chromosome + uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True) + if doVerbose: + print len(uniqDict), time.ctime() + + for readID in uniqDict: + readList = uniqDict[readID] + if readsAreFarPair(readList, minDist, maxDist): + start1 = readList[0]["start"] + start2 = readList[1]["start"] + startList = [start1, start2] + startList.sort() + outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0") + outbed.write(outputLine) + flag1 = readList[0]["flag"] + flag2 = readList[1]["flag"] + if doVerbose: + print flag1, flag2, abs(start1 - start2) + try: - regionConnections[region2] += 1 + flagDict[flag1].append(flag2) except KeyError: - regionConnections[region2] = 1 + flagDict[flag1] = [flag2] - for region2 in regionConnections: - if regionConnections[region2] >= minCount: - outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2])) - if doVerbose: - print "%s\t%s\t%d" % (region, region2, regionConnections[region2]) + try: + flagDict[flag2].append(flag1) + except KeyError: + flagDict[flag2] = [flag1] - outfile.close() outbed.close() - if doVerbose: - print "finished: ", time.ctime() -def doNotProcessChromosome(chrom): - return chrom == "chrM" +def readsAreFarPair(readList, minDist, maxDist): + isFarPair = False + if len(readList) == 2: + flag1 = readList[0]["flag"] + flag2 = readList[1]["flag"] + if flag1 != flag2 and flag1 != "" and flag2 != "": + start1 = readList[0]["start"] + start2 = readList[1]["start"] + dist = abs(start1 - start2) + if minDist < dist < maxDist: + isFarPair = True + return isFarPair -def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense): - readSizes = "%d" % (stopList[0] - startList[0]) - readCoords = "0" + +def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor): + sizes = ["%d" % readlen] + coords = ["0"] leftStart = startList[0] - 1 - rightStop = stopList[-1] + rightStop = startList[-1] for index in range(1, numPieces): - readSizes += ",%d" % (stopList[index] - startList[index] + 1) - readCoords += ",%d" % (startList[index] - startList[0]) + sizes.append("%d" % (readlen + 1)) + coords.append("%d" % (startList[index] - startList[0])) if rsense == "+": - senseCode = plusSense + senseCode = plusSenseColor else: - senseCode = minusSense + senseCode = minusSenseColor + readSizes = string.join(sizes, ",") + readCoords = string.join(coords, ",") outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords) + return outline