X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=farPairs.py;h=712e60eff9815f9728b6a03983385f83d96bc6f5;hp=73dd3ca851e2ca64cc71e3148a8eced359e44def;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/farPairs.py b/farPairs.py index 73dd3ca..712e60e 100644 --- a/farPairs.py +++ b/farPairs.py @@ -11,11 +11,14 @@ try: except: pass -import sys, time +import sys +import time import optparse -from commoncode import readDataset +import string +import ReadDataset +from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList -print "%prog: version 1.3" +print "farPairs: version 1.4" def main(argv=None): @@ -24,16 +27,7 @@ def main(argv=None): usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly") - parser.add_option("--cache", type="int", dest="cachePages") - parser.add_option("--verbose", action="store_true", dest="doVerbose") - parser.add_option("--minDist", type="int", dest="minDist") - parser.add_option("--maxDist", type="int", dest="maxDist") - parser.add_option("--minCount", type="int", dest="minCount") - parser.add_option("--label", dest="label") - parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None, - minDist=1000, maxDist=500000, minCount=2, label=None) + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -45,14 +39,56 @@ def main(argv=None): outfilename = args[1] outbedname = args[2] - farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose, + farPairs(rdsfile, outfilename, outbedname, options.doVerbose, options.cachePages, options.minDist, options.maxDist, options.minCount, options.label) -def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False, +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.add_option("--minDist", type="int", dest="minDist") + parser.add_option("--maxDist", type="int", dest="maxDist") + parser.add_option("--minCount", type="int", dest="minCount") + parser.add_option("--label", dest="label") + + configParser = getConfigParser + section = "farPairs" + doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False) + cachePages = getConfigOption(configParser, section, "cachePages", None) + minDist = getConfigIntOption(configParser, section, "minDist", 1000) + maxDist = getConfigIntOption(configParser, section, "maxDist", 500000) + minCount = getConfigIntOption(configParser, section, "minCount", 2) + label = getConfigOption(configParser, section, "label", None) + + parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages, + minDist=minDist, maxDist=maxDist, minCount=minCount, label=label) + + return parser + + +def farPairs(rdsfile, outfilename, outbedname, doVerbose=False, cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None): + flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose) + + outfile = open(outfilename, "w") + for region in flagDict: + regionConnections = countDuplicatesInList(flagDict[region]) + for (connectedRegion, count) in regionConnections: + if count >= minCount: + outline = "%s\t%s\t%d" % (region, connectedRegion, count) + print >> outfile, outline + if doVerbose: + print outline + + outfile.close() + if doVerbose: + print "finished: ", time.ctime() + + +def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose): doCache = False if cachePages is not None: doCache = True @@ -62,16 +98,15 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa if label is None: label = rdsfile - RDS = readDataset(rdsfile, verbose=True, cache=doCache) + RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache) rdsChromList = RDS.getChromosomes() if doVerbose: print time.ctime() - - total = 0 - outfile = open(outfilename, "w") + outbed = open(outbedname, "w") outbed.write('track name="%s distal pairs" color=0,255,0\n' % label) + outbed.close() readlen = RDS.getReadSize() flagDict = {} @@ -79,82 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa if doNotProcessChromosome(chromosome): continue - print chromosome - uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True) - if doVerbose: - print len(uniqDict), time.ctime() - - for readID in uniqDict: - readList = uniqDict[readID] - if len(readList) == 2: - total += 1 - (start1, flag1, pair1) = readList[0] - (start2, flag2, pair2) = readList[1] - - if flag1 != flag2: - dist = abs(start1 - start2) - startList = [start1, start2] - stopList = [start1 + readlen, start2 + readlen] - startList.sort() - stopList.sort() - if flag1 != "" and flag2 != "" and minDist < dist < maxDist: - outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0") - outbed.write(outputLine) - if doVerbose: - print flag1, flag2, dist - - try: - flagDict[flag1].append((flag2, start1, start2)) - except KeyError: - flagDict[flag1] = [(flag2, start1, start2)] - - try: - flagDict[flag2].append((flag1, start1, start2)) - except KeyError: - flagDict[flag2] = [(flag2, start1, start2)] + writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist) print "%d connected regions" % len(flagDict) - for region in flagDict: - flagDict[region].sort() - regionConnections = {} - for (region2, start1, start2) in flagDict[region]: + return flagDict + + +def doNotProcessChromosome(chrom): + return chrom == "chrM" + + +def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist): + outbed = open(outbedname, "a") + print chromosome + uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True) + if doVerbose: + print len(uniqDict), time.ctime() + + for readID in uniqDict: + readList = uniqDict[readID] + if readsAreFarPair(readList, minDist, maxDist): + start1 = readList[0]["start"] + start2 = readList[1]["start"] + startList = [start1, start2] + startList.sort() + outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0") + outbed.write(outputLine) + flag1 = readList[0]["flag"] + flag2 = readList[1]["flag"] + if doVerbose: + print flag1, flag2, abs(start1 - start2) + try: - regionConnections[region2] += 1 + flagDict[flag1].append(flag2) except KeyError: - regionConnections[region2] = 1 + flagDict[flag1] = [flag2] - for region2 in regionConnections: - if regionConnections[region2] >= minCount: - outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2])) - if doVerbose: - print "%s\t%s\t%d" % (region, region2, regionConnections[region2]) + try: + flagDict[flag2].append(flag1) + except KeyError: + flagDict[flag2] = [flag1] - outfile.close() outbed.close() - if doVerbose: - print "finished: ", time.ctime() -def doNotProcessChromosome(chrom): - return chrom == "chrM" +def readsAreFarPair(readList, minDist, maxDist): + isFarPair = False + if len(readList) == 2: + flag1 = readList[0]["flag"] + flag2 = readList[1]["flag"] + if flag1 != flag2 and flag1 != "" and flag2 != "": + start1 = readList[0]["start"] + start2 = readList[1]["start"] + dist = abs(start1 - start2) + if minDist < dist < maxDist: + isFarPair = True + + return isFarPair -def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense): - readSizes = "%d" % (stopList[0] - startList[0]) - readCoords = "0" +def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor): + sizes = ["%d" % readlen] + coords = ["0"] leftStart = startList[0] - 1 - rightStop = stopList[-1] + rightStop = startList[-1] for index in range(1, numPieces): - readSizes += ",%d" % (stopList[index] - startList[index] + 1) - readCoords += ",%d" % (startList[index] - startList[0]) + sizes.append("%d" % (readlen + 1)) + coords.append("%d" % (startList[index] - startList[0])) if rsense == "+": - senseCode = plusSense + senseCode = plusSenseColor else: - senseCode = minusSense + senseCode = minusSenseColor + readSizes = string.join(sizes, ",") + readCoords = string.join(coords, ",") outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords) + return outline