X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=distalPairs.py;h=dc836501c6b2b98f53b5a4a84ce8778c9673a1a7;hp=d24781a0db7c0ee3702e71cf97f6c2d62ea21c41;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/distalPairs.py b/distalPairs.py index d24781a..dc83650 100755 --- a/distalPairs.py +++ b/distalPairs.py @@ -12,15 +12,18 @@ try: except: pass -from commoncode import readDataset -import sys, time, optparse +import sys +import time +import optparse +import ReadDataset +from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption def main(argv=None): if not argv: argv = sys.argv - print "%prog: version 3.3" + print "distalPairs: version 3.4" print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM" usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]" @@ -44,6 +47,27 @@ def main(argv=None): distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages) +def makeParser(usage=""): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly") + parser.add_option("--splices", action="store_true", dest="doSplices") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.add_option("--maxDist", type="int", dest="maxDist") + parser.add_option("--cache", type="int", dest="cachePages") + + configParser = getConfigParser() + section = "distalPairs" + sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False) + doSplices = getConfigBoolOption(configParser, section, "doSplices", False) + doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False) + maxDist = getConfigIntOption(configParser, section, "maxDist", 1000000000) + cachePages = getConfigOption(configParser, section, "cachePages", None) + + parser.set_defaults(sameChromOnly=sameChromOnly, doSplices=doSplices, doVerbose=doVerbose, maxDist=maxDist, cachePages=cachePages) + + return parser + + def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None): if cachePages is not None: doCache = True @@ -51,7 +75,7 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa doCache = False cachePages = -1 - RDS = readDataset(rdsfile, verbose = True, cache=doCache) + RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache) if not RDS.hasIndex(): print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun" sys.exit(1) @@ -61,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa print time.ctime() - if doSplices: - print "getting splices" - splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True) - print "got splices" - print "getting uniq reads" uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True) print "got uniqs" if doSplices: - for readID in splicesDict: - theRead = splicesDict[readID] - read0 = theRead[0] - del read0[1] - try: - uniqDict[readID].append(read0) - except: - if len(theRead) == 4: - read2 = theRead[2] - del read2[1] - uniqDict[readID] = [read0,read2] + addSplicesToUniqReads(RDS, uniqDict) if doVerbose: print len(uniqDict), time.ctime() outfile = open(outfilename,"w") - diffChrom = 0 distal = 0 total = 0 @@ -95,8 +103,12 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa readList = uniqDict[readID] if len(readList) == 2: total += 1 - (start1, sense1, chrom1, pair1) = readList[0] - (start2, sense2, chrom2, pair2) = readList[1] + start1 = readList[0]["start"] + sense1 = readList[0]["sense"] + chrom1 = readList[0]["chrom"] + start2 = readList[1]["start"] + sense2 = readList[1]["sense"] + chrom2 = readList[1]["chrom"] if chrom1 != chrom2: diffChrom += 1 @@ -104,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa continue else: outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2) - outfile.write(outline + "\n") + print >> outfile, outline if doVerbose: print diffChrom, outline else: dist = abs(start1 - start2) - if minDist < dist < maxDist: distal += 1 outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist) - outfile.write(outline + "\n") + print >> outfile, outline if doVerbose: print distal, outline @@ -129,5 +140,24 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa print time.ctime() +def addSplicesToUniqReads(RDS, uniqDict): + print "getting splices" + splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True) + print "got splices" + for readID in splicesDict: + theRead = splicesDict[readID] + read0 = theRead[0] + read0["start"] = read0["startL"] + del read0["stopL"] + try: + uniqDict[readID].append(read0) + except: + if len(theRead) == 4: + read2 = theRead[2] + read2["start"] = read2["startL"] + del read2["stopL"] + uniqDict[readID] = [read0,read2] + + if __name__ == "__main__": main(sys.argv) \ No newline at end of file