X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=rnafarPairs.py;fp=rnafarPairs.py;h=4d70c4965ef77c2eea4f646e58526db83682ffa6;hp=d1baebd3e7e6e61278c4c141a84fe2396e323f28;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/rnafarPairs.py b/rnafarPairs.py index d1baebd..4d70c49 100755 --- a/rnafarPairs.py +++ b/rnafarPairs.py @@ -13,26 +13,21 @@ try: except: pass -import sys, time, optparse -from commoncode import readDataset -from cistematic.core.geneinfo import geneinfoDB -from cistematic.genomes import Genome +import sys +import time +import optparse +import ReadDataset +from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption + def main(argv=None): if not argv: argv = sys.argv - print "%prog: version 3.6" + print "rnafarPairs: version 3.7" usage = "usage: python %prog genome goodfile rdsfile outfile [options]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--verbose", action="store_true", dest="doVerbose", - help="verbose output") - parser.add_option("--cache", action="store_true", dest="doCache", - help="use cache") - parser.add_option("--maxDist", type="int", dest="maxDist", - help="maximum distance") - parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000) + parser = makeParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 4: @@ -45,7 +40,27 @@ def main(argv=None): outfilename = args[3] rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist) - + + +def makeParser(usage=""): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--verbose", action="store_true", dest="doVerbose", + help="verbose output") + parser.add_option("--cache", action="store_true", dest="doCache", + help="use cache") + parser.add_option("--maxDist", type="int", dest="maxDist", + help="maximum distance") + + configParser = getConfigParser() + section = "rnafarPairs" + doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False) + doCache = getConfigBoolOption(configParser, section, "doCache", False) + maxDist = getConfigIntOption(configParser, section, "maxDist", 500000) + + parser.set_defaults(doVerbose=doVerbose, doCache=doCache, maxDist=maxDist) + + return parser + def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000): goodDict = {} @@ -54,33 +69,25 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC fields = line.split() goodDict[fields[0]] = line - RDS = readDataset(rdsfile, verbose = True, cache=doCache) - rdsChromList = RDS.getChromosomes() - + goodfile.close() + RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache) + chromosomeList = RDS.getChromosomes() if doVerbose: print time.ctime() distinct = 0 total = 0 outfile = open(outfilename,"w") - - idb = geneinfoDB() - if genome == "dmelanogaster": - geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus") - else: - geneinfoDict = idb.getallGeneInfo(genome) - - hg = Genome(genome) - geneannotDict = hg.allAnnotInfo() - + geneinfoDict = getGeneInfoDict(genome) + geneannotDict = getGeneAnnotDict(genome) assigned = {} farConnected = {} - for achrom in rdsChromList: - if achrom == "chrM": + for chromosome in chromosomeList: + if doNotProcessChromosome(chromosome): continue - print achrom - uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True) + print chromosome + uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True) if doVerbose: print len(uniqDict), time.ctime() @@ -88,82 +95,124 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC readList = uniqDict[readID] if len(readList) == 2: total += 1 - (start1, flag1, pair1) = readList[0] - (start2, flag2, pair2) = readList[1] - - if flag1 != flag2: - dist = abs(start1 - start2) - if flag1 != "NM" and flag2 != "NM" and dist < maxDist: - geneID = "" - saw1 = False - saw2 = False - if flag1 in goodDict: - geneID = flag2 - farFlag = flag1 - saw1 = True - - if flag2 in goodDict: - geneID = flag1 - farFlag = flag2 - saw2 = True - - if saw1 or saw2: - total += 1 - - if saw1 and saw2: - if flag1 < flag2: - geneID = flag1 - farFlag = flag2 - else: - geneID = flag2 - farFlag = flag1 - - if geneID in farConnected: - farConnected[geneID].append(farFlag) - else: - farConnected[geneID] = [farFlag] - elif geneID != "": - try: - if genome == "dmelanogaster": - symbol = geneinfoDict["Dmel_" + geneID][0][0] - else: - symbol = geneinfoDict[geneID][0][0] - except: - try: - symbol = geneannotDict[(genome, geneID)][0] - except: - symbol = "LOC" + geneID - - symbol = symbol.strip() - symbol = symbol.replace(" ","|") - symbol = symbol.replace("\t","|") - if farFlag not in assigned: - assigned[farFlag] = (symbol, geneID) - print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip()) - outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag])) - distinct += 1 - - farIndex = 0 + if processReads(readList[:2], maxDist): + flags = (readList[0]["flag"], readList[1]["flag"]) + processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected) + total += processed + distinct += distinctPairs + + entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile) + distinct += entriesWritten + outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total)) + outfile.close() + print "distinct: %d\ttotal: %d" % (distinct, total) + print time.ctime() + + +def doNotProcessChromosome(chromosome): + return chromosome == "chrM" + + +def processReads(reads, maxDist): + process = False + start1 = reads[0]["start"] + start2 = reads[1]["start"] + dist = abs(start1 - start2) + flag1 = reads[0]["flag"] + flag2 = reads[1]["flag"] + + if flag1 != flag2 and flag1 != "NM" and flag2 != "NM" and dist < maxDist: + process = True + + return process + + +def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected): + flag1, flag2 = flags + total = 0 + distinct = 0 + read1IsGood = flag1 in goodDict + read2IsGood = flag2 in goodDict + + if read1IsGood and read2IsGood: + if flag1 < flag2: + geneID = flag1 + farFlag = flag2 + else: + geneID = flag2 + farFlag = flag1 + + try: + farConnected[geneID].append(farFlag) + except KeyError: + farConnected[geneID] = [farFlag] + elif read1IsGood or read2IsGood: + total += 1 + if read2IsGood: + farFlag = flag2 + geneID = flag1 + else: + farFlag = flag1 + geneID = flag2 + + try: + if genome == "dmelanogaster": + symbol = geneInfoDict["Dmel_%s" % geneID][0][0] + else: + symbol = geneInfoDict[geneID][0][0] + except (KeyError, IndexError): + try: + symbol = geneAnnotDict[(genome, geneID)][0] + except (KeyError, IndexError): + symbol = "LOC%s" % geneID + + symbol = symbol.strip() + symbol = symbol.replace(" ","|") + symbol = symbol.replace("\t","|") + + if farFlag not in assigned: + assigned[farFlag] = (symbol, geneID) + print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip()) + outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag])) + distinct += 1 + + return total, distinct + + +def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile): + total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile) + writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile) + + return written + + +def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile): + total = 0 + written = 0 for farFlag in farConnected: geneID = "" symbol = "" idList = [farFlag] + farConnected[farFlag] - for oneID in idList: - if oneID in assigned: - (symbol, geneID) = assigned[oneID] + for ID in idList: + if ID in assigned: + (symbol, geneID) = assigned[ID] if geneID == "": - farIndex += 1 - symbol = "FAR%d" % farIndex - geneID = -1 * farIndex + total += 1 + symbol = "FAR%d" % total + geneID = -1 * total - for oneID in idList: - if oneID not in assigned: - print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip()) - outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID])) - distinct += 1 - assigned[oneID] = (symbol, geneID) + for ID in idList: + if ID not in assigned: + print "%s %s %s" % (symbol, geneID, goodDict[ID].strip()) + outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID])) + written += 1 + assigned[ID] = (symbol, geneID) + return total, written + + +def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile): for farFlag in goodDict: if farFlag not in assigned: farIndex += 1 @@ -171,10 +220,6 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC print line.strip() outfile.write(line) - outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total)) - outfile.close() - print "distinct: %d\ttotal: %d" % (distinct, total) - print time.ctime() if __name__ == "__main__": main(sys.argv) \ No newline at end of file