X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=rnafarPairs.py;fp=rnafarPairs.py;h=2c5aff43ced96fe650d5cc52562cb998033b4d04;hp=4d70c4965ef77c2eea4f646e58526db83682ffa6;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/rnafarPairs.py b/rnafarPairs.py index 4d70c49..2c5aff4 100755 --- a/rnafarPairs.py +++ b/rnafarPairs.py @@ -1,9 +1,3 @@ -# -# RNAFARpairs.py -# ENRAGE -# -# Created by Ali Mortazavi on 11/2/08. -# """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options] looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM """ @@ -16,8 +10,8 @@ except: import sys import time import optparse -import ReadDataset -from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption +import pysam +from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry def main(argv=None): @@ -36,10 +30,12 @@ def main(argv=None): genome = args[0] goodfilename = args[1] - rdsfile = args[2] + bamfilename = args[2] outfilename = args[3] - rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist) + bamfile = pysam.Samfile(bamfilename, "rb") + + rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist) def makeParser(usage=""): @@ -62,16 +58,26 @@ def makeParser(usage=""): return parser -def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000): +def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000): + """ map all candidate regions that have paired ends overlapping with known genes + """ + + chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"] + regions = {} + for chromosome in chromosomeList: + regions[chromosome] = [] + goodDict = {} goodfile = open(goodfilename) for line in goodfile: fields = line.split() - goodDict[fields[0]] = line + label = fields[0] + start = int(fields[2]) + stop = int(fields[3]) + goodDict[label] = line + regions[chromosome].append((start, stop, label)) goodfile.close() - RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache) - chromosomeList = RDS.getChromosomes() if doVerbose: print time.ctime() @@ -83,23 +89,18 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC assigned = {} farConnected = {} for chromosome in chromosomeList: - if doNotProcessChromosome(chromosome): - continue - print chromosome - uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True) + regionList = regions[chromosome].sort() + uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regionList, chromosome, maxDist) if doVerbose: print len(uniqDict), time.ctime() - for readID in uniqDict: - readList = uniqDict[readID] - if len(readList) == 2: - total += 1 - if processReads(readList[:2], maxDist): - flags = (readList[0]["flag"], readList[1]["flag"]) - processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected) - total += processed - distinct += distinctPairs + total += pairCount + for readID, readList in uniqDict.items(): + flags = (readList[0]["flag"], readList[1]["flag"]) + processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected) + total += processed + distinct += distinctPairs entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile) distinct += entriesWritten @@ -109,15 +110,55 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC print time.ctime() -def doNotProcessChromosome(chromosome): - return chromosome == "chrM" +def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist): + """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag. + Only returns unique non-spliced read pairs matching the criteria given in processReads(). + """ + start = 1 + readDict = {} + for regionstart, regionstop, regionname in regions: + for alignedread in bamfile.fetch(chromosome, start, regionstop): + if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar): + if alignedread.pos >= regionstart: + flag = regionname + else: + flag = alignedread.opt("ZG") + try: + readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag}) + except KeyError: + readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}] -def processReads(reads, maxDist): + start = regionstop + 1 + + for alignedread in bamfile.fetch(chromosome, start): + if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar): + flag = alignedread.opt("ZG") + + try: + readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag}) + except KeyError: + readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}] + + pairCount = len(readDict.keys()) + for readID, readList in readDict.items(): + if len(readList) != 2: + del readDict[readID] + pairCount -= 1 + elif not processReads(readList, maxDist): + del readDict[readID] + + return readDict, pairCount + + +def processReads(reads, maxDist=500000): + """ For a pair of readID's to be acceptable: + - flags must be different + - neither flag can be 'NM' + - the read starts have to be within maxDist + """ process = False - start1 = reads[0]["start"] - start2 = reads[1]["start"] - dist = abs(start1 - start2) + dist = abs(reads[0]["start"] - reads[1]["start"]) flag1 = reads[0]["flag"] flag2 = reads[1]["flag"] @@ -128,6 +169,11 @@ def processReads(reads, maxDist): def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected): + """ Writes out the region information along with symbol and geneID for paired reads where one read + of the pair is in a known gene and the other is in a new region. If both reads in the pair are + in new regions the region is added to farConnected. No action is taken if both reads in the + pair are in known genes. + """ flag1, flag2 = flags total = 0 distinct = 0 @@ -147,7 +193,7 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou except KeyError: farConnected[geneID] = [farFlag] elif read1IsGood or read2IsGood: - total += 1 + total = 1 if read2IsGood: farFlag = flag2 geneID = flag1 @@ -155,30 +201,35 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou farFlag = flag1 geneID = flag2 - try: - if genome == "dmelanogaster": - symbol = geneInfoDict["Dmel_%s" % geneID][0][0] - else: - symbol = geneInfoDict[geneID][0][0] - except (KeyError, IndexError): - try: - symbol = geneAnnotDict[(genome, geneID)][0] - except (KeyError, IndexError): - symbol = "LOC%s" % geneID - - symbol = symbol.strip() - symbol = symbol.replace(" ","|") - symbol = symbol.replace("\t","|") - + symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict) if farFlag not in assigned: assigned[farFlag] = (symbol, geneID) print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip()) outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag])) - distinct += 1 + distinct = 1 return total, distinct +def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict): + try: + if genome == "dmelanogaster": + symbol = geneInfoDict["Dmel_%s" % geneID][0][0] + else: + symbol = geneInfoDict[geneID][0][0] + except (KeyError, IndexError): + try: + symbol = geneAnnotDict[(genome, geneID)][0] + except (KeyError, IndexError): + symbol = "LOC%s" % geneID + + symbol = symbol.strip() + symbol = symbol.replace(" ","|") + symbol = symbol.replace("\t","|") + + return symbol + + def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile): total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile) writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)