-#
-# RNAFARpairs.py
-# ENRAGE
-#
-# Created by Ali Mortazavi on 11/2/08.
-#
""" usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
"""
import sys
import time
import optparse
-import ReadDataset
-from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+import pysam
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry
def main(argv=None):
genome = args[0]
goodfilename = args[1]
- rdsfile = args[2]
+ bamfilename = args[2]
outfilename = args[3]
- rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+ bamfile = pysam.Samfile(bamfilename, "rb")
+
+ rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
def makeParser(usage=""):
return parser
-def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+ """ map all candidate regions that have paired ends overlapping with known genes
+ """
+
+ chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+ regions = {}
+ for chromosome in chromosomeList:
+ regions[chromosome] = []
+
goodDict = {}
goodfile = open(goodfilename)
for line in goodfile:
fields = line.split()
- goodDict[fields[0]] = line
+ label = fields[0]
+ start = int(fields[2])
+ stop = int(fields[3])
+ goodDict[label] = line
+ regions[chromosome].append((start, stop, label))
goodfile.close()
- RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
- chromosomeList = RDS.getChromosomes()
if doVerbose:
print time.ctime()
assigned = {}
farConnected = {}
for chromosome in chromosomeList:
- if doNotProcessChromosome(chromosome):
- continue
-
print chromosome
- uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+ regionList = regions[chromosome].sort()
+ uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regionList, chromosome, maxDist)
if doVerbose:
print len(uniqDict), time.ctime()
- for readID in uniqDict:
- readList = uniqDict[readID]
- if len(readList) == 2:
- total += 1
- if processReads(readList[:2], maxDist):
- flags = (readList[0]["flag"], readList[1]["flag"])
- processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
- total += processed
- distinct += distinctPairs
+ total += pairCount
+ for readID, readList in uniqDict.items():
+ flags = (readList[0]["flag"], readList[1]["flag"])
+ processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+ total += processed
+ distinct += distinctPairs
entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
distinct += entriesWritten
print time.ctime()
-def doNotProcessChromosome(chromosome):
- return chromosome == "chrM"
+def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
+ """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
+ Only returns unique non-spliced read pairs matching the criteria given in processReads().
+ """
+ start = 1
+ readDict = {}
+ for regionstart, regionstop, regionname in regions:
+ for alignedread in bamfile.fetch(chromosome, start, regionstop):
+ if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+ if alignedread.pos >= regionstart:
+ flag = regionname
+ else:
+ flag = alignedread.opt("ZG")
+ try:
+ readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+ except KeyError:
+ readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
-def processReads(reads, maxDist):
+ start = regionstop + 1
+
+ for alignedread in bamfile.fetch(chromosome, start):
+ if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+ flag = alignedread.opt("ZG")
+
+ try:
+ readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+ except KeyError:
+ readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+ pairCount = len(readDict.keys())
+ for readID, readList in readDict.items():
+ if len(readList) != 2:
+ del readDict[readID]
+ pairCount -= 1
+ elif not processReads(readList, maxDist):
+ del readDict[readID]
+
+ return readDict, pairCount
+
+
+def processReads(reads, maxDist=500000):
+ """ For a pair of readID's to be acceptable:
+ - flags must be different
+ - neither flag can be 'NM'
+ - the read starts have to be within maxDist
+ """
process = False
- start1 = reads[0]["start"]
- start2 = reads[1]["start"]
- dist = abs(start1 - start2)
+ dist = abs(reads[0]["start"] - reads[1]["start"])
flag1 = reads[0]["flag"]
flag2 = reads[1]["flag"]
def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+ """ Writes out the region information along with symbol and geneID for paired reads where one read
+ of the pair is in a known gene and the other is in a new region. If both reads in the pair are
+ in new regions the region is added to farConnected. No action is taken if both reads in the
+ pair are in known genes.
+ """
flag1, flag2 = flags
total = 0
distinct = 0
except KeyError:
farConnected[geneID] = [farFlag]
elif read1IsGood or read2IsGood:
- total += 1
+ total = 1
if read2IsGood:
farFlag = flag2
geneID = flag1
farFlag = flag1
geneID = flag2
- try:
- if genome == "dmelanogaster":
- symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
- else:
- symbol = geneInfoDict[geneID][0][0]
- except (KeyError, IndexError):
- try:
- symbol = geneAnnotDict[(genome, geneID)][0]
- except (KeyError, IndexError):
- symbol = "LOC%s" % geneID
-
- symbol = symbol.strip()
- symbol = symbol.replace(" ","|")
- symbol = symbol.replace("\t","|")
-
+ symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
if farFlag not in assigned:
assigned[farFlag] = (symbol, geneID)
print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
- distinct += 1
+ distinct = 1
return total, distinct
+def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
+ try:
+ if genome == "dmelanogaster":
+ symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+ else:
+ symbol = geneInfoDict[geneID][0][0]
+ except (KeyError, IndexError):
+ try:
+ symbol = geneAnnotDict[(genome, geneID)][0]
+ except (KeyError, IndexError):
+ symbol = "LOC%s" % geneID
+
+ symbol = symbol.strip()
+ symbol = symbol.replace(" ","|")
+ symbol = symbol.replace("\t","|")
+
+ return symbol
+
+
def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)