X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=runStrandedAnalysis.py;fp=runStrandedAnalysis.py;h=35a67008d196ea0c942e2fb94614a7e76f612f7e;hp=55c87ca76e48a1e272bf1184ec6fd3bd5ff9b1e9;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/runStrandedAnalysis.py b/runStrandedAnalysis.py index 55c87ca..35a6700 100644 --- a/runStrandedAnalysis.py +++ b/runStrandedAnalysis.py @@ -1,7 +1,7 @@ import sys import optparse -import ReadDataset -from commoncode import writeLog +import pysam +from commoncode import writeLog, getHeaderComment from checkrmask import checkrmask from geneMrnaCounts import geneMrnaCounts from geneMrnaCountsWeighted import geneMrnaCountsWeighted @@ -40,58 +40,57 @@ def getParser(usage): return parser -def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius): +def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius): """ based on original script runStrandedAnalysis.sh usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius where rdsprefix is the name of the rds file without the .rds extension use "none" for the repeatmaskdb if you do not have one """ - rdsfile = "%s.rds" % rdsprefix + bamfilename = "%s.bam" % fileprefix + bamfile = pysam.Samfile(bamfilename, "rb") logfile = "rna.log" # log the parameters - message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius) + message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius) writeLog(logfile, "runStrandedAnalysis.py", message) # count the unique reads falling on the gene models ; the nomatch files are # mappable reads that fell outside of the Cistematic gene models and not the # unmappable of Eland (i.e, the "NM" reads) - uniquecountfilename = "%s.uniqs.count" % rdsprefix - geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True) + uniquecountfilename = "%s.uniqs.count" % fileprefix + geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True) # calculate a first-pass RPKM to re-weigh the unique reads, # using 'none' for the splice count - initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix - RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False) - (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False) + initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix readCounts = {} - readCounts["uniq"] = ucount - readCounts["splice"] = mcount - readCounts["multi"] = scount + readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique")) + readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices")) + readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis")) normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True) # recount the unique reads with weights calculated during the first pass - initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix - uniquerecountfilename = "%s.uniqs.recount" % rdsprefix - geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False) + initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix + uniquerecountfilename = "%s.uniqs.recount" % fileprefix + geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False) # count splice reads - splicecountfilename = "%s.splices.count" % rdsprefix - geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1) + splicecountfilename = "%s.splices.count" % fileprefix + geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False) # find new regions outside of gene models with reads piled up - newregionfilename = "%s.newregions.txt" % rdsprefix + newregionfilename = "%s.newregions.txt" % fileprefix regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus") - findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False) + findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False) regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus") - findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a") + findall(regionFinder, bamfile, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a") # filter out new regions that overlap repeats more than a certain fraction - newregionfilename = "%s.newregions.txt" % rdsprefix - outFileName = "%s.newregions.repstatus" % rdsprefix - goodFileName = "%s.newregions.good" % rdsprefix + newregionfilename = "%s.newregions.txt" % fileprefix + outFileName = "%s.newregions.repstatus" % fileprefix + goodFileName = "%s.newregions.good" % fileprefix checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile) #TODO: these calls look wrong @@ -100,28 +99,21 @@ def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius): #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good # map all candidate regions that are within a given radius of a gene in bp - candidatefilename = "%s.candidates.txt" % rdsprefix + candidatefilename = "%s.candidates.txt" % fileprefix getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True) - expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix + expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix # calculate expanded exonic read density - acceptedfilename = "%s.accepted.rpkm" % rdsprefix - try: - candidatefile = open(candidatefilename) - candidateLines = candidatefile.readlines() - candidatefile.close() - except IOError: - candidateLines = [] - - normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename, - doCache=True) + acceptedfilename = "%s.accepted.rpkm" % fileprefix + normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename, + acceptedfilename=acceptedfilename, doCache=True) # weigh multi-reads - multicountfilename = "%s.multi.count" % rdsprefix - geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1) + multicountfilename = "%s.multi.count" % fileprefix + geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1) # calculate final exonic read density - outfilename = "%s.final.rpkm" % rdsprefix + outfilename = "%s.final.rpkm" % fileprefix normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)