import sys
import optparse
-import ReadDataset
-from commoncode import writeLog
+import pysam
+from commoncode import writeLog, getHeaderComment
from checkrmask import checkrmask
from geneMrnaCounts import geneMrnaCounts
from geneMrnaCountsWeighted import geneMrnaCountsWeighted
return parser
-def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
""" based on original script runStrandedAnalysis.sh
usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
where rdsprefix is the name of the rds file without the .rds extension
use "none" for the repeatmaskdb if you do not have one
"""
- rdsfile = "%s.rds" % rdsprefix
+ bamfilename = "%s.bam" % fileprefix
+ bamfile = pysam.Samfile(bamfilename, "rb")
logfile = "rna.log"
# log the parameters
- message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+ message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
writeLog(logfile, "runStrandedAnalysis.py", message)
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
- uniquecountfilename = "%s.uniqs.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+ uniquecountfilename = "%s.uniqs.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
- initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
- RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
- (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+ initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
readCounts = {}
- readCounts["uniq"] = ucount
- readCounts["splice"] = mcount
- readCounts["multi"] = scount
+ readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+ readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+ readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
# recount the unique reads with weights calculated during the first pass
- initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
- uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+ initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
+ uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
# count splice reads
- splicecountfilename = "%s.splices.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+ splicecountfilename = "%s.splices.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
# find new regions outside of gene models with reads piled up
- newregionfilename = "%s.newregions.txt" % rdsprefix
+ newregionfilename = "%s.newregions.txt" % fileprefix
regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
- findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+ findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
- findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+ findall(regionFinder, bamfile, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
# filter out new regions that overlap repeats more than a certain fraction
- newregionfilename = "%s.newregions.txt" % rdsprefix
- outFileName = "%s.newregions.repstatus" % rdsprefix
- goodFileName = "%s.newregions.good" % rdsprefix
+ newregionfilename = "%s.newregions.txt" % fileprefix
+ outFileName = "%s.newregions.repstatus" % fileprefix
+ goodFileName = "%s.newregions.good" % fileprefix
checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
#TODO: these calls look wrong
#python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
# map all candidate regions that are within a given radius of a gene in bp
- candidatefilename = "%s.candidates.txt" % rdsprefix
+ candidatefilename = "%s.candidates.txt" % fileprefix
getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
- expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+ expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
# calculate expanded exonic read density
- acceptedfilename = "%s.accepted.rpkm" % rdsprefix
- try:
- candidatefile = open(candidatefilename)
- candidateLines = candidatefile.readlines()
- candidatefile.close()
- except IOError:
- candidateLines = []
-
- normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
- doCache=True)
+ acceptedfilename = "%s.accepted.rpkm" % fileprefix
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+ acceptedfilename=acceptedfilename, doCache=True)
# weigh multi-reads
- multicountfilename = "%s.multi.count" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+ multicountfilename = "%s.multi.count" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
# calculate final exonic read density
- outfilename = "%s.final.rpkm" % rdsprefix
+ outfilename = "%s.final.rpkm" % fileprefix
normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)