import sys
import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
from checkrmask import checkrmask
from geneMrnaCounts import geneMrnaCounts
from geneMrnaCountsWeighted import geneMrnaCountsWeighted
argv = sys.argv
print "runStandardAnalysis: version %s" % VERSION
- usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+ usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
sys.exit(1)
genome = args[0]
- rdsprefix = args[1]
+ fileprefix = args[1]
repeatmaskdb = args[2]
- bpradius = args[3]
+ bpradius = int(args[3])
try:
modelfile = args[4]
except IndexError:
modelfile = ""
- runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+ runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
def getParser(usage):
return parser
-def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
""" based on original script runStandardAnalysis.sh
- usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
- where rdsprefix is the name of the rds file without the .rds extension
+ usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+ where fileprefix is the name of the bam file without the .bam extension
use "none" for the repeatmaskdb if you do not have one
"""
- rdsfile = "%s.rds" % rdsprefix
+ bamfilename = "%s.bam" % fileprefix
+ bamfile = pysam.Samfile(bamfilename, "rb")
logfile = "rna.log"
# log the parameters
- message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+ message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
writeLog(logfile, "runStandardAnalysis.py", message)
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
- uniquecountfilename = "%s.uniqs.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+ #
+ # These will need to be marked by running the BAM preprocessor with the markGID option
+ uniquecountfilename = "%s.uniqs.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
- splicecountfilename = "none"
- initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
- RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
- (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+ initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
readCounts = {}
- readCounts["uniq"] = ucount
- readCounts["splice"] = mcount
- readCounts["multi"] = scount
- normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+ readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+ readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+ readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
# recount the unique reads with weights calculated during the first pass
- uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+ uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
# count splice reads
- splicecountfilename = "%s.splices.count" % rdsprefix
- geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+ splicecountfilename = "%s.splices.count" % fileprefix
+ geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
# Alternative 1: find new regions outside of gene models with reads piled up
- newregionfilename = "%s.newregions.txt" % rdsprefix
+ newregionfilename = "%s.newregions.txt" % fileprefix
regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
- findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+ findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
- outFileName = "%s.newregions.repstatus" % rdsprefix
- goodFileName = "%s.newregions.good" % rdsprefix
+ outFileName = "%s.newregions.repstatus" % fileprefix
+ goodFileName = "%s.newregions.good" % fileprefix
checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
# map all candidate regions that are within a given radius of a gene in bp
- candidatefilename = "%s.candidates.txt" % rdsprefix
+ candidatefilename = "%s.candidates.txt" % fileprefix
getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
- expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+ expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
# calculate expanded exonic read density
- acceptedfilename = "%s.accepted.rpkm" % rdsprefix
- try:
- candidatefile = open(candidatefilename)
- candidateLines = candidatefile.readlines()
- candidatefile.close()
- except IOError:
- candidateLines = []
-
- normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
- doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+ acceptedfilename = "%s.accepted.rpkm" % fileprefix
+ normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+ acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
# weigh multi-reads
- multicountfilename = "%s.multi.count" % rdsprefix
- geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+ multicountfilename = "%s.multi.count" % fileprefix
+ geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
replaceModels=replacemodels)
# calculate final exonic read density
- outfilename = "%s.final.rpkm" % rdsprefix
+ outfilename = "%s.final.rpkm" % fileprefix
normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
+ #TODO: remove when not tracking
+ writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file