4 from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
5 from checkrmask import checkrmask
6 from geneMrnaCounts import geneMrnaCounts
7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
8 from getallgenes import getallgenes
9 from normalizeFinalExonic import normalizeFinalExonic
10 from normalizeExpandedExonic import normalizeExpandedExonic
11 from findall import findall, RegionFinder
# Fragment of the command-line entry point.
# NOTE(review): the enclosing definition's header and the statements that
# assign argv, genome, fileprefix and modelfile are not visible in this
# listing -- confirm them against the complete file.
# Print the version banner (VERSION is defined outside the visible lines).
19 print "runStandardAnalysis: version %s" % VERSION
20 usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
# Option parsing is delegated to getParser; the first element of argv is
# skipped before parsing.
22 parser = getParser(usage)
23 (options, args) = parser.parse_args(argv[1:])
# Third and fourth positional arguments; int() raises ValueError when the
# radius is not an integer literal.
31 repeatmaskdb = args[2]
32 bpradius = int(args[3])
# Hand everything to the pipeline driver; replacemodels comes from the parsed
# options rather than from the positional arguments.
38 runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
def getParser(usage):
    """ Build the command-line option parser for runStandardAnalysis.

        usage is handed to optparse.OptionParser as its usage string.

        The parser knows one option, --replacemodels, a store_true flag.
        Its default is whatever getConfigBoolOption returns for the
        "replacemodels" option of the "RunStandardAnalysis" section of the
        parser obtained from getConfigParser(); False is passed as the last
        argument of that lookup (presumably the fallback value).

        Returns the configured optparse.OptionParser.
    """
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--replacemodels", action="store_true", dest="replacemodels")

    # the configuration supplies the default; an explicit --replacemodels on
    # the command line still sets the option to True
    configParser = getConfigParser()
    section = "RunStandardAnalysis"
    replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)
    parser.set_defaults(replacemodels=replacemodels)

    return parser
def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
    """ Run every stage of the standard analysis on <fileprefix>.bam.

        based on original script runStandardAnalysis.sh
        usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
            where fileprefix is the name of the bam file without the .bam extension
            use "none" for the repeatmaskdb if you do not have one

        genome, modelfile (as extendGenome) and replacemodels (as replaceModels)
        are forwarded to the counting, normalization and gene-mapping stages;
        repeatmaskdb goes to checkrmask and bpradius to getallgenes as maxRadius.
        Every intermediate and final file name is derived from fileprefix.

        The BAM file is opened once, shared by the counting stages, and closed
        when the function exits, including when a stage raises.
    """
    bamfilename = "%s.bam" % fileprefix

    # NOTE(review): the listing this function was recovered from does not show
    # where logfile is assigned; "rna.log" is an assumption -- confirm.
    logfile = "rna.log"

    # keyword arguments shared by every stage that takes gene-model options
    modelArgs = {"extendGenome": modelfile, "replaceModels": replacemodels}

    bamfile = pysam.Samfile(bamfilename, "rb")
    try:
        # log the parameters
        message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
        writeLog(logfile, "runStandardAnalysis.py", message)

        # count the unique reads falling on the gene models ; the nomatch files are
        # mappable reads that fell outside of the Cistematic gene models and not the
        # unmappable of Eland (i.e., the "NM" reads)
        #
        # These will need to be marked by running the BAM preprocessor with the markGID option
        uniquecountfilename = "%s.uniqs.count" % fileprefix
        geneMrnaCounts(genome, bamfile, uniquecountfilename, **modelArgs)

        # calculate a first-pass RPKM to re-weigh the unique reads,
        # using 'none' for the splice count
        initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
        readCounts = {
            "uniq": int(getHeaderComment(bamfile.header, "Unique")),
            "splice": int(getHeaderComment(bamfile.header, "UniqueSplices")),
            "multi": int(getHeaderComment(bamfile.header, "Multis")),
        }
        normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename,
                                doCache=True, **modelArgs)

        # recount the unique reads with weights calculated during the first pass
        uniquerecountfilename = "%s.uniqs.recount" % fileprefix
        geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True,
                               cachePages=1, **modelArgs)

        # count the splice reads only (unique counting switched off)
        splicecountfilename = "%s.splices.count" % fileprefix
        geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, **modelArgs)

        # find new regions outside of gene models with reads piled up;
        # findall is given the BAM file name, not the open handle
        newregionfilename = "%s.newregions.txt" % fileprefix
        regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
        findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)

        # filter out new regions that overlap repeats more than a certain fraction
        outFileName = "%s.newregions.repstatus" % fileprefix
        goodFileName = "%s.newregions.good" % fileprefix
        checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1,
                   logfilename=logfile)

        # map all candidate regions that are within a given radius of a gene in bp
        candidatefilename = "%s.candidates.txt" % fileprefix
        getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True,
                    **modelArgs)

        # calculate expanded exonic read density
        expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
        acceptedfilename = "%s.accepted.rpkm" % fileprefix
        normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename,
                                expandedRPKMfilename, candidatefilename=candidatefilename,
                                acceptedfilename=acceptedfilename, doCache=True, **modelArgs)

        # count the multireads against the expanded RPKM, using the accepted file
        multicountfilename = "%s.multi.count" % fileprefix
        geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True,
                               acceptfile=acceptedfilename, cachePages=1, **modelArgs)

        # calculate final exonic read density
        outfilename = "%s.final.rpkm" % fileprefix
        normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename,
                             reportFraction=True, doCache=True, writeGID=True)

        #TODO: remove when not tracking
        writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
    finally:
        # release the BAM handle even when one of the stages fails
        bamfile.close()
# Script entry guard: the guarded statements run only when the file is executed
# directly, not when it is imported.
# NOTE(review): the guarded body is not visible in this listing -- confirm it
# against the complete file.
126 if __name__ == "__main__":