runStandardAnalysis.py

   1 import sys
   2 import optparse
   3 import pysam
   4 from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
   5 from checkrmask import checkrmask
   6 from geneMrnaCounts import geneMrnaCounts
   7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
   8 from getallgenes import getallgenes
   9 from normalizeFinalExonic import normalizeFinalExonic
  10 from normalizeExpandedExonic import normalizeExpandedExonic
  11 from findall import findall, RegionFinder
  12
  13 VERSION = "1.0"
  14
  15 def main(argv=None):
  16     if not argv:
  17         argv = sys.argv
  18
  19     print "runStandardAnalysis: version %s" % VERSION
  20     usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
  21
  22     parser = getParser(usage)
  23     (options, args) = parser.parse_args(argv[1:])
  24
  25     if len(args) < 4:
  26         print usage
  27         sys.exit(1)
  28
  29     genome = args[0]
  30     fileprefix = args[1]
  31     repeatmaskdb = args[2]
  32     bpradius = int(args[3])
  33     try:
  34         modelfile = args[4]
  35     except IndexError:
  36         modelfile = ""
  37
  38     runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
  39
  40
  41 def getParser(usage):
  42     parser = optparse.OptionParser(usage=usage)
  43     parser.add_option("--replacemodels", action="store_true", dest="replacemodels")
  44
  45     configParser = getConfigParser()
  46     section = "RunStandardAnalysis"
  47     replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)
  48     parser.set_defaults(replacemodels=replacemodels)
  49
  50     return parser
  51
  52
  53 def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
  54     """ based on original script runStandardAnalysis.sh
  55         usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
  56                where fileprefix is the name of the bam file without the .bam extension
  57                use "none" for the repeatmaskdb if you do not have one
  58     """
  59
  60     bamfilename = "%s.bam" % fileprefix
  61     bamfile = pysam.Samfile(bamfilename, "rb")
  62     logfile = "rna.log"
  63
  64     # log the parameters
  65     message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
  66     writeLog(logfile, "runStandardAnalysis.py", message)
  67
  68     # count the unique reads falling on the gene models ; the nomatch files are
  69     # mappable reads that fell outside of the Cistematic gene models and not the
  70     # unmappable of Eland (i.e, the "NM" reads)
  71     #
  72     # These will need to be marked by running the BAM preprocessor with the markGID option
  73     uniquecountfilename = "%s.uniqs.count" % fileprefix
  74     geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
  75
  76     # calculate a first-pass RPKM to re-weigh the unique reads,
  77     # using 'none' for the splice count
  78     initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
  79     readCounts = {}
  80     readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
  81     readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
  82     readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
  83     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  84
  85     # recount the unique reads with weights calculated during the first pass
  86     uniquerecountfilename = "%s.uniqs.recount" % fileprefix
  87     geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
  88
  89     # count splice reads
  90     splicecountfilename = "%s.splices.count" % fileprefix
  91     geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
  92
  93     # Alternative 1: find new regions outside of gene models with reads piled up
  94     newregionfilename = "%s.newregions.txt" % fileprefix
  95     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
  96     findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
  97
  98     # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
  99     outFileName = "%s.newregions.repstatus" % fileprefix
 100     goodFileName = "%s.newregions.good" % fileprefix
 101     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 102
 103     # map all candidate regions that are within a given radius of a gene in bp
 104     candidatefilename = "%s.candidates.txt" % fileprefix
 105     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 106
 107     expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
 108     # calculate expanded exonic read density
 109     acceptedfilename = "%s.accepted.rpkm" % fileprefix
 110     normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
 111                             acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 112
 113     # weigh multi-reads
 114     multicountfilename = "%s.multi.count" % fileprefix
 115     geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
 116                            replaceModels=replacemodels)
 117
 118     # calculate final exonic read density
 119     outfilename = "%s.final.rpkm" % fileprefix
 120     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 121
 122     #TODO: remove when not tracking
 123     writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
 124
 125
 126 if __name__ == "__main__":
 127     main(sys.argv)