runRNAPairedAnalysis.py

   1 import sys
   2 import optparse
   3 import ReadDataset
   4 from commoncode import getConfigParser, getConfigBoolOption, writeLog
   5 from checkrmask import checkrmask
   6 from geneMrnaCounts import geneMrnaCounts
   7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
   8 from regionCounts import regionCounts
   9 from rnafarPairs import rnaFarPairs
  10 from normalizeFinalExonic import normalizeFinalExonic
  11 from normalizeExpandedExonic import normalizeExpandedExonic
  12 from findall import findall, RegionFinder
  13
  14 VERSION = "1.0"
  15
  16 def main(argv=None):
  17     if not argv:
  18         argv = sys.argv
  19
  20     print "runRNAPairedAnalysis: version %s" % VERSION
  21     usage = "usage: python runRNAPairedAnalysis.py genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]"
  22
  23     parser = getParser(usage)
  24     (options, args) = parser.parse_args(argv[1:])
  25
  26     if len(args) < 3:
  27         print usage
  28         sys.exit(1)
  29
  30     genome = args[0]
  31     rdsprefix = args[1]
  32     repeatmaskdb = args[2]
  33     try:
  34         modelfile = args[3]
  35     except IndexError:
  36         modelfile = ""
  37
  38     runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile=modelfile, replacemodels=options.replacemodels)
  39
  40
  41 def getParser(usage):
  42     parser = optparse.OptionParser(usage=usage)
  43     parser.add_option("--replacemodels", action="store_true", dest="replacemodels")
  44
  45     configParser = getConfigParser()
  46     section = "RunRNAPairedAnalysis"
  47     replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)
  48     parser.set_defaults(replacemodels=replacemodels)
  49
  50     return parser
  51
  52
  53 def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
  54     """ based on original script runRNAPairedAnalysis.sh
  55         usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
  56             where rdsprefix is the name of the rds file without the .rds extension
  57             use "none" for the repeatmaskdb if you do not have one
  58     """
  59
  60     rdsfile = "%s.rds" % rdsprefix
  61     logfile = "rna.log"
  62
  63     # log the parameters
  64     message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
  65     writeLog(logfile, "runRNAPairedAnalysis.py", message)
  66
  67     # count the unique reads falling on the gene models ; the nomatch files are
  68     # mappable reads that fell outside of the Cistematic gene models and not the
  69     # unmappable of Eland (i.e, the "NM" reads)
  70     uniquecountfilename = "%s.uniqs.count" % rdsprefix
  71     geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
  72
  73     # calculate a first-pass RPKM to re-weigh the unique reads,
  74     # using 'none' for the splice count
  75     initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
  76     RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
  77     (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
  78     readCounts = {}
  79     readCounts["uniq"] = ucount
  80     readCounts["splice"] = mcount
  81     readCounts["multi"] = scount
  82     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  83
  84     # recount the unique reads with weights calculated during the first pass
  85     uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
  86     geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
  87
  88     # count splice reads
  89     splicecountfilename = "%s.splices.count" % rdsprefix
  90     geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
  91
  92     # find new regions outside of gene models with reads piled up
  93     newregionfilename = "%s.newregions.txt" % rdsprefix
  94     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
  95     findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
  96
  97     # filter out new regions that overlap repeats more than a certain fraction
  98     outFileName = "%s.newregions.repstatus" % rdsprefix
  99     goodFileName = "%s.newregions.good" % rdsprefix
 100     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 101
 102     # calculate the read densities
 103     regionfilename = "%s.newregions.checked" % rdsprefix
 104     regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
 105
 106     # map all candidate regions that have paired ends overlapping with known genes
 107     candidatefilename = "%s.candidates.txt" % rdsprefix
 108     rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
 109
 110     expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
 111     # calculate expanded exonic read density
 112     acceptedfilename = "%s.accepted.rpkm" % rdsprefix
 113     try:
 114         candidatefile = open(candidatefilename)
 115         candidateLines = candidatefile.readlines()
 116         candidatefile.close()
 117     except IOError:
 118         candidateLines = []
 119
 120     normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
 121                             acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 122
 123     # weigh multi-reads
 124     multicountfilename = "%s.multi.count" % rdsprefix
 125     acceptfile = "%s.accepted.rpkm" % rdsprefix
 126     geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
 127                            extendGenome=modelfile, replaceModels=replacemodels)
 128
 129     # calculate final exonic read density
 130     outfilename = "%s.final.rpkm" % rdsprefix
 131     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 132
 133
 134 if __name__ == "__main__":
 135     main(sys.argv)