runStrandedAnalysis.py

   1 import sys
   2 import optparse
   3 import ReadDataset
   4 from commoncode import writeLog
   5 from checkrmask import checkrmask
   6 from geneMrnaCounts import geneMrnaCounts
   7 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
   8 from getallgenes import getallgenes
   9 from normalizeFinalExonic import normalizeFinalExonic
  10 from normalizeExpandedExonic import normalizeExpandedExonic
  11 from findall import findall, RegionFinder
  12
  13 VERSION = "1.0"
  14
  15 def main(argv=None):
  16     if not argv:
  17         argv = sys.argv
  18
  19     print "runStrandedAnalysis: version %s" % VERSION
  20     usage = "usage: python runStrandedAnalysis.py genome rdsprefix repeatmaskdb bpradius"
  21
  22     parser = getParser(usage)
  23     (options, args) = parser.parse_args(argv[1:])
  24
  25     if len(args) < 4:
  26         print usage
  27         sys.exit(1)
  28
  29     genome = args[0]
  30     rdsprefix = args[1]
  31     repeatmaskdb = args[2]
  32     bpradius = args[3]
  33
  34     runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius)
  35
  36
  37 def getParser(usage):
  38     parser = optparse.OptionParser(usage=usage)
  39
  40     return parser
  41
  42
  43 def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
  44     """ based on original script runStrandedAnalysis.sh
  45         usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
  46                where rdsprefix is the name of the rds file without the .rds extension
  47                use "none" for the repeatmaskdb if you do not have one
  48     """
  49
  50     rdsfile = "%s.rds" % rdsprefix
  51     logfile = "rna.log"
  52
  53     # log the parameters
  54     message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
  55     writeLog(logfile, "runStrandedAnalysis.py", message)
  56
  57     # count the unique reads falling on the gene models ; the nomatch files are
  58     # mappable reads that fell outside of the Cistematic gene models and not the
  59     # unmappable of Eland (i.e, the "NM" reads)
  60     uniquecountfilename = "%s.uniqs.count" % rdsprefix
  61     geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
  62
  63     # calculate a first-pass RPKM to re-weigh the unique reads,
  64     # using 'none' for the splice count
  65     initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
  66     RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
  67     (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
  68     readCounts = {}
  69     readCounts["uniq"] = ucount
  70     readCounts["splice"] = mcount
  71     readCounts["multi"] = scount
  72     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
  73
  74     # recount the unique reads with weights calculated during the first pass
  75     initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
  76     uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
  77     geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
  78
  79     # count splice reads
  80     splicecountfilename = "%s.splices.count" % rdsprefix
  81     geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
  82
  83     # find new regions outside of gene models with reads piled up
  84     newregionfilename = "%s.newregions.txt" % rdsprefix
  85     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
  86     findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
  87
  88     regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
  89     findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
  90
  91     # filter out new regions that overlap repeats more than a certain fraction
  92     newregionfilename = "%s.newregions.txt" % rdsprefix
  93     outFileName = "%s.newregions.repstatus" % rdsprefix
  94     goodFileName = "%s.newregions.good" % rdsprefix
  95     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
  96
  97     #TODO: these calls look wrong
  98     # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
  99     #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
 100     #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
 101
 102     # map all candidate regions that are within a given radius of a gene in bp
 103     candidatefilename = "%s.candidates.txt" % rdsprefix
 104     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
 105
 106     expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
 107     # calculate expanded exonic read density
 108     acceptedfilename = "%s.accepted.rpkm" % rdsprefix
 109     try:
 110         candidatefile = open(candidatefilename)
 111         candidateLines = candidatefile.readlines()
 112         candidatefile.close()
 113     except IOError:
 114         candidateLines = []
 115
 116     normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
 117                             doCache=True)
 118
 119     # weigh multi-reads
 120     multicountfilename = "%s.multi.count" % rdsprefix
 121     geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
 122
 123     # calculate final exonic read density
 124     outfilename = "%s.final.rpkm" % rdsprefix
 125     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 126
 127
 128 if __name__ == "__main__":
 129     main(sys.argv)