normalizeFinalExonic.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     pass
   6
   7 import sys, optparse
   8 from commoncode import readDataset
   9
  10 print "%prog: version 3.5" % sys.argv[0]
  11
  12 def main(argv=None):
  13     if not argv:
  14         argv = sys.argv
  15
  16     usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
  17
  18     parser = optparse.OptionParser(usage=usage)
  19     parser.add_option("--multifraction", action="store_true", dest="reportfraction")
  20     parser.add_option("--multifold", action="store_true", dest="reportFold")
  21     parser.add_option("--minrpkm", type="float", dest="minThreshold")
  22     parser.add_option("--cache", action="store_true", dest="doCache")
  23     parser.add_option("--withGID", action="store_true", dest="writeGID")
  24     parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
  25                         doCache=False, writeGID=False)
  26
  27     (options, args) = parser.parse_args(argv[1:])
  28
  29     if len(args) < 4:
  30         print usage
  31         sys.exit(1)
  32
  33     rdsfilename = argv[1]
  34     expandedRPKMfile = args[3]
  35     multicountfile = args[2]
  36     outfilename = args[3]
  37
  38     normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
  39                          options.reportFraction, options.reportFold, options.minThreshold,
  40                          options.doCache, options.writeGID)
  41
  42
  43 def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
  44                          reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
  45                          writeGID=False):
  46
  47     expandedRPKMfile = open(expandedRPKMfilename)
  48     multicountfile = open(multicountfilename)
  49
  50     if reportFraction:
  51         print "reporting fractional contribution of multireads"
  52         reportFold = False
  53     elif reportFold:
  54         print "reporting fold contribution of multireads"
  55
  56     RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
  57     uniqcount = RDS.getUniqsCount()
  58     splicecount = RDS.getSplicesCount()
  59     multicount = RDS.getMultiCount()
  60     countDict = {}
  61     multicountDict = {}
  62     lengthDict = {}
  63     gidList = []
  64
  65     uniqspliceCount = (uniqcount + splicecount) / 1000000.
  66     totalCount = (uniqcount + splicecount + multicount) / 1000000.
  67
  68     symbolDict = {}
  69
  70     for line in expandedRPKMfile:
  71         fields = line.strip().split()
  72         lineGID = fields[0]
  73         symbolDict[lineGID] = fields[1]
  74         countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
  75         lengthDict[lineGID] = float(fields[-2])
  76         multicountDict[lineGID] = 0
  77         if lineGID not in gidList:
  78             gidList.append(lineGID)
  79
  80     expandedRPKMfile.close()
  81
  82     for line in multicountfile:
  83         fields = line.strip().split()
  84         gid = fields[0]
  85         if gid in countDict:
  86             countDict[gid] += float(fields[-1])
  87             multicountDict[gid] = float(fields[-1])
  88         else:
  89             print "could not find gid %s in dictionaries" % gid
  90
  91     multicountfile.close()
  92
  93     outfile = open(outfilename, "w")
  94     outheader = "#"
  95     if writeGID:
  96         outheader += "GID\t"
  97
  98     outheader += "gene\tlen_kb\tRPKM"
  99     if reportFraction:
 100         outheader += "\tmulti/all"
 101     elif reportFold:
 102         outheader += "\tall/uniq"
 103
 104     outheader += "\n"
 105     outfile.write(outheader)
 106
 107     outlineList = []
 108     index = 0
 109     for gid in gidList:
 110         outline = ""
 111         gene = symbolDict[gid]
 112         rpm = countDict[gid] / totalCount
 113         rpkm = rpm / lengthDict[gid]
 114         if rpkm < minThreshold:
 115             continue
 116
 117         if writeGID:
 118             outline = "%s\t" % gid
 119
 120         index += 1
 121         try:
 122             multirpm = multicountDict[gid] / totalCount
 123             multirpkm = multirpm / lengthDict[gid]
 124         except:
 125             print "problem with %s - skipping " % gid
 126             continue
 127
 128         if reportFraction or reportFold:
 129             try:
 130                 if reportFraction:
 131                     multivalue = multirpkm / rpkm
 132                 else:
 133                     if rpm > multirpm:
 134                         uniqrpkm = (rpm - multirpm) / lengthDict[gid]
 135                         multivalue = rpkm / uniqrpkm
 136                     elif rpkm > 0.01:
 137                         multivalue = 100.
 138                     else:
 139                         multivalue = 1.0
 140             except:
 141                 multivalue = 0
 142
 143             outline += "%s\t%.3f\t%.2f\t%.2f\n" %  (gene, lengthDict[gid], rpkm, multivalue)
 144             outlineList.append((rpkm, outline))
 145         else:
 146             outline += "%s\t%.3f\t%.2f\n" %  (gene, lengthDict[gid], rpkm)
 147             outlineList.append((rpkm, outline))
 148
 149     outlineList.sort()
 150     outlineList.reverse()
 151
 152     for (rpkm, line) in outlineList:
 153         outfile.write(line)
 154
 155     outfile.close()
 156
 157     print "returned %d genes" % index
 158
 159
 160 if __name__ == "__main__":
 161     main(sys.argv)