normalizeExpandedExonic.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     pass
   6
   7 import sys, optparse
   8 from commoncode import readDataset
   9 from cistematic.genomes import Genome
  10 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  11
  12 print "%prog: version 5.6"
  13
  14
  15 def main(argv=None):
  16     if not argv:
  17         argv = sys.argv
  18
  19     usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
  20
  21     parser = optparse.OptionParser(usage=usage)
  22     parser.add_option("--gidField", type="int", dest="fieldID")
  23     parser.add_option("--maxLength", type="float", dest="maxLength")
  24     parser.add_option("--cache", action="store_true", dest="doCache")
  25     parser.add_option("--models", dest="extendGenome")
  26     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  27     parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
  28                         replaceModels=False)
  29
  30     (options, args) = parser.parse_args(argv[1:])
  31
  32     if len(sys.argv) < 6:
  33         print usage
  34         print "\twhere splicecountfile can be set to 'none' to not count splices\n"
  35         sys.exit(1)
  36
  37     genome = args[0]
  38     hitfile = args[1]
  39     uniquecountfile = args[2]
  40     splicecountfile = args[3]
  41     outfile = args[4]
  42
  43     candidateLines = []
  44     acceptedfilename = ""
  45     if len(args) > 5:
  46         try:
  47             candidatefile = open(args[5])
  48             candidateLines = candidatefile.readlines()
  49             candidatefile.close()
  50             acceptedfilename = args[6]
  51         except IndexError:
  52             pass
  53
  54     normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
  55                             candidateLines, acceptedfilename, options.fieldID,
  56                             options.maxLength, options.doCache, options.extendGenome,
  57                             options.replaceModels)
  58
  59
  60 def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
  61                             outfilename, candidateLines=[], acceptedfilename="",
  62                             fieldID=0, maxLength=1000000000., doCache=False,
  63                             extendGenome="", replaceModels=False):
  64
  65     uniquecountfile = open(uniquecountfilename)
  66
  67     if acceptedfilename:
  68         acceptedfile = open(acceptedfilename, "w")
  69
  70     dosplicecount = False
  71     if splicecountfilename != "none":
  72         dosplicecount = True
  73         splicecountfile = open(splicecountfilename)
  74
  75     if extendGenome:
  76         if replaceModels:
  77             print "will replace gene models with %s" % extendGenome
  78         else:
  79             print "will extend gene models with %s" % extendGenome
  80
  81     if doCache:
  82         cacheGeneDB(genome)
  83         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
  84         print "%s cached" % genome
  85     else:
  86         hg = Genome(genome, inRAM=True)
  87
  88     if extendGenome != "":
  89         hg.extendFeatures(extendGenome, replace=replaceModels)
  90
  91     RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)
  92     uniqcount = RDS.getUniqsCount()
  93     print "%d unique reads" % uniqcount
  94
  95     splicecount = 0
  96     countDict = {}
  97     gidList = []
  98     farList = []
  99     candidateDict = {}
 100
 101     gidToGeneDict = {}
 102
 103     featuresDict = hg.getallGeneFeatures()
 104     print "got featuresDict"
 105
 106     outfile = open(outfilename, "w")
 107
 108     for line in uniquecountfile:
 109         fields = line.strip().split()
 110         gid = fields[fieldID]
 111         gene = fields[1]
 112         countDict[gid] = float(fields[-1])
 113         gidList.append(gid)
 114         gidToGeneDict[gid] = gene
 115
 116     uniquecountfile.close()
 117
 118     if dosplicecount:
 119         for line in splicecountfile:
 120             fields = line.strip().split()
 121             gid = fields[fieldID]
 122             try:
 123                 countDict[gid] += float(fields[-1])
 124             except:
 125                 print fields
 126                 continue
 127
 128             splicecount += float(fields[-1])
 129
 130         splicecountfile.close()
 131
 132     for line in candidateLines:
 133         if "#" in line:
 134             continue
 135
 136         fields = line.strip().split()
 137         gid = fields[1]
 138         gene = fields[0]
 139         if gid not in gidList:
 140             if gid not in farList:
 141                 farList.append(gid)
 142                 gidToGeneDict[gid] = gene
 143
 144             if gid not in countDict:
 145                 countDict[gid] = 0
 146
 147             countDict[gid] += float(fields[6])
 148
 149         if gid not in candidateDict:
 150             candidateDict[gid] = []
 151
 152         candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
 153
 154     totalCount = (uniqcount + splicecount) / 1000000.
 155     uniqScale = uniqcount / 1000000.
 156     for gid in gidList:
 157         gene = gidToGeneDict[gid]
 158         featureList = []
 159         try:
 160             featureList = featuresDict[gid]
 161         except:
 162             try:
 163                 featureList = featuresDict[gene]
 164             except:
 165                 print gene, gid
 166
 167         newfeatureList = []
 168         geneLength = 0.
 169         for (ftype, chrom, start, stop, sense) in featureList:
 170             if (start, stop) not in newfeatureList:
 171                 newfeatureList.append((start, stop))
 172                 geneLength += (abs(start - stop) + 1.) / 1000.
 173
 174         if geneLength < 0.1:
 175             geneLength = 0.1
 176         elif geneLength > maxLength:
 177             geneLength = maxLength
 178
 179         rpm = countDict[gid] / totalCount
 180         rpkm = rpm / geneLength
 181         if gid in candidateDict:
 182             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
 183                 cratio = cCount / (cLength / 1000.)
 184                 cratio = (uniqScale * cratio) / totalCount
 185                 if 10. * cratio < rpkm:
 186                     continue
 187
 188                 countDict[gid] += cCount
 189                 geneLength += cLength / 1000.
 190                 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
 191
 192         rpm = countDict[gid] / totalCount
 193         rpkm = rpm / geneLength
 194         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
 195
 196     for gid in farList:
 197         gene = gidToGeneDict[gid]
 198         geneLength = 0
 199         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
 200             geneLength += cLength / 1000.
 201
 202         if geneLength < 0.1:
 203             continue
 204
 205         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
 206             cratio = cCount / (cLength / 1000.)
 207             cratio = cratio / totalCount
 208             acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
 209
 210         rpm = countDict[gid] / totalCount
 211         rpkm = rpm / geneLength
 212         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
 213
 214     outfile.close()
 215     try:
 216         acceptedfile.close()
 217     except:
 218         pass
 219
 220     if doCache:
 221         uncacheGeneDB(genome)
 222
 223
 224 if __name__ == "__main__":
 225     main(sys.argv)