normalizeExpandedExonic.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     pass
   6
   7 import sys
   8 import optparse
   9 import ReadDataset
  10 from cistematic.genomes import Genome
  11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  12 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
  13
  14 print "normalizeExpandedExonic: version 5.7"
  15
  16
  17 def main(argv=None):
  18     if not argv:
  19         argv = sys.argv
  20
  21     usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
  22
  23     parser = makeParser(usage)
  24     (options, args) = parser.parse_args(argv[1:])
  25
  26     if len(sys.argv) < 6:
  27         print usage
  28         print "\twhere splicecountfile can be set to 'none' to not count splices\n"
  29         sys.exit(1)
  30
  31     genome = args[0]
  32     hitfile = args[1]
  33     uniquecountfile = args[2]
  34     splicecountfile = args[3]
  35     outfile = args[4]
  36
  37     candidateLines = []
  38     acceptedfilename = ""
  39     if len(args) > 5:
  40         try:
  41             candidatefile = open(args[5])
  42             candidateLines = candidatefile.readlines()
  43             candidatefile.close()
  44             acceptedfilename = args[6]
  45         except IndexError:
  46             pass
  47
  48     RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)
  49     uniqcount = RDS.getUniqsCount()
  50
  51     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
  52                             candidateLines, acceptedfilename, options.fieldID,
  53                             options.maxLength, options.doCache, options.extendGenome,
  54                             options.replaceModels)
  55
  56
  57 def makeParser(usage=""):
  58     parser = optparse.OptionParser(usage=usage)
  59     parser.add_option("--gidField", type="int", dest="fieldID")
  60     parser.add_option("--maxLength", type="float", dest="maxLength")
  61     parser.add_option("--cache", action="store_true", dest="doCache")
  62     parser.add_option("--models", dest="extendGenome")
  63     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
  64
  65     configParser = getConfigParser()
  66     section = "normalizeExpandedExonic"
  67     fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
  68     maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
  69     doCache = getConfigBoolOption(configParser, section, "doCache", False)
  70     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
  71     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
  72
  73     parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
  74                         replaceModels=replaceModels)
  75
  76     return parser
  77
  78
  79 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
  80                             outfilename, candidateLines=[], acceptedfilename="",
  81                             fieldID=0, maxLength=1000000000., doCache=False,
  82                             extendGenome="", replaceModels=False):
  83
  84     uniquecountfile = open(uniquecountfilename)
  85
  86     if acceptedfilename:
  87         acceptedfile = open(acceptedfilename, "w")
  88
  89     dosplicecount = False
  90     if splicecountfilename != "none":
  91         dosplicecount = True
  92         splicecountfile = open(splicecountfilename)
  93
  94     if extendGenome:
  95         if replaceModels:
  96             print "will replace gene models with %s" % extendGenome
  97         else:
  98             print "will extend gene models with %s" % extendGenome
  99
 100     if doCache:
 101         cacheGeneDB(genome)
 102         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
 103         print "%s cached" % genome
 104     else:
 105         hg = Genome(genome, inRAM=True)
 106
 107     if extendGenome != "":
 108         hg.extendFeatures(extendGenome, replace=replaceModels)
 109
 110
 111     print "%d unique reads" % uniqcount
 112
 113     splicecount = 0
 114     countDict = {}
 115     gidList = []
 116     farList = []
 117     candidateDict = {}
 118
 119     gidToGeneDict = {}
 120
 121     featuresDict = hg.getallGeneFeatures()
 122     print "got featuresDict"
 123
 124     outfile = open(outfilename, "w")
 125
 126     for line in uniquecountfile:
 127         fields = line.strip().split()
 128         gid = fields[fieldID]
 129         gene = fields[1]
 130         countDict[gid] = float(fields[-1])
 131         gidList.append(gid)
 132         gidToGeneDict[gid] = gene
 133
 134     uniquecountfile.close()
 135
 136     if dosplicecount:
 137         for line in splicecountfile:
 138             fields = line.strip().split()
 139             gid = fields[fieldID]
 140             try:
 141                 countDict[gid] += float(fields[-1])
 142             except:
 143                 print fields
 144                 continue
 145
 146             splicecount += float(fields[-1])
 147
 148         splicecountfile.close()
 149
 150     for line in candidateLines:
 151         if "#" in line:
 152             continue
 153
 154         fields = line.strip().split()
 155         gid = fields[1]
 156         gene = fields[0]
 157         if gid not in gidList:
 158             if gid not in farList:
 159                 farList.append(gid)
 160                 gidToGeneDict[gid] = gene
 161
 162             if gid not in countDict:
 163                 countDict[gid] = 0
 164
 165             countDict[gid] += float(fields[6])
 166
 167         if gid not in candidateDict:
 168             candidateDict[gid] = []
 169
 170         candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
 171
 172     totalCount = (uniqcount + splicecount) / 1000000.
 173     uniqScale = uniqcount / 1000000.
 174     for gid in gidList:
 175         gene = gidToGeneDict[gid]
 176         featureList = []
 177         try:
 178             featureList = featuresDict[gid]
 179         except:
 180             try:
 181                 featureList = featuresDict[gene]
 182             except:
 183                 print gene, gid
 184
 185         newfeatureList = []
 186         geneLength = 0.
 187         for (ftype, chrom, start, stop, sense) in featureList:
 188             if (start, stop) not in newfeatureList:
 189                 newfeatureList.append((start, stop))
 190                 geneLength += (abs(start - stop) + 1.) / 1000.
 191
 192         if geneLength < 0.1:
 193             geneLength = 0.1
 194         elif geneLength > maxLength:
 195             geneLength = maxLength
 196
 197         rpm = countDict[gid] / totalCount
 198         rpkm = rpm / geneLength
 199         if gid in candidateDict:
 200             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
 201                 cratio = cCount / (cLength / 1000.)
 202                 cratio = (uniqScale * cratio) / totalCount
 203                 if 10. * cratio < rpkm:
 204                     continue
 205
 206                 countDict[gid] += cCount
 207                 geneLength += cLength / 1000.
 208                 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
 209
 210         rpm = countDict[gid] / totalCount
 211         rpkm = rpm / geneLength
 212         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
 213
 214     for gid in farList:
 215         gene = gidToGeneDict[gid]
 216         geneLength = 0
 217         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
 218             geneLength += cLength / 1000.
 219
 220         if geneLength < 0.1:
 221             continue
 222
 223         for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
 224             cratio = cCount / (cLength / 1000.)
 225             cratio = cratio / totalCount
 226             acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
 227
 228         rpm = countDict[gid] / totalCount
 229         rpkm = rpm / geneLength
 230         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
 231
 232     outfile.close()
 233     try:
 234         acceptedfile.close()
 235     except:
 236         pass
 237
 238     if doCache:
 239         uncacheGeneDB(genome)
 240
 241
 242 if __name__ == "__main__":
 243     main(sys.argv)