# psyco is an optional JIT accelerator for Python 2.x; run without it if unavailable
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import sys
import optparse
import ReadDataset
from cistematic.genomes import Genome
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption

print "normalizeExpandedExonic: version 5.7"


def main(argv=None):
    if not argv:
        argv = sys.argv

    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]" % argv[0]

    parser = makeParser(usage)
    (options, args) = parser.parse_args(argv[1:])

    # five positional arguments are required; optparse has already removed the options from args
    if len(args) < 5:
        print usage
        print "\twhere splicecountfile can be set to 'none' to not count splices\n"
        sys.exit(1)
30
31     genome = args[0]
32     hitfile = args[1]
33     uniquecountfile = args[2]
34     splicecountfile = args[3]
35     outfile = args[4]
36
    candidateLines = []
    acceptedfilename = ""
    # candidatefile and acceptfile are optional, but only make sense as a pair
    if len(args) > 6:
        candidatefile = open(args[5])
        candidateLines = candidatefile.readlines()
        candidatefile.close()
        acceptedfilename = args[6]

    normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
                            candidateLines, acceptedfilename, options.fieldID,
                            options.maxLength, options.doCache, options.extendGenome,
                            options.replaceModels)


def makeParser(usage=""):
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--gidField", type="int", dest="fieldID")
    parser.add_option("--maxLength", type="float", dest="maxLength")
    parser.add_option("--cache", action="store_true", dest="doCache")
    parser.add_option("--models", dest="extendGenome")
    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")

    configParser = getConfigParser()
    section = "normalizeExpandedExonic"
    fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
    maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
    doCache = getConfigBoolOption(configParser, section, "doCache", False)
    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)

    parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
                        replaceModels=replaceModels)

    return parser


def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
                            outfilename, candidateLines=[], acceptedfilename="",
                            fieldID=0, maxLength=1000000000., doCache=False,
                            extendGenome="", replaceModels=False):
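    """Normalize per-gene read counts from an RDS dataset to RPKM.

    Unique read counts (and, unless splicecountfilename is 'none', spliced read
    counts) are summed per gene ID, divided by the total counted reads in
    millions and by the non-redundant gene-model length in kb, and written to
    outfilename. Optional candidate regions whose read density is close enough
    to the gene's RPKM are folded into the totals and logged to acceptedfilename.
    """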

    uniquecountfile = open(uniquecountfilename)

    if acceptedfilename:
        acceptedfile = open(acceptedfilename, "w")

    dosplicecount = False
    if splicecountfilename != "none":
        dosplicecount = True
        splicecountfile = open(splicecountfilename)

    if extendGenome:
        if replaceModels:
            print "will replace gene models with %s" % extendGenome
        else:
            print "will extend gene models with %s" % extendGenome

    if doCache:
        cacheGeneDB(genome)
        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
        print "%s cached" % genome
    else:
        hg = Genome(genome, inRAM=True)

    if extendGenome != "":
        hg.extendFeatures(extendGenome, replace=replaceModels)

    RDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache, reportCount=False)
    uniqcount = RDS.getUniqsCount()
    print "%d unique reads" % uniqcount

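    # per-gene read totals accumulate in countDict; gidList holds gene IDs seen
    # in the unique count file, farList holds gene IDs seen only as candidates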
    splicecount = 0
    countDict = {}
    gidList = []
    farList = []
    candidateDict = {}

    gidToGeneDict = {}

    featuresDict = hg.getallGeneFeatures()
    print "got featuresDict"

    outfile = open(outfilename, "w")

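    # each unique count line carries a gene ID in column fieldID, a gene symbol
    # in column 1 and the read count in the last column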
    for line in uniquecountfile:
        fields = line.strip().split()
        gid = fields[fieldID]
        gene = fields[1]
        countDict[gid] = float(fields[-1])
        gidList.append(gid)
        gidToGeneDict[gid] = gene

    uniquecountfile.close()

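    # spliced read counts, when present, are added to the same per-gene totals
    # and tracked separately so they contribute to the per-million denominator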
    if dosplicecount:
        for line in splicecountfile:
            fields = line.strip().split()
            gid = fields[fieldID]
            try:
                countDict[gid] += float(fields[-1])
            except (KeyError, ValueError):
                # gid missing from the unique count file or malformed count
                print fields
                continue

            splicecount += float(fields[-1])

        splicecountfile.close()

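    # candidate lines describe putative expressed regions: gene symbol, gene ID,
    # chromosome, start and stop in columns 3-5, and a read count in column 6;
    # gene IDs not present in the unique count file become novel ("far") loci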
    for line in candidateLines:
        if "#" in line:
            continue

        fields = line.strip().split()
        gid = fields[1]
        gene = fields[0]
        if gid not in gidList:
            if gid not in farList:
                farList.append(gid)
                gidToGeneDict[gid] = gene

            if gid not in countDict:
                countDict[gid] = 0

            countDict[gid] += float(fields[6])

        if gid not in candidateDict:
            candidateDict[gid] = []

        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))

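    # scale factors: totalCount is all counted reads in millions (the RPM
    # denominator); uniqScale is unique reads in millions, used to rescale
    # candidate-region densities before comparing them to a gene's RPKM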
    totalCount = (uniqcount + splicecount) / 1000000.
    uniqScale = uniqcount / 1000000.
    for gid in gidList:
        gene = gidToGeneDict[gid]
        featureList = []
        try:
            featureList = featuresDict[gid]
        except KeyError:
            try:
                featureList = featuresDict[gene]
            except KeyError:
                print gene, gid

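        # collapse duplicate (start, stop) pairs so each distinct feature
        # contributes to the gene-model length (in kb) only once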
        newfeatureList = []
        geneLength = 0.
        for (ftype, chrom, start, stop, sense) in featureList:
            if (start, stop) not in newfeatureList:
                newfeatureList.append((start, stop))
                geneLength += (abs(start - stop) + 1.) / 1000.

        if geneLength < 0.1:
            geneLength = 0.1
        elif geneLength > maxLength:
            geneLength = maxLength

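        # RPKM = reads per million counted reads per kb of gene model; candidate
        # regions are accepted only if their scaled density is at least a tenth
        # of the gene's current RPKM, then folded into the count and length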
        rpm = countDict[gid] / totalCount
        rpkm = rpm / geneLength
        if gid in candidateDict:
            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
                cratio = cCount / (cLength / 1000.)
                cratio = (uniqScale * cratio) / totalCount
                if 10. * cratio < rpkm:
                    continue

                countDict[gid] += cCount
                geneLength += cLength / 1000.
                if acceptedfilename:
                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))

        rpm = countDict[gid] / totalCount
        rpkm = rpm / geneLength
        outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))

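    # gene IDs seen only in the candidate file ("far" loci) have no curated gene
    # model, so their length and counts come entirely from candidate regions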
    for gid in farList:
        gene = gidToGeneDict[gid]
        geneLength = 0.
        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
            geneLength += cLength / 1000.

        if geneLength < 0.1:
            continue

        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
            cratio = cCount / (cLength / 1000.)
            cratio = cratio / totalCount
            if acceptedfilename:
                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))

        rpm = countDict[gid] / totalCount
        rpkm = rpm / geneLength
        outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gene, gene, geneLength, rpkm))

    outfile.close()
    if acceptedfilename:
        acceptedfile.close()

    if doCache:
        uncacheGeneDB(genome)


if __name__ == "__main__":
    main(sys.argv)
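
# Example invocation (hypothetical file names), following the usage string above:
#   python normalizeExpandedExonic.py mouse sample.rds sample.uniqs.count \
#       sample.splices.count sample.final.rpkm --cache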