X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=normalizeExpandedExonic.py;fp=normalizeExpandedExonic.py;h=e4eb12e7d1e1b35a6e09a104346e6d22d2d92a6d;hp=cf02cb3af4e26d987a971817057c9e790dae874d;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py index cf02cb3..e4eb12e 100644 --- a/normalizeExpandedExonic.py +++ b/normalizeExpandedExonic.py @@ -6,10 +6,10 @@ except: import sys import optparse -import ReadDataset +import pysam from cistematic.genomes import Genome from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB -from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption +from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment print "normalizeExpandedExonic: version 5.7" @@ -18,7 +18,7 @@ def main(argv=None): if not argv: argv = sys.argv - usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]" + usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]" parser = makeParser(usage) (options, args) = parser.parse_args(argv[1:]) @@ -34,22 +34,19 @@ def main(argv=None): splicecountfile = args[3] outfile = args[4] - candidateLines = [] + candidatefilename = "" acceptedfilename = "" - if len(args) > 5: - try: - candidatefile = open(args[5]) - candidateLines = candidatefile.readlines() - candidatefile.close() - acceptedfilename = args[6] - except IndexError: - pass - - RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False) - uniqcount = RDS.getUniqsCount() + try: + candidatefilename = args[5] + acceptedfilename = args[6] + except IndexError: + pass + + bamfile = pysam.Samfile(hitfile, "rb") + uniqcount = getHeaderComment(bamfile.header, "Unique") normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile, - candidateLines, acceptedfilename, options.fieldID, + candidatefilename, acceptedfilename, options.fieldID, options.maxLength, options.doCache, options.extendGenome, options.replaceModels) @@ -77,102 +74,22 @@ def makeParser(usage=""): def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename, - outfilename, candidateLines=[], acceptedfilename="", + outfilename, candidatefilename="", acceptedfilename="", fieldID=0, maxLength=1000000000., doCache=False, extendGenome="", replaceModels=False): - - uniquecountfile = open(uniquecountfilename) - + + print "%d unique reads" % uniqcount + gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="") + totalCount = (uniqcount + splicecount) / 1000000. + uniqScale = uniqcount / 1000000. if acceptedfilename: acceptedfile = open(acceptedfilename, "w") - dosplicecount = False - if splicecountfilename != "none": - dosplicecount = True - splicecountfile = open(splicecountfilename) - - if extendGenome: - if replaceModels: - print "will replace gene models with %s" % extendGenome - else: - print "will extend gene models with %s" % extendGenome - - if doCache: - cacheGeneDB(genome) - hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True) - print "%s cached" % genome - else: - hg = Genome(genome, inRAM=True) - - if extendGenome != "": - hg.extendFeatures(extendGenome, replace=replaceModels) - - - print "%d unique reads" % uniqcount - - splicecount = 0 - countDict = {} - gidList = [] - farList = [] - candidateDict = {} - - gidToGeneDict = {} - - featuresDict = hg.getallGeneFeatures() - print "got featuresDict" - outfile = open(outfilename, "w") - - for line in uniquecountfile: - fields = line.strip().split() - gid = fields[fieldID] - gene = fields[1] - countDict[gid] = float(fields[-1]) - gidList.append(gid) - gidToGeneDict[gid] = gene - - uniquecountfile.close() - - if dosplicecount: - for line in splicecountfile: - fields = line.strip().split() - gid = fields[fieldID] - try: - countDict[gid] += float(fields[-1]) - except: - print fields - continue - - splicecount += float(fields[-1]) - - splicecountfile.close() - - for line in candidateLines: - if "#" in line: - continue - - fields = line.strip().split() - gid = fields[1] - gene = fields[0] - if gid not in gidList: - if gid not in farList: - farList.append(gid) - gidToGeneDict[gid] = gene - - if gid not in countDict: - countDict[gid] = 0 - - countDict[gid] += float(fields[6]) - - if gid not in candidateDict: - candidateDict[gid] = [] - - candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])) - - totalCount = (uniqcount + splicecount) / 1000000. - uniqScale = uniqcount / 1000000. + featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels) + print "got featuresDict" for gid in gidList: - gene = gidToGeneDict[gid] + gene = gidToGeneDict[gid]["name"] featureList = [] try: featureList = featuresDict[gid] @@ -194,38 +111,41 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf elif geneLength > maxLength: geneLength = maxLength - rpm = countDict[gid] / totalCount + rpm = gidToGeneDict[gid]["count"] / totalCount rpkm = rpm / geneLength if gid in candidateDict: for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]: - cratio = cCount / (cLength / 1000.) - cratio = (uniqScale * cratio) / totalCount + kilobaseLength = cLength / 1000. + cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount if 10. * cratio < rpkm: continue - countDict[gid] += cCount - geneLength += cLength / 1000. - acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene)) + gidToGeneDict[gid]["count"] += cCount + geneLength += kilobaseLength + try: + acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene)) + except TypeError: + pass - rpm = countDict[gid] / totalCount + rpm = gidToGeneDict[gid]["count"] / totalCount rpkm = rpm / geneLength outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm)) - for gid in farList: - gene = gidToGeneDict[gid] - geneLength = 0 - for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]: - geneLength += cLength / 1000. - + for (gid, geneLength) in farList: if geneLength < 0.1: continue - for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]: - cratio = cCount / (cLength / 1000.) - cratio = cratio / totalCount - acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene)) - - rpm = countDict[gid] / totalCount + gene = gidToGeneDict[gid]["name"] + if acceptedfilename: + for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]: + kilobaseLength = cLength / 1000. + cratio = (cCount / kilobaseLength) / totalCount + try: + acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene)) + except TypeError: + pass + + rpm = gidToGeneDict[gid]["count"] / totalCount rpkm = rpm / geneLength outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm)) @@ -235,9 +155,115 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf except: pass + +def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""): + + gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID) + uniqueGidList = gidToGeneDict.keys() + if candidatefilename: + candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList) + else: + candidateDict = {} + geneDictUpdates = {} + + farList = [] + for gid in geneDictUpdates: + gidToGeneDict[gid] = geneDictUpdates[gid] + geneLength = 0. + for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]: + geneLength += cLength / 1000. + + farList.append((gid, geneLength)) + + return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount + + +def getGeneDict(uniquecountfilename, splicecountfilename, fieldID): + + gidToGeneDict = {} + uniquecountfile = open(uniquecountfilename) + for line in uniquecountfile: + fields = line.strip().split() + gid = fields[fieldID] + gene = fields[1] + gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])} + + uniquecountfile.close() + splicecount = 0. + if splicecountfilename != "none": + splicecountfile = open(splicecountfilename) + for line in splicecountfile: + fields = line.strip().split() + gid = fields[fieldID] + try: + gidToGeneDict[gid]["count"] += float(fields[-1]) + except: + print fields + continue + + splicecount += float(fields[-1]) + + splicecountfile.close() + + return gidToGeneDict, splicecount + + +def processCandidatesFile(candidatefilename, fieldID, gidList): + """ Reads entries from the candiates file + gid entries not in gidList are returned for inclusion + creates and returns a dictionary keyed off gid with a single list of + tuples (count, length, chrom, start, stop) for each gid + """ + geneDictUpdates = {} + candidateDict = {} + candidatefile = open(candidatefilename) + for line in candidatefile.readlines(): + if "#" in line: + continue + + fields = line.strip().split() + gid = fields[1] + gene = fields[0] + if gid not in gidList: + try: + geneDictUpdates[gid]["count"] += float(fields[6]) + except KeyError: + geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])} + + try: + candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])) + except KeyError: + candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])] + + candidatefile.close() + + return candidateDict, geneDictUpdates + + +def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False): + if extendGenome: + if replaceModels: + print "will replace gene models with %s" % extendGenome + else: + print "will extend gene models with %s" % extendGenome + + if doCache: + cacheGeneDB(genome) + hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True) + print "%s cached" % genome + else: + hg = Genome(genome, inRAM=True) + + if extendGenome != "": + hg.extendFeatures(extendGenome, replace=replaceModels) + + featuresDict = hg.getallGeneFeatures() + if doCache: uncacheGeneDB(genome) + return featuresDict + if __name__ == "__main__": main(sys.argv) \ No newline at end of file