except:
pass
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import pysam
from cistematic.genomes import Genome
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
-print "%prog: version 5.6"
+print "normalizeExpandedExonic: version 5.7"
def main(argv=None):
if not argv:
argv = sys.argv
- usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
-
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--gidField", type="int", dest="fieldID")
- parser.add_option("--maxLength", type="float", dest="maxLength")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--models", dest="extendGenome")
- parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
- parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
- replaceModels=False)
+ usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(sys.argv) < 6:
splicecountfile = args[3]
outfile = args[4]
- candidateLines = []
+ candidatefilename = ""
acceptedfilename = ""
- if len(args) > 5:
- try:
- candidatefile = open(args[5])
- candidateLines = candidatefile.readlines()
- candidatefile.close()
- acceptedfilename = args[6]
- except IndexError:
- pass
-
- normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
- candidateLines, acceptedfilename, options.fieldID,
+ try:
+ candidatefilename = args[5]
+ acceptedfilename = args[6]
+ except IndexError:
+ pass
+
+ bamfile = pysam.Samfile(hitfile, "rb")
+ uniqcount = getHeaderComment(bamfile.header, "Unique")
+
+ normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
+ candidatefilename, acceptedfilename, options.fieldID,
options.maxLength, options.doCache, options.extendGenome,
options.replaceModels)
-def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
- outfilename, candidateLines=[], acceptedfilename="",
- fieldID=0, maxLength=1000000000., doCache=False,
- extendGenome="", replaceModels=False):
-
- uniquecountfile = open(uniquecountfilename)
-
- if acceptedfilename:
- acceptedfile = open(acceptedfilename, "w")
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--gidField", type="int", dest="fieldID")
+ parser.add_option("--maxLength", type="float", dest="maxLength")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
- dosplicecount = False
- if splicecountfilename != "none":
- dosplicecount = True
- splicecountfile = open(splicecountfilename)
+ configParser = getConfigParser()
+ section = "normalizeExpandedExonic"
+ fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
+ maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
+ doCache = getConfigBoolOption(configParser, section, "doCache", False)
+ extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+ replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
- if extendGenome:
- if replaceModels:
- print "will replace gene models with %s" % extendGenome
- else:
- print "will extend gene models with %s" % extendGenome
+ parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
+ replaceModels=replaceModels)
- if doCache:
- cacheGeneDB(genome)
- hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
- print "%s cached" % genome
- else:
- hg = Genome(genome, inRAM=True)
+ return parser
- if extendGenome != "":
- hg.extendFeatures(extendGenome, replace=replaceModels)
- RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)
- uniqcount = RDS.getUniqsCount()
+def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
+ outfilename, candidatefilename="", acceptedfilename="",
+ fieldID=0, maxLength=1000000000., doCache=False,
+ extendGenome="", replaceModels=False):
+
print "%d unique reads" % uniqcount
-
- splicecount = 0
- countDict = {}
- gidList = []
- farList = []
- candidateDict = {}
-
- gidToGeneDict = {}
-
- featuresDict = hg.getallGeneFeatures()
- print "got featuresDict"
-
- outfile = open(outfilename, "w")
-
- for line in uniquecountfile:
- fields = line.strip().split()
- gid = fields[fieldID]
- gene = fields[1]
- countDict[gid] = float(fields[-1])
- gidList.append(gid)
- gidToGeneDict[gid] = gene
-
- uniquecountfile.close()
-
- if dosplicecount:
- for line in splicecountfile:
- fields = line.strip().split()
- gid = fields[fieldID]
- try:
- countDict[gid] += float(fields[-1])
- except:
- print fields
- continue
-
- splicecount += float(fields[-1])
-
- splicecountfile.close()
-
- for line in candidateLines:
- if "#" in line:
- continue
-
- fields = line.strip().split()
- gid = fields[1]
- gene = fields[0]
- if gid not in gidList:
- if gid not in farList:
- farList.append(gid)
- gidToGeneDict[gid] = gene
-
- if gid not in countDict:
- countDict[gid] = 0
-
- countDict[gid] += float(fields[6])
-
- if gid not in candidateDict:
- candidateDict[gid] = []
-
- candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
+ gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="")
totalCount = (uniqcount + splicecount) / 1000000.
uniqScale = uniqcount / 1000000.
+ if acceptedfilename:
+ acceptedfile = open(acceptedfilename, "w")
+
+ outfile = open(outfilename, "w")
+ featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+ print "got featuresDict"
for gid in gidList:
- gene = gidToGeneDict[gid]
+ gene = gidToGeneDict[gid]["name"]
featureList = []
try:
featureList = featuresDict[gid]
elif geneLength > maxLength:
geneLength = maxLength
- rpm = countDict[gid] / totalCount
+ rpm = gidToGeneDict[gid]["count"] / totalCount
rpkm = rpm / geneLength
if gid in candidateDict:
for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
- cratio = cCount / (cLength / 1000.)
- cratio = (uniqScale * cratio) / totalCount
+ kilobaseLength = cLength / 1000.
+ cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
if 10. * cratio < rpkm:
continue
- countDict[gid] += cCount
- geneLength += cLength / 1000.
- acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+ gidToGeneDict[gid]["count"] += cCount
+ geneLength += kilobaseLength
+ try:
+ acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+ except TypeError:
+ pass
- rpm = countDict[gid] / totalCount
+ rpm = gidToGeneDict[gid]["count"] / totalCount
rpkm = rpm / geneLength
outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
- for gid in farList:
- gene = gidToGeneDict[gid]
- geneLength = 0
- for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
- geneLength += cLength / 1000.
-
+ for (gid, geneLength) in farList:
if geneLength < 0.1:
continue
- for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
- cratio = cCount / (cLength / 1000.)
- cratio = cratio / totalCount
- acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
- rpm = countDict[gid] / totalCount
+ gene = gidToGeneDict[gid]["name"]
+ if acceptedfilename:
+ for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+ kilobaseLength = cLength / 1000.
+ cratio = (cCount / kilobaseLength) / totalCount
+ try:
+ acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+ except TypeError:
+ pass
+
+ rpm = gidToGeneDict[gid]["count"] / totalCount
rpkm = rpm / geneLength
outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
except:
pass
+
+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
+
+ gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+ uniqueGidList = gidToGeneDict.keys()
+ if candidatefilename:
+ candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+ else:
+ candidateDict = {}
+ geneDictUpdates = {}
+
+ farList = []
+ for gid in geneDictUpdates:
+ gidToGeneDict[gid] = geneDictUpdates[gid]
+ geneLength = 0.
+ for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+ geneLength += cLength / 1000.
+
+ farList.append((gid, geneLength))
+
+ return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+ gidToGeneDict = {}
+ uniquecountfile = open(uniquecountfilename)
+ for line in uniquecountfile:
+ fields = line.strip().split()
+ gid = fields[fieldID]
+ gene = fields[1]
+ gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+ uniquecountfile.close()
+ splicecount = 0.
+ if splicecountfilename != "none":
+ splicecountfile = open(splicecountfilename)
+ for line in splicecountfile:
+ fields = line.strip().split()
+ gid = fields[fieldID]
+ try:
+ gidToGeneDict[gid]["count"] += float(fields[-1])
+ except:
+ print fields
+ continue
+
+ splicecount += float(fields[-1])
+
+ splicecountfile.close()
+
+ return gidToGeneDict, splicecount
+
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+ """ Reads entries from the candiates file
+ gid entries not in gidList are returned for inclusion
+ creates and returns a dictionary keyed off gid with a single list of
+ tuples (count, length, chrom, start, stop) for each gid
+ """
+ geneDictUpdates = {}
+ candidateDict = {}
+ candidatefile = open(candidatefilename)
+ for line in candidatefile.readlines():
+ if "#" in line:
+ continue
+
+ fields = line.strip().split()
+ gid = fields[1]
+ gene = fields[0]
+ if gid not in gidList:
+ try:
+ geneDictUpdates[gid]["count"] += float(fields[6])
+ except KeyError:
+ geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+ try:
+ candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+ except KeyError:
+ candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+ candidatefile.close()
+
+ return candidateDict, geneDictUpdates
+
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
+ if extendGenome:
+ if replaceModels:
+ print "will replace gene models with %s" % extendGenome
+ else:
+ print "will extend gene models with %s" % extendGenome
+
+ if doCache:
+ cacheGeneDB(genome)
+ hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+ print "%s cached" % genome
+ else:
+ hg = Genome(genome, inRAM=True)
+
+ if extendGenome != "":
+ hg.extendFeatures(extendGenome, replace=replaceModels)
+
+ featuresDict = hg.getallGeneFeatures()
+
if doCache:
uncacheGeneDB(genome)
+ return featuresDict
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file