10 from cistematic.genomes import Genome
11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
12 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
14 print "normalizeExpandedExonic: version 5.7"
21 usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
23 parser = makeParser(usage)
24 (options, args) = parser.parse_args(argv[1:])
28 print "\twhere splicecountfile can be set to 'none' to not count splices\n"
33 uniquecountfile = args[2]
34 splicecountfile = args[3]
37 candidatefilename = ""
40 candidatefilename = args[5]
41 acceptedfilename = args[6]
45 bamfile = pysam.Samfile(hitfile, "rb")
46 uniqcount = getHeaderComment(bamfile.header, "Unique")
48 normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
49 candidatefilename, acceptedfilename, options.fieldID,
50 options.maxLength, options.doCache, options.extendGenome,
51 options.replaceModels)
54 def makeParser(usage=""):
55 parser = optparse.OptionParser(usage=usage)
56 parser.add_option("--gidField", type="int", dest="fieldID")
57 parser.add_option("--maxLength", type="float", dest="maxLength")
58 parser.add_option("--cache", action="store_true", dest="doCache")
59 parser.add_option("--models", dest="extendGenome")
60 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
62 configParser = getConfigParser()
63 section = "normalizeExpandedExonic"
64 fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
65 maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
66 doCache = getConfigBoolOption(configParser, section, "doCache", False)
67 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
68 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
70 parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
71 replaceModels=replaceModels)
76 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
77 outfilename, candidatefilename="", acceptedfilename="",
78 fieldID=0, maxLength=1000000000., doCache=False,
79 extendGenome="", replaceModels=False):
81 print "%d unique reads" % uniqcount
82 gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="")
83 totalCount = (uniqcount + splicecount) / 1000000.
84 uniqScale = uniqcount / 1000000.
86 acceptedfile = open(acceptedfilename, "w")
88 outfile = open(outfilename, "w")
89 featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
90 print "got featuresDict"
92 gene = gidToGeneDict[gid]["name"]
95 featureList = featuresDict[gid]
98 featureList = featuresDict[gene]
104 for (ftype, chrom, start, stop, sense) in featureList:
105 if (start, stop) not in newfeatureList:
106 newfeatureList.append((start, stop))
107 geneLength += (abs(start - stop) + 1.) / 1000.
111 elif geneLength > maxLength:
112 geneLength = maxLength
114 rpm = gidToGeneDict[gid]["count"] / totalCount
115 rpkm = rpm / geneLength
116 if gid in candidateDict:
117 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
118 kilobaseLength = cLength / 1000.
119 cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
120 if 10. * cratio < rpkm:
123 gidToGeneDict[gid]["count"] += cCount
124 geneLength += kilobaseLength
126 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
130 rpm = gidToGeneDict[gid]["count"] / totalCount
131 rpkm = rpm / geneLength
132 outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
134 for (gid, geneLength) in farList:
138 gene = gidToGeneDict[gid]["name"]
140 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
141 kilobaseLength = cLength / 1000.
142 cratio = (cCount / kilobaseLength) / totalCount
144 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
148 rpm = gidToGeneDict[gid]["count"] / totalCount
149 rpkm = rpm / geneLength
150 outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
159 def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
161 gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
162 uniqueGidList = gidToGeneDict.keys()
163 if candidatefilename:
164 candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
170 for gid in geneDictUpdates:
171 gidToGeneDict[gid] = geneDictUpdates[gid]
173 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
174 geneLength += cLength / 1000.
176 farList.append((gid, geneLength))
178 return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
181 def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
184 uniquecountfile = open(uniquecountfilename)
185 for line in uniquecountfile:
186 fields = line.strip().split()
187 gid = fields[fieldID]
189 gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
191 uniquecountfile.close()
193 if splicecountfilename != "none":
194 splicecountfile = open(splicecountfilename)
195 for line in splicecountfile:
196 fields = line.strip().split()
197 gid = fields[fieldID]
199 gidToGeneDict[gid]["count"] += float(fields[-1])
204 splicecount += float(fields[-1])
206 splicecountfile.close()
208 return gidToGeneDict, splicecount
211 def processCandidatesFile(candidatefilename, fieldID, gidList):
212 """ Reads entries from the candiates file
213 gid entries not in gidList are returned for inclusion
214 creates and returns a dictionary keyed off gid with a single list of
215 tuples (count, length, chrom, start, stop) for each gid
219 candidatefile = open(candidatefilename)
220 for line in candidatefile.readlines():
224 fields = line.strip().split()
227 if gid not in gidList:
229 geneDictUpdates[gid]["count"] += float(fields[6])
231 geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
234 candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
236 candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
238 candidatefile.close()
240 return candidateDict, geneDictUpdates
243 def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
246 print "will replace gene models with %s" % extendGenome
248 print "will extend gene models with %s" % extendGenome
252 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
253 print "%s cached" % genome
255 hg = Genome(genome, inRAM=True)
257 if extendGenome != "":
258 hg.extendFeatures(extendGenome, replace=replaceModels)
260 featuresDict = hg.getallGeneFeatures()
263 uncacheGeneDB(genome)
268 if __name__ == "__main__":