10 from cistematic.genomes import Genome
11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
12 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
14 print "normalizeExpandedExonic: version 5.7"
21 usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
23 parser = makeParser(usage)
24 (options, args) = parser.parse_args(argv[1:])
28 print "\twhere splicecountfile can be set to 'none' to not count splices\n"
33 uniquecountfile = args[2]
34 splicecountfile = args[3]
41 candidatefile = open(args[5])
42 candidateLines = candidatefile.readlines()
44 acceptedfilename = args[6]
48 RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)
49 uniqcount = RDS.getUniqsCount()
51 normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
52 candidateLines, acceptedfilename, options.fieldID,
53 options.maxLength, options.doCache, options.extendGenome,
54 options.replaceModels)
57 def makeParser(usage=""):
58 parser = optparse.OptionParser(usage=usage)
59 parser.add_option("--gidField", type="int", dest="fieldID")
60 parser.add_option("--maxLength", type="float", dest="maxLength")
61 parser.add_option("--cache", action="store_true", dest="doCache")
62 parser.add_option("--models", dest="extendGenome")
63 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
65 configParser = getConfigParser()
66 section = "normalizeExpandedExonic"
67 fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
68 maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
69 doCache = getConfigBoolOption(configParser, section, "doCache", False)
70 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
71 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
73 parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
74 replaceModels=replaceModels)
79 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
80 outfilename, candidateLines=[], acceptedfilename="",
81 fieldID=0, maxLength=1000000000., doCache=False,
82 extendGenome="", replaceModels=False):
84 uniquecountfile = open(uniquecountfilename)
87 acceptedfile = open(acceptedfilename, "w")
90 if splicecountfilename != "none":
92 splicecountfile = open(splicecountfilename)
96 print "will replace gene models with %s" % extendGenome
98 print "will extend gene models with %s" % extendGenome
102 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
103 print "%s cached" % genome
105 hg = Genome(genome, inRAM=True)
107 if extendGenome != "":
108 hg.extendFeatures(extendGenome, replace=replaceModels)
111 print "%d unique reads" % uniqcount
121 featuresDict = hg.getallGeneFeatures()
122 print "got featuresDict"
124 outfile = open(outfilename, "w")
126 for line in uniquecountfile:
127 fields = line.strip().split()
128 gid = fields[fieldID]
130 countDict[gid] = float(fields[-1])
132 gidToGeneDict[gid] = gene
134 uniquecountfile.close()
137 for line in splicecountfile:
138 fields = line.strip().split()
139 gid = fields[fieldID]
141 countDict[gid] += float(fields[-1])
146 splicecount += float(fields[-1])
148 splicecountfile.close()
150 for line in candidateLines:
154 fields = line.strip().split()
157 if gid not in gidList:
158 if gid not in farList:
160 gidToGeneDict[gid] = gene
162 if gid not in countDict:
165 countDict[gid] += float(fields[6])
167 if gid not in candidateDict:
168 candidateDict[gid] = []
170 candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
172 totalCount = (uniqcount + splicecount) / 1000000.
173 uniqScale = uniqcount / 1000000.
175 gene = gidToGeneDict[gid]
178 featureList = featuresDict[gid]
181 featureList = featuresDict[gene]
187 for (ftype, chrom, start, stop, sense) in featureList:
188 if (start, stop) not in newfeatureList:
189 newfeatureList.append((start, stop))
190 geneLength += (abs(start - stop) + 1.) / 1000.
194 elif geneLength > maxLength:
195 geneLength = maxLength
197 rpm = countDict[gid] / totalCount
198 rpkm = rpm / geneLength
199 if gid in candidateDict:
200 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
201 cratio = cCount / (cLength / 1000.)
202 cratio = (uniqScale * cratio) / totalCount
203 if 10. * cratio < rpkm:
206 countDict[gid] += cCount
207 geneLength += cLength / 1000.
208 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
210 rpm = countDict[gid] / totalCount
211 rpkm = rpm / geneLength
212 outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
215 gene = gidToGeneDict[gid]
217 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
218 geneLength += cLength / 1000.
223 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
224 cratio = cCount / (cLength / 1000.)
225 cratio = cratio / totalCount
226 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
228 rpm = countDict[gid] / totalCount
229 rpkm = rpm / geneLength
230 outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
239 uncacheGeneDB(genome)
242 if __name__ == "__main__":