10 from cistematic.genomes import Genome
11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
12 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
14 print "normalizeExpandedExonic: version 5.7"
21 usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
23 parser = makeParser(usage)
24 (options, args) = parser.parse_args(argv[1:])
28 print "\twhere splicecountfile can be set to 'none' to not count splices\n"
33 uniquecountfile = args[2]
34 splicecountfile = args[3]
41 candidatefile = open(args[5])
42 candidateLines = candidatefile.readlines()
44 acceptedfilename = args[6]
48 normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
49 candidateLines, acceptedfilename, options.fieldID,
50 options.maxLength, options.doCache, options.extendGenome,
51 options.replaceModels)
54 def makeParser(usage=""):
55 parser = optparse.OptionParser(usage=usage)
56 parser.add_option("--gidField", type="int", dest="fieldID")
57 parser.add_option("--maxLength", type="float", dest="maxLength")
58 parser.add_option("--cache", action="store_true", dest="doCache")
59 parser.add_option("--models", dest="extendGenome")
60 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
62 configParser = getConfigParser()
63 section = "normalizeExpandedExonic"
64 fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
65 maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
66 doCache = getConfigBoolOption(configParser, section, "doCache", False)
67 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
68 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
70 parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
71 replaceModels=replaceModels)
76 def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
77 outfilename, candidateLines=[], acceptedfilename="",
78 fieldID=0, maxLength=1000000000., doCache=False,
79 extendGenome="", replaceModels=False):
81 uniquecountfile = open(uniquecountfilename)
84 acceptedfile = open(acceptedfilename, "w")
87 if splicecountfilename != "none":
89 splicecountfile = open(splicecountfilename)
93 print "will replace gene models with %s" % extendGenome
95 print "will extend gene models with %s" % extendGenome
99 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
100 print "%s cached" % genome
102 hg = Genome(genome, inRAM=True)
104 if extendGenome != "":
105 hg.extendFeatures(extendGenome, replace=replaceModels)
107 RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache, reportCount=False)
108 uniqcount = RDS.getUniqsCount()
109 print "%d unique reads" % uniqcount
119 featuresDict = hg.getallGeneFeatures()
120 print "got featuresDict"
122 outfile = open(outfilename, "w")
124 for line in uniquecountfile:
125 fields = line.strip().split()
126 gid = fields[fieldID]
128 countDict[gid] = float(fields[-1])
130 gidToGeneDict[gid] = gene
132 uniquecountfile.close()
135 for line in splicecountfile:
136 fields = line.strip().split()
137 gid = fields[fieldID]
139 countDict[gid] += float(fields[-1])
144 splicecount += float(fields[-1])
146 splicecountfile.close()
148 for line in candidateLines:
152 fields = line.strip().split()
155 if gid not in gidList:
156 if gid not in farList:
158 gidToGeneDict[gid] = gene
160 if gid not in countDict:
163 countDict[gid] += float(fields[6])
165 if gid not in candidateDict:
166 candidateDict[gid] = []
168 candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
170 totalCount = (uniqcount + splicecount) / 1000000.
171 uniqScale = uniqcount / 1000000.
173 gene = gidToGeneDict[gid]
176 featureList = featuresDict[gid]
179 featureList = featuresDict[gene]
185 for (ftype, chrom, start, stop, sense) in featureList:
186 if (start, stop) not in newfeatureList:
187 newfeatureList.append((start, stop))
188 geneLength += (abs(start - stop) + 1.) / 1000.
192 elif geneLength > maxLength:
193 geneLength = maxLength
195 rpm = countDict[gid] / totalCount
196 rpkm = rpm / geneLength
197 if gid in candidateDict:
198 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
199 cratio = cCount / (cLength / 1000.)
200 cratio = (uniqScale * cratio) / totalCount
201 if 10. * cratio < rpkm:
204 countDict[gid] += cCount
205 geneLength += cLength / 1000.
206 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
208 rpm = countDict[gid] / totalCount
209 rpkm = rpm / geneLength
210 outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
213 gene = gidToGeneDict[gid]
215 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
216 geneLength += cLength / 1000.
221 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
222 cratio = cCount / (cLength / 1000.)
223 cratio = cratio / totalCount
224 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
226 rpm = countDict[gid] / totalCount
227 rpkm = rpm / geneLength
228 outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
237 uncacheGeneDB(genome)
240 if __name__ == "__main__":