import optparse
import ReadDataset
from cistematic.genomes import Genome
-from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
print "geneDownstreamBins: version 2.1"
def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
- bins = 10
- standardMinThresh = standardMinDist / bins
hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
normalizationFactor = 1.0
geneinfoDict = getGeneInfoDict(genome, cache=True)
hg = Genome(genome)
featuresDict = hg.getallGeneFeatures()
-
outfile = open(outfilename, "w")
-
gidList = hg.allGIDs()
gidList.sort()
for gid in gidList:
- symbol = "LOC" + gid
- geneinfo = ""
- featureList = []
try:
- geneinfo = geneinfoDict[gid]
- featureList = featuresDict[gid]
- symbol = geneinfo[0][0]
- except:
+ featuresList = featuresDict[gid]
+ except KeyError:
print gid
- if len(featureList) == 0:
+ try:
+ binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
+ except ErangeError:
continue
- newfeatureList = []
- for (ftype, chrom, start, stop, fsense) in featureList:
- if (start, stop) not in newfeatureList:
- newfeatureList.append((start, stop))
+ tagCount *= normalizationFactor
+ print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
+ outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
+ for binAmount in binList:
+ outfile.write("\t%.2f" % binAmount)
- if chrom not in hitDict:
- continue
+ outfile.write("\n")
- newfeatureList.sort()
- if len(newfeatureList) < 1:
- continue
+ outfile.close()
- glen = standardMinDist
- if fsense == "F":
- nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
- if nextGene < glen * 2:
- glen = nextGene / 2
- if glen < 1:
- glen = 1
+def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
- gstart = newfeatureList[-1][1]
- else:
- nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
- if nextGene < glen * 2:
- glen = nextGene / 2
+ symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
+ geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
+ if geneLength < standardMinDist:
+ raise ErangeError("gene length less than minimum")
- if glen < 1:
- glen = 1
+ binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
+ if tagCount < 2:
+ raise ErangeError("tag count less than minimum")
- gstart = newfeatureList[0][0] - glen
- if gstart < 0:
- gstart = 0
+ return binList, symbol, geneLength, tagCount
- tagCount = 0
- if glen < standardMinDist:
- continue
- binList = [0.] * bins
- for read in hitDict[chrom]:
- tagStart = read["start"]
- weight = read["weight"]
- tagStart -= gstart
- if tagStart >= glen:
- break
-
- if tagStart > 0:
- tagCount += weight
- if fsense == "F":
- # we are relying on python's integer division quirk
- binID = tagStart / standardMinThresh
- binList[binID] += weight
- else:
- rdist = glen - tagStart
- binID = rdist / standardMinThresh
- binList[binID] += weight
-
- if tagCount < 2:
- continue
+def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
+ if len(featureList) == 0:
+ raise ErangeError("no features found")
- tagCount *= normalizationFactor
- print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
- outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
- for binAmount in binList:
- outfile.write("\t%.2f" % binAmount)
+ symbol = "LOC%s" % gid
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[gid]
+ symbol = geneinfo[0][0]
+ except KeyError:
+ print gid
- outfile.write("\n")
+ newfeatureList = []
+ for (ftype, chrom, start, stop, sense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+
+ if len(newfeatureList) < 1:
+ raise ErangeError("no features found")
+
+ if chrom not in chromosomeList:
+ raise ErangeError("chromosome not found in reads")
+
+ newfeatureList.sort()
+
+ return symbol, newfeatureList, sense, chrom
- outfile.close()
+def getGeneStats(genome, gid, minDistance, featureList, sense):
+ geneLength = minDistance
+ if sense == "F":
+ nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
+ geneStart = featureList[-1][1]
+ else:
+ nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
+ geneStart = max(featureList[0][0] - geneLength, 0)
+
+ if nextGene < geneLength * 2:
+ geneLength = nextGene / 2
+
+ geneLength = max(geneLength, 1)
+
+ return geneStart, geneLength
+
+
+def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
+ tagCount = 0
+ bins = 10
+ standardMinThresh = standardMinDist / bins
+ binList = [0.] * bins
+ for read in readList:
+ tagStart = read["start"]
+ if tagStart >= geneLength:
+ break
+
+ tagStart -= geneStart
+ weight = read["weight"]
+ if tagStart > 0:
+ tagCount += weight
+ if sense == "F":
+ # we are relying on python's integer division quirk
+ binID = tagStart / standardMinThresh
+ else:
+ rdist = geneLength - tagStart
+ binID = rdist / standardMinThresh
+
+ binList[binID] += weight
+
+ return binList, tagCount
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file