first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneDownstreamBins.py
1 #
2 #  geneDownstreamBins.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 # originally from version 1.3 of geneDnaDownstreamCounts.py
13 import sys
14 import optparse
15 import ReadDataset
16 from cistematic.genomes import Genome
17 from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
18
19 print "geneDownstreamBins: version 2.1"
20
21 def main(argv=None):
22     if not argv:
23         argv = sys.argv
24
25     usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
26
27     parser = makeParser(usage)
28     (options, args) = parser.parse_args(argv[1:])
29
30     if len(args) < 3:
31         print usage
32         sys.exit(1)
33
34     genome = args[0]
35     hitfile =  args[1]
36     outfilename = args[2]
37
38     geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
39
40
41 def makeParser(usage=""):
42     parser = optparse.OptionParser(usage=usage)
43     parser.add_option("--max", type="int", dest="standardMinDist",
44                       help="maximum region in bp")
45
46     configParser = getConfigParser()
47     section = "geneDownstreamBins"
48     standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
49
50     parser.set_defaults(standardMinDist=standardMinDist)
51
52     return parser
53
54
55 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
56
57     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
58     normalizationFactor = 1.0
59     if normalize:
60         hitDictSize = len(hitRDS)
61         normalizationFactor = hitDictSize / 1000000.
62
63     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
64     geneinfoDict = getGeneInfoDict(genome, cache=True)
65     hg = Genome(genome)
66     featuresDict = hg.getallGeneFeatures()
67     outfile = open(outfilename, "w")
68     gidList = hg.allGIDs()
69     gidList.sort()
70     for gid in gidList:
71         try:
72             featuresList = featuresDict[gid]
73         except KeyError:
74             print gid
75
76         try:
77             binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
78         except ErangeError:
79             continue
80
81         tagCount *= normalizationFactor
82         print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
83         outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
84         for binAmount in binList:
85             outfile.write("\t%.2f" % binAmount)
86
87         outfile.write("\n")
88
89     outfile.close()
90
91
92 def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
93
94     symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
95     geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
96     if geneLength < standardMinDist:
97         raise ErangeError("gene length less than minimum")
98
99     binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
100     if tagCount < 2:
101         raise ErangeError("tag count less than minimum")
102
103     return binList, symbol, geneLength, tagCount
104
105
106 def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
107     if len(featureList) == 0:
108         raise ErangeError("no features found")
109
110     symbol = "LOC%s" % gid
111     geneinfo = ""
112     try:
113         geneinfo = geneinfoDict[gid]
114         symbol = geneinfo[0][0]
115     except KeyError:
116         print gid
117
118     newfeatureList = []
119     for (ftype, chrom, start, stop, sense) in featureList:
120         if (start, stop) not in newfeatureList:
121             newfeatureList.append((start, stop))
122
123     if len(newfeatureList) < 1:
124         raise ErangeError("no features found")
125
126     if chrom not in chromosomeList:
127         raise ErangeError("chromosome not found in reads")
128
129     newfeatureList.sort()
130
131     return symbol, newfeatureList, sense, chrom
132
133
134 def getGeneStats(genome, gid, minDistance, featureList, sense):
135     geneLength = minDistance
136     if sense == "F":
137         nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
138         geneStart = featureList[-1][1]
139     else:
140         nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
141         geneStart = max(featureList[0][0] - geneLength, 0)
142
143     if nextGene < geneLength * 2:
144         geneLength = nextGene / 2
145
146     geneLength = max(geneLength, 1)
147
148     return geneStart, geneLength
149
150
151 def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
152     tagCount = 0
153     bins = 10
154     standardMinThresh = standardMinDist / bins
155     binList = [0.] * bins
156     for read in readList:
157         tagStart = read["start"]
158         if tagStart >= geneLength:
159             break
160
161         tagStart -= geneStart
162         weight = read["weight"]
163         if tagStart > 0:
164             tagCount += weight
165             if sense == "F":
166                 # we are relying on python's integer division quirk
167                 binID = tagStart / standardMinThresh 
168             else:
169                 rdist = geneLength - tagStart
170                 binID = rdist / standardMinThresh
171
172             binList[binID] += weight
173
174     return binList, tagCount
175
176 if __name__ == "__main__":
177     main(sys.argv)