snapshot of 4.0a development. initial git repo commit
[erange.git] / geneDownstreamBins.py
1 #
2 #  geneDownstreamBins.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 # originally from version 1.3 of geneDnaDownstreamCounts.py
13 import sys, optparse
14 from commoncode import readDataset
15 from cistematic.genomes import Genome
16 from cistematic.core.geneinfo import geneinfoDB
17
18 print "%prog: version 2.0"
19
20 def main(argv=None):
21     if not argv:
22         argv = sys.argv
23
24     usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
25
26     parser = optparse.OptionParser(usage=usage)
27     parser.add_option("--max", type="int", dest="standardMinDist",
28                       help="maximum region in bp")
29     parser.set_defaults(standardMinDist=3000)
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(args) < 3:
33         print usage
34         sys.exit(1)
35
36     genome = args[0]
37     hitfile =  args[1]
38     outfilename = args[2]
39
40     geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
41
42
43 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
44     bins = 10
45     standardMinThresh = standardMinDist / bins
46
47     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
48     normalizationFactor = 1.0
49     if normalize:
50         hitDictSize = len(hitRDS)
51         normalizationFactor = hitDictSize / 1000000.
52
53     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
54
55     hg = Genome(genome)
56     idb = geneinfoDB(cache=True)
57
58     geneinfoDict = idb.getallGeneInfo(genome)
59     featuresDict = hg.getallGeneFeatures()
60
61     outfile = open(outfilename, "w")
62
63     gidList = hg.allGIDs()
64     gidList.sort()
65     for gid in gidList:
66         symbol = "LOC" + gid
67         geneinfo = ""
68         featureList = []
69         try:
70             geneinfo = geneinfoDict[gid]
71             featureList = featuresDict[gid]
72             symbol = geneinfo[0][0]
73         except:
74             print gid
75
76         if len(featureList) == 0:
77             continue
78
79         newfeatureList = []
80         for (ftype, chrom, start, stop, fsense) in featureList:
81             if (start, stop) not in newfeatureList:
82                 newfeatureList.append((start, stop))
83
84         if chrom not in hitDict:
85             continue
86
87         newfeatureList.sort()
88         if len(newfeatureList) < 1:
89             continue
90
91         glen = standardMinDist
92         if fsense == "F":
93             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
94             if nextGene < glen * 2:
95                 glen = nextGene / 2
96
97             if glen < 1:
98                 glen = 1
99
100             gstart = newfeatureList[-1][1]
101         else:
102             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
103             if nextGene < glen * 2:
104                 glen = nextGene / 2
105
106             if glen < 1:
107                 glen = 1
108
109             gstart = newfeatureList[0][0] - glen
110             if gstart < 0:
111                 gstart = 0
112
113         tagCount = 0
114         if glen < standardMinDist:
115             continue
116
117         binList = [0.] * bins
118         for (tagStart, sense, weight) in hitDict[chrom]:
119             tagStart -= gstart
120             if tagStart >= glen:
121                 break
122
123             if tagStart > 0:
124                 tagCount += weight
125                 if fsense == "F":
126                     # we are relying on python's integer division quirk
127                     binID = tagStart / standardMinThresh 
128                     binList[binID] += weight
129                 else:
130                     rdist = glen - tagStart
131                     binID = rdist / standardMinThresh 
132                     binList[binID] += weight
133
134         if tagCount < 2:
135             continue
136
137         tagCount *= normalizationFactor
138         print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
139         outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
140         for binAmount in binList:
141             outfile.write("\t%.2f" % binAmount)
142
143         outfile.write("\n")
144
145     outfile.close()
146
147
148 if __name__ == "__main__":
149     main(sys.argv)