snapshot of 4.0a development. initial git repo commit
[erange.git] / getfasta.py
1 #
2 #  getfasta.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys, optparse
13 from commoncode import readDataset, getMergedRegions, findPeak
14 from cistematic.genomes import Genome
15
16 print "%s: version 3.4" % sys.argv[0]
17
18
19 def main(argv=None):
20     if not argv:
21         argv = sys.argv
22
23     usage = "usage: python %s genome regionfile outfilename [options]"
24
25     parser = optparse.OptionParser(usage=usage)
26     parser.add_option("--seqradius", type="int", dest="seqsize")
27     parser.add_option("--minreads", type="int", dest="minHitThresh")
28     parser.add_option("--returnTop", type="int", dest="topRegions")
29     parser.add_option("--maxsize", type="int", dest="maxsize")
30     parser.add_option("--usepeak", action="store_true", dest="usePeaks")
31     parser.add_option("--dataset", dest="hitfile")
32     parser.add_option("--cache", action="store_true", dest="doCache")
33     parser.add_option("--compact", action="store_true", dest="doCompact")
34     parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
35                         usePeaks=False, hitfile=None, doCache=False, doCompact=False)
36
37     (options, args) = parser.parse_args(argv[1:])
38
39     if len(args) < 3:
40         print usage
41         sys.exit(1)
42
43     genome = args[0]
44     regionfile = args[1]
45     outfilename = args[2]
46
47     getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
48              options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
49              options.doCache, options.doCompact)
50
51
52 def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
53              maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
54     doDataset = False
55     if hitfile is not None:
56         if usePeaks:
57             print "ignoring dataset and relying on peak data"
58         else:
59             doDataset = True
60
61     if doCompact:
62         mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
63                                       chromField=0, compact=True, keepPeak=usePeaks,
64                                       returnTop=topRegions)
65     else:
66         mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
67                                       keepPeak=usePeaks, returnTop=topRegions)
68
69     if usePeaks:
70         ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
71     elif doDataset:
72         hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
73         ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
74     else:
75         ncregions = getDefaultRegion(mergedRegions, maxsize)
76
77     writeFastaFile(ncregions, genome, outfilename, seqsize)
78
79
80 def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
81     hg = Genome(genome)
82     outfile = open(outfilename, "w")
83     for chrom in ncregions:
84         for regionDict in ncregions[chrom]:
85             rstart = regionDict["start"]
86             rlen = regionDict["length"]
87             topPos = regionDict["topPos"]
88             if topPos[0] >= 0:
89                 newrstart = rstart + topPos[0] - seqsize
90                 newrlen = 2 * seqsize + 1
91             else:
92                 newrstart = rstart
93                 newrlen = rlen
94
95             seq2 = hg.sequence(chrom, newrstart, newrlen)
96             outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2))
97
98     outfile.close()
99
100
101 def getDefaultRegion(regionDict, maxsize):
102     ncregions = {}
103     for chrom in regionDict:
104         ncregions[chrom] = []
105
106     for achrom in regionDict:
107         print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
108         for region in regionDict[achrom]:
109             (rstart, rstop, rlen) = region
110
111             if rlen > maxsize:
112                 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
113                 continue
114
115             resultDict = {"start": rstart,
116                           "length": rlen,
117                           "topPos": [-1]
118             }
119             ncregions[achrom].append(resultDict)
120
121     return ncregions
122
123
124 def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
125
126     ncregions = {}
127     for chrom in regionDict:
128         ncregions[chrom] = []
129
130     for achrom in regionDict:
131         print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
132         for region in regionDict[achrom]:
133             (rstart, rstop, rlen, peakPos, peakHeight) = region
134
135             if rlen > maxsize:
136                 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
137                 continue
138
139             topPos = peakPos - rstart
140             if peakHeight > minHitThresh:
141                 resultDict = {"start": rstart,
142                               "length": rlen,
143                               "topPos": [topPos]
144                 }
145                 ncregions[achrom].append(resultDict)
146
147     return ncregions
148
149
150 def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
151
152     readlen = hitRDS.getReadSize()
153
154     ncregions = {}
155     for chrom in regionDict:
156         ncregions[chrom] = []
157
158     for achrom in regionDict:
159         print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
160         for region in regionDict[achrom]:
161             (rstart, rstop, rlen) = region
162
163             if rlen > maxsize:
164                 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
165                 continue
166
167             thechrom = "chr%s" % achrom
168             print "."
169             hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
170             print "hitDict length: %d", len(hitDict[thechrom])
171             (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
172             if numHits > minHitThresh:
173                 resultDict = {"start": rstart,
174                               "length": rlen,
175                               "topPos": topPos
176                 }
177                 ncregions[achrom].append(resultDict)
178
179     return ncregions
180
181
182 if __name__ == "__main__":
183     main(sys.argv)