first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / getfasta.py
1 #
2 #  getfasta.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     pass
11
12 import sys
13 import optparse
14 from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
15 import ReadDataset
16 from cistematic.genomes import Genome
17
18 print "getfasta: version 3.5"
19
20
21 def main(argv=None):
22     if not argv:
23         argv = sys.argv
24
25     usage = "usage: python %s genome regionfile outfilename [options]"
26
27     parser = getParser(usage)
28     (options, args) = parser.parse_args(argv[1:])
29
30     if len(args) < 3:
31         print usage
32         sys.exit(1)
33
34     genome = args[0]
35     regionfile = args[1]
36     outfilename = args[2]
37
38     getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
39              options.topRegions, options.maxsize, options.usePeaks, options.hitfile,
40              options.doCache, options.doCompact)
41
42
43 def getParser(usage):
44     parser = optparse.OptionParser(usage=usage)
45     parser.add_option("--seqradius", type="int", dest="seqsize")
46     parser.add_option("--minreads", type="int", dest="minHitThresh")
47     parser.add_option("--returnTop", type="int", dest="topRegions")
48     parser.add_option("--maxsize", type="int", dest="maxsize")
49     parser.add_option("--usepeak", action="store_true", dest="usePeaks")
50     parser.add_option("--dataset", dest="hitfile")
51     parser.add_option("--cache", action="store_true", dest="doCache")
52     parser.add_option("--compact", action="store_true", dest="doCompact")
53
54     configParser = getConfigParser()
55     section = "getfasta"
56     seqsize = getConfigIntOption(configParser, section, "seqsize", 50)
57     minHitThresh = getConfigIntOption(configParser, section, "minHitThresh", -1)
58     topRegions = getConfigIntOption(configParser, section, "topRegions", 0)
59     maxsize = getConfigIntOption(configParser, section, "maxsize", 300000000)
60     usePeaks = getConfigBoolOption(configParser, section, "usePeaks", False)
61     hitfile = getConfigOption(configParser, section, "hitFile", None)
62     doCache = getConfigBoolOption(configParser, section, "doCache", False)
63     doCompact = getConfigBoolOption(configParser, section, "doCompact", False)
64
65     parser.set_defaults(seqsize=seqsize, minHitThresh=minHitThresh, topRegions=topRegions, maxsize=maxsize,
66                         usePeaks=usePeaks, hitfile=hitfile, doCache=doCache, doCompact=doCompact)
67
68     return parser
69
70 def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
71              maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
72     doDataset = False
73     if hitfile is not None:
74         if usePeaks:
75             print "ignoring dataset and relying on peak data"
76         else:
77             doDataset = True
78
79     if doCompact:
80         mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
81                                       chromField=0, compact=True, keepPeak=usePeaks,
82                                       returnTop=topRegions)
83     else:
84         mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
85                                       keepPeak=usePeaks, returnTop=topRegions)
86
87     if usePeaks:
88         ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
89     elif doDataset:
90         hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
91         ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
92     else:
93         ncregions = getDefaultRegion(mergedRegions, maxsize)
94
95     writeFastaFile(ncregions, genome, outfilename, seqsize)
96
97
98 def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
99     hg = Genome(genome)
100     outfile = open(outfilename, "w")
101     for chrom in ncregions:
102         for regionDict in ncregions[chrom]:
103             rstart = regionDict["start"]
104             rlen = regionDict["length"]
105             topPos = regionDict["topPos"]
106             if topPos[0] >= 0:
107                 newrstart = rstart + topPos[0] - seqsize
108                 newrlen = 2 * seqsize + 1
109             else:
110                 newrstart = rstart
111                 newrlen = rlen
112
113             seq2 = hg.sequence(chrom, newrstart, newrlen)
114             outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2))
115
116     outfile.close()
117
118
119 def getDefaultRegion(regionDict, maxsize):
120     ncregions = {}
121     for chromosome in regionDict:
122         ncregions[chromosome] = []
123
124     for chromosome in regionDict:
125         print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
126         for region in regionDict[chromosome]:
127             start = region.start
128             length = region.length
129
130             if length > maxsize:
131                 print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
132                 continue
133
134             resultDict = {"start": start,
135                           "length": length,
136                           "topPos": [-1]
137             }
138             ncregions[chromosome].append(resultDict)
139
140     return ncregions
141
142
143 def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
144
145     ncregions = {}
146     for chromosome in regionDict:
147         ncregions[chromosome] = []
148
149     for chromosome in regionDict:
150         print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
151         for region in regionDict[chromosome]:
152             start = region.start
153             length = region.length
154
155             if length > maxsize:
156                 print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
157                 continue
158
159             topPos = region.peakPos - start
160             if region.peakHeight > minHitThresh:
161                 resultDict = {"start": start,
162                               "length": length,
163                               "topPos": [topPos]
164                 }
165                 ncregions[chromosome].append(resultDict)
166
167     return ncregions
168
169
170 def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
171
172     readlen = hitRDS.getReadSize()
173
174     ncregions = {}
175     for chromosome in regionDict:
176         ncregions[chromosome] = []
177
178     for chromosome in regionDict:
179         print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
180         for region in regionDict[chromosome]:
181             start = region.start
182             stop = region.stop
183             length = region.length
184
185             if length > maxsize:
186                 print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize)
187                 continue
188
189             thechrom = "chr%s" % chromosome
190             print "."
191             hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop)
192             print "hitDict length: %d", len(hitDict[thechrom])
193             peak = findPeak(hitDict[thechrom], start, length, readlen)
194             if peak.numHits > minHitThresh:
195                 resultDict = {"start": start,
196                               "length": length,
197                               "topPos": peak.topPos
198                 }
199                 ncregions[chromosome].append(resultDict)
200
201     return ncregions
202
203
204 if __name__ == "__main__":
205     main(sys.argv)