14 from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
16 from cistematic.genomes import Genome
18 print "getfasta: version 3.5"
25 usage = "usage: python %s genome regionfile outfilename [options]"
27 parser = getParser(usage)
28 (options, args) = parser.parse_args(argv[1:])
38 getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
39 options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
40 options.doCache, options.doCompact)
44 parser = optparse.OptionParser(usage=usage)
45 parser.add_option("--seqradius", type="int", dest="seqsize")
46 parser.add_option("--minreads", type="int", dest="minHitThresh")
47 parser.add_option("--returnTop", type="int", dest="topRegions")
48 parser.add_option("--maxsize", type="int", dest="maxsize")
49 parser.add_option("--usepeak", action="store_true", dest="usePeaks")
50 parser.add_option("--dataset", dest="hitfile")
51 parser.add_option("--cache", action="store_true", dest="doCache")
52 parser.add_option("--compact", action="store_true", dest="doCompact")
54 configParser = getConfigParser()
56 seqsize = getConfigIntOption(configParser, section, "seqsize", 50)
57 minHitThresh = getConfigIntOption(configParser, section, "minHitThresh", -1)
58 topRegions = getConfigIntOption(configParser, section, "topRegions", 0)
59 maxsize = getConfigIntOption(configParser, section, "maxsize", 300000000)
60 usePeaks = getConfigBoolOption(configParser, section, "usePeaks", False)
61 hitfile = getConfigOption(configParser, section, "hitFile", None)
62 doCache = getConfigBoolOption(configParser, section, "doCache", False)
63 doCompact = getConfigBoolOption(configParser, section, "doCompact", False)
65 parser.set_defaults(seqsize=seqsize, minHitThresh=minHitThresh, topRegions=topRegions, maxsize=maxsize,
66 usePeaks=usePeaks, hitfile=hitfile, doCache=doCache, doCompact=doCompact)
70 def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
71 maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
73 if hitfile is not None:
75 print "ignoring dataset and relying on peak data"
80 mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
81 chromField=0, compact=True, keepPeak=usePeaks,
84 mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
85 keepPeak=usePeaks, returnTop=topRegions)
88 ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
90 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
91 ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
93 ncregions = getDefaultRegion(mergedRegions, maxsize)
95 writeFastaFile(ncregions, genome, outfilename, seqsize)
98 def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
100 outfile = open(outfilename, "w")
101 for chrom in ncregions:
102 for regionDict in ncregions[chrom]:
103 rstart = regionDict["start"]
104 rlen = regionDict["length"]
105 topPos = regionDict["topPos"]
107 newrstart = rstart + topPos[0] - seqsize
108 newrlen = 2 * seqsize + 1
113 seq2 = hg.sequence(chrom, newrstart, newrlen)
114 outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2))
119 def getDefaultRegion(regionDict, maxsize):
121 for chromosome in regionDict:
122 ncregions[chromosome] = []
124 for chromosome in regionDict:
125 print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
126 for region in regionDict[chromosome]:
128 length = region.length
131 print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
134 resultDict = {"start": start,
138 ncregions[chromosome].append(resultDict)
143 def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
146 for chromosome in regionDict:
147 ncregions[chromosome] = []
149 for chromosome in regionDict:
150 print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
151 for region in regionDict[chromosome]:
153 length = region.length
156 print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
159 topPos = region.peakPos - start
160 if region.peakHeight > minHitThresh:
161 resultDict = {"start": start,
165 ncregions[chromosome].append(resultDict)
170 def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
172 readlen = hitRDS.getReadSize()
175 for chromosome in regionDict:
176 ncregions[chromosome] = []
178 for chromosome in regionDict:
179 print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
180 for region in regionDict[chromosome]:
183 length = region.length
186 print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize)
189 thechrom = "chr%s" % chromosome
191 hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop)
192 print "hitDict length: %d", len(hitDict[thechrom])
193 peak = findPeak(hitDict[thechrom], start, length, readlen)
194 if peak.numHits > minHitThresh:
195 resultDict = {"start": start,
197 "topPos": peak.topPos
199 ncregions[chromosome].append(resultDict)
204 if __name__ == "__main__":