13 from commoncode import readDataset, getMergedRegions, findPeak
14 from cistematic.genomes import Genome
16 print "%s: version 3.4" % sys.argv[0]
23 usage = "usage: python %s genome regionfile outfilename [options]"
25 parser = optparse.OptionParser(usage=usage)
26 parser.add_option("--seqradius", type="int", dest="seqsize")
27 parser.add_option("--minreads", type="int", dest="minHitThresh")
28 parser.add_option("--returnTop", type="int", dest="topRegions")
29 parser.add_option("--maxsize", type="int", dest="maxsize")
30 parser.add_option("--usepeak", action="store_true", dest="usePeaks")
31 parser.add_option("--dataset", dest="hitfile")
32 parser.add_option("--cache", action="store_true", dest="doCache")
33 parser.add_option("--compact", action="store_true", dest="doCompact")
34 parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
35 usePeaks=False, hitfile=None, doCache=False, doCompact=False)
37 (options, args) = parser.parse_args(argv[1:])
47 getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
48 options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
49 options.doCache, options.doCompact)
52 def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
53 maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
55 if hitfile is not None:
57 print "ignoring dataset and relying on peak data"
62 mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
63 chromField=0, compact=True, keepPeak=usePeaks,
66 mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
67 keepPeak=usePeaks, returnTop=topRegions)
70 ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
72 hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
73 ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
75 ncregions = getDefaultRegion(mergedRegions, maxsize)
77 writeFastaFile(ncregions, genome, outfilename, seqsize)
80 def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
82 outfile = open(outfilename, "w")
83 for chrom in ncregions:
84 for regionDict in ncregions[chrom]:
85 rstart = regionDict["start"]
86 rlen = regionDict["length"]
87 topPos = regionDict["topPos"]
89 newrstart = rstart + topPos[0] - seqsize
90 newrlen = 2 * seqsize + 1
95 seq2 = hg.sequence(chrom, newrstart, newrlen)
96 outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2))
101 def getDefaultRegion(regionDict, maxsize):
103 for chrom in regionDict:
104 ncregions[chrom] = []
106 for achrom in regionDict:
107 print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
108 for region in regionDict[achrom]:
109 (rstart, rstop, rlen) = region
112 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
115 resultDict = {"start": rstart,
119 ncregions[achrom].append(resultDict)
124 def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
127 for chrom in regionDict:
128 ncregions[chrom] = []
130 for achrom in regionDict:
131 print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
132 for region in regionDict[achrom]:
133 (rstart, rstop, rlen, peakPos, peakHeight) = region
136 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
139 topPos = peakPos - rstart
140 if peakHeight > minHitThresh:
141 resultDict = {"start": rstart,
145 ncregions[achrom].append(resultDict)
150 def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
152 readlen = hitRDS.getReadSize()
155 for chrom in regionDict:
156 ncregions[chrom] = []
158 for achrom in regionDict:
159 print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
160 for region in regionDict[achrom]:
161 (rstart, rstop, rlen) = region
164 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
167 thechrom = "chr%s" % achrom
169 hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
170 print "hitDict length: %d", len(hitDict[thechrom])
171 (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
172 if numHits > minHitThresh:
173 resultDict = {"start": rstart,
177 ncregions[achrom].append(resultDict)
182 if __name__ == "__main__":