getfasta.py

   1 #
   2 #  getfasta.py
   3 #  ENRAGE
   4 #
   5
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 import sys, optparse
  13 from commoncode import readDataset, getMergedRegions, findPeak
  14 from cistematic.genomes import Genome
  15
  16 print "%s: version 3.4" % sys.argv[0]
  17
  18
  19 def main(argv=None):
  20     if not argv:
  21         argv = sys.argv
  22
  23     usage = "usage: python %s genome regionfile outfilename [options]"
  24
  25     parser = optparse.OptionParser(usage=usage)
  26     parser.add_option("--seqradius", type="int", dest="seqsize")
  27     parser.add_option("--minreads", type="int", dest="minHitThresh")
  28     parser.add_option("--returnTop", type="int", dest="topRegions")
  29     parser.add_option("--maxsize", type="int", dest="maxsize")
  30     parser.add_option("--usepeak", action="store_true", dest="usePeaks")
  31     parser.add_option("--dataset", dest="hitfile")
  32     parser.add_option("--cache", action="store_true", dest="doCache")
  33     parser.add_option("--compact", action="store_true", dest="doCompact")
  34     parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
  35                         usePeaks=False, hitfile=None, doCache=False, doCompact=False)
  36
  37     (options, args) = parser.parse_args(argv[1:])
  38
  39     if len(args) < 3:
  40         print usage
  41         sys.exit(1)
  42
  43     genome = args[0]
  44     regionfile = args[1]
  45     outfilename = args[2]
  46
  47     getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
  48              options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
  49              options.doCache, options.doCompact)
  50
  51
  52 def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
  53              maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
  54     doDataset = False
  55     if hitfile is not None:
  56         if usePeaks:
  57             print "ignoring dataset and relying on peak data"
  58         else:
  59             doDataset = True
  60
  61     if doCompact:
  62         mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
  63                                       chromField=0, compact=True, keepPeak=usePeaks,
  64                                       returnTop=topRegions)
  65     else:
  66         mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
  67                                       keepPeak=usePeaks, returnTop=topRegions)
  68
  69     if usePeaks:
  70         ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
  71     elif doDataset:
  72         hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
  73         ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
  74     else:
  75         ncregions = getDefaultRegion(mergedRegions, maxsize)
  76
  77     writeFastaFile(ncregions, genome, outfilename, seqsize)
  78
  79
  80 def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
  81     hg = Genome(genome)
  82     outfile = open(outfilename, "w")
  83     for chrom in ncregions:
  84         for regionDict in ncregions[chrom]:
  85             rstart = regionDict["start"]
  86             rlen = regionDict["length"]
  87             topPos = regionDict["topPos"]
  88             if topPos[0] >= 0:
  89                 newrstart = rstart + topPos[0] - seqsize
  90                 newrlen = 2 * seqsize + 1
  91             else:
  92                 newrstart = rstart
  93                 newrlen = rlen
  94
  95             seq2 = hg.sequence(chrom, newrstart, newrlen)
  96             outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2))
  97
  98     outfile.close()
  99
 100
 101 def getDefaultRegion(regionDict, maxsize):
 102     ncregions = {}
 103     for chrom in regionDict:
 104         ncregions[chrom] = []
 105
 106     for achrom in regionDict:
 107         print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
 108         for region in regionDict[achrom]:
 109             (rstart, rstop, rlen) = region
 110
 111             if rlen > maxsize:
 112                 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
 113                 continue
 114
 115             resultDict = {"start": rstart,
 116                           "length": rlen,
 117                           "topPos": [-1]
 118             }
 119             ncregions[achrom].append(resultDict)
 120
 121     return ncregions
 122
 123
 124 def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
 125
 126     ncregions = {}
 127     for chrom in regionDict:
 128         ncregions[chrom] = []
 129
 130     for achrom in regionDict:
 131         print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
 132         for region in regionDict[achrom]:
 133             (rstart, rstop, rlen, peakPos, peakHeight) = region
 134
 135             if rlen > maxsize:
 136                 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
 137                 continue
 138
 139             topPos = peakPos - rstart
 140             if peakHeight > minHitThresh:
 141                 resultDict = {"start": rstart,
 142                               "length": rlen,
 143                               "topPos": [topPos]
 144                 }
 145                 ncregions[achrom].append(resultDict)
 146
 147     return ncregions
 148
 149
 150 def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
 151
 152     readlen = hitRDS.getReadSize()
 153
 154     ncregions = {}
 155     for chrom in regionDict:
 156         ncregions[chrom] = []
 157
 158     for achrom in regionDict:
 159         print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
 160         for region in regionDict[achrom]:
 161             (rstart, rstop, rlen) = region
 162
 163             if rlen > maxsize:
 164                 print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
 165                 continue
 166
 167             thechrom = "chr%s" % achrom
 168             print "."
 169             hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
 170             print "hitDict length: %d", len(hitDict[thechrom])
 171             (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
 172             if numHits > minHitThresh:
 173                 resultDict = {"start": rstart,
 174                               "length": rlen,
 175                               "topPos": topPos
 176                 }
 177                 ncregions[achrom].append(resultDict)
 178
 179     return ncregions
 180
 181
 182 if __name__ == "__main__":
 183     main(sys.argv)