regionintersects.py

   1 #
   2 #  regionintersects.py
   3 #  ENRAGE
   4 #
   5 try:
   6     import psyco
   7     psyco.full()
   8 except:
   9     pass
  10
  11 import sys, optparse
  12 from commoncode import readDataset, getMergedRegions, findPeak
  13
  14 print "%prog: version 3.0"
  15
  16 def main(argv=None):
  17     if not argv:
  18         argv = sys.argv
  19
  20     usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
  21
  22     parser = optparse.OptionParser(usage=usage)
  23     parser.add_option("--reject1", dest="rejectOneName")
  24     parser.add_option("--reject2", dest="rejectTwoName")
  25     parser.add_option("--union", action="store_true", dest="trackReject")
  26     parser.add_option("--cache", action="store_true", dest="doCache")
  27     parser.add_option("--raw", action="store_false", dest="normalize")
  28     parser.add_option("--verbose", action="store_true", dest="doVerbose")
  29     parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False,
  30                         doCache=False, normalize=True, doVerbose=False)
  31
  32     (options, args) = parser.parse_args(argv[1:])
  33
  34     if len(args) < 5:
  35         print usage
  36         sys.exit(1)
  37
  38     readOneName =  args[0]
  39     regionOneName = args[1]
  40     readTwoName = args[2]
  41     regionTwoName = args[3]
  42     outfilename = args[4]
  43
  44     regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
  45                      outfilename, options.rejectOneName, options.rejectTwoName,
  46                      options.trackReject, options.doCache, options.normalize,
  47                      options.doVerbose)
  48
  49
  50 def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
  51                      outfilename, rejectOneName=None, rejectTwoName=None,
  52                      trackReject=False, doCache=False, normalize=True, doVerbose=False):
  53
  54     mergedist=0
  55
  56     outfile = open(outfilename, "w")
  57
  58     doReject = False
  59     if rejectOneName is not None:
  60         trackReject = True
  61         doReject = True
  62         rejectOne = open(rejectOneName, "w")
  63
  64     if rejectTwoName is not None:
  65         trackReject = True
  66         doReject = True
  67         rejectTwo = open(rejectTwoName, "w")
  68
  69     oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
  70     twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
  71
  72     oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache)
  73     twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache)
  74
  75     if normalize:
  76         normalize1 = len(oneRDS) / 1000000.
  77         normalize2 = len(twoRDS) / 1000000.
  78     else:
  79         normalize1 = 1.
  80         normalize2 = 1.
  81
  82     commonRegions = 0
  83     oneRejectIndex = 0
  84     twoRejectIndex = 0
  85
  86     onePeaksDict = {}
  87     oneFoundDict = {}
  88
  89     numRegionsOne = 0
  90     numRegionsTwo = 0
  91     for rchrom in oneDict:
  92         numRegionsOne += len(oneDict[rchrom])
  93
  94     for rchrom in twoDict:
  95         numRegionsTwo += len(twoDict[rchrom])
  96
  97     outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
  98
  99     for rchrom in oneDict:
 100         if rchrom not in twoDict:
 101             continue
 102
 103         print rchrom
 104         rindex = 0
 105         rindex2 = 0
 106         fullchrom = "chr" + rchrom
 107         oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
 108         dictLen1 = len(oneReads[fullchrom])
 109         twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
 110         dictLen2 = len(twoReads[fullchrom])
 111         chrom = rchrom
 112         onePeaksDict[chrom] = []
 113         oneFoundDict[chrom] = []
 114         for (start, stop, length) in oneDict[chrom]:
 115             readList = []
 116             for localIndex in xrange(rindex, dictLen1):
 117                 read = oneReads[fullchrom][localIndex]
 118                 if read[0] < start:
 119                     rindex += 1
 120                 elif start <= read[0] <= stop:
 121                     readList.append(read)
 122                 else:
 123                     break
 124
 125             if len(readList) < 1:
 126                 continue
 127
 128             readList.sort()
 129
 130             (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True)
 131             onePeakScore = smoothArray[topPos[0]]
 132             onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1))
 133
 134         for (start, stop, length) in twoDict[chrom]:
 135             readList2 = []
 136             for localIndex in xrange(rindex2, dictLen2):
 137                 read = twoReads[fullchrom][localIndex]
 138                 if read[0] < start:
 139                     rindex2 += 1
 140                 elif start <= read[0] <= stop:
 141                     readList2.append(read)
 142                 else:
 143                     break
 144
 145             if len(readList2) < 1:
 146                 continue
 147
 148             readList2.sort()
 149             (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True)
 150             numHits /= normalize2
 151             twoIsCommon = False
 152             twoPeak = topPos[0] + start
 153             twoRadius = length/2
 154             twoPeakScore = smoothArray[topPos[0]] / normalize2
 155             for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
 156                 if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
 157                     if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict:
 158                         oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits))
 159
 160                     twoIsCommon = True
 161                     commonRegions += 1
 162                     outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore)
 163                     if doVerbose:
 164                         print outline
 165
 166                     outfile.write(outline + "\n")
 167
 168             if trackReject and not twoIsCommon:
 169                 twoRejectIndex += 1
 170                 outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore)
 171                 if doReject:
 172                     rejectTwo.write(outline + "\n")
 173                 else:
 174                     outfile.write(outline + "\n")
 175
 176                 if doVerbose:
 177                     print outline
 178
 179         if trackReject:
 180             for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
 181                 if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
 182                     oneRejectIndex += 1
 183                     outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore)
 184                     if doReject:
 185                         rejectOne.write(outline + "\n")
 186                     else:
 187                         outfile.write(outline + "\n")
 188
 189                     if doVerbose:
 190                         print outline
 191
 192     if trackReject:
 193         print "common: %d   one-only: %d   two-only: %d" % (commonRegions, oneRejectIndex, twoRejectIndex)
 194         outfile.write("#common: %d\tone-only: %d\ttwo-only: %d\n" % (commonRegions, oneRejectIndex, twoRejectIndex))
 195     else:
 196         print "common: %d" % commonRegions
 197         outfile.write("#common: %d\n" % commonRegions)
 198
 199     outfile.close()
 200
 201
 202 if __name__ == "__main__":
 203     main(sys.argv)