distalPairs.py

   1 #
   2 #  distalPairs.py
   3 #  ENRAGE
   4 #
   5 #  Created by Ali Mortazavi on 10/14/08.
   6 #
   7
   8
   9 try:
  10     import psyco
  11     psyco.full()
  12 except:
  13     pass
  14
  15 from commoncode import readDataset
  16 import sys, time, optparse
  17
  18
  19 def main(argv=None):
  20     if not argv:
  21         argv = sys.argv
  22
  23     print "%prog: version 3.3"
  24     print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
  25     usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
  26
  27     parser = optparse.OptionParser(usage=usage)
  28     parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
  29     parser.add_option("--splices", action="store_true", dest="doSplices")
  30     parser.add_option("--verbose", action="store_true", dest="doVerbose")
  31     parser.add_option("--maxDist", type="int", dest="maxDist")
  32     parser.add_option("--cache", type="int", dest="cachePages")
  33     parser.set_defaults(sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None)
  34     (options, args) = parser.parse_args(argv[1:])
  35
  36     if len(args) < 3:
  37         print usage
  38         sys.exit(1)
  39
  40     minDist = int(args[0])
  41     rdsfile = args[1]
  42     outfilename = args[2]
  43
  44     distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
  45
  46
  47 def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
  48     if cachePages is not None:
  49         doCache = True
  50     else:
  51         doCache = False
  52         cachePages = -1
  53
  54     RDS = readDataset(rdsfile, verbose = True, cache=doCache)
  55     if not RDS.hasIndex():
  56         print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
  57         sys.exit(1)
  58
  59     if cachePages > RDS.getDefaultCacheSize():
  60         RDS.setDBcache(cachePages)
  61
  62     print time.ctime()
  63
  64     if doSplices:
  65         print "getting splices"
  66         splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
  67         print "got splices"
  68
  69     print "getting uniq reads"
  70     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
  71     print "got uniqs"
  72
  73     if doSplices:
  74         for readID in splicesDict:
  75             theRead = splicesDict[readID]
  76             read0 = theRead[0]
  77             del read0[1]
  78             try:
  79                 uniqDict[readID].append(read0)
  80             except:
  81                 if len(theRead) == 4:
  82                     read2 = theRead[2]
  83                     del read2[1]
  84                     uniqDict[readID] = [read0,read2]
  85
  86     if doVerbose:
  87         print len(uniqDict), time.ctime()
  88
  89     outfile = open(outfilename,"w")
  90
  91     diffChrom = 0
  92     distal = 0
  93     total = 0
  94     for readID in uniqDict:
  95         readList = uniqDict[readID]
  96         if len(readList) == 2:
  97             total += 1
  98             (start1, sense1, chrom1, pair1) = readList[0]
  99             (start2, sense2, chrom2, pair2) = readList[1]
 100
 101             if chrom1 != chrom2:
 102                 diffChrom += 1
 103                 if sameChromOnly:
 104                     continue
 105                 else:
 106                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
 107                     outfile.write(outline + "\n")
 108                     if doVerbose:
 109                         print diffChrom, outline
 110             else:
 111                 dist = abs(start1 - start2)
 112
 113                 if minDist < dist < maxDist:
 114                     distal += 1
 115                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
 116                     outfile.write(outline + "\n")
 117                     if doVerbose:
 118                         print distal, outline
 119
 120     outfile.write("#distal: %d\tdiffChrom: %d\tpossible: %d\n" % (distal, diffChrom, total))
 121     total = float(total)
 122     if total < 1:
 123         total = 1.
 124
 125     outfile.write("#distal %2.2f pct\tdiffChrom %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total)))
 126     outfile.close()
 127     print "distal: %d\tdiffChrom: %d\tpossible: %d" % (distal, diffChrom, int(total))
 128     print "distal: %2.2f pct\tdiffChrom: %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total))
 129     print time.ctime()
 130
 131
 132 if __name__ == "__main__":
 133     main(sys.argv)