farPairs.py

   1 #
   2 #  farPairs.py
   3 #  ENRAGE
   4 #
   5 #  Created by Ali Mortazavi on 7/13/10.
   6 #
   7
   8 try:
   9     import psyco
  10     psyco.full()
  11 except:
  12     pass
  13
  14 import sys, time
  15 import optparse
  16 from commoncode import readDataset
  17
  18 print "%prog: version 1.3"
  19
  20
  21 def main(argv=None):
  22     if not argv:
  23         argv = sys.argv
  24
  25     usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
  26
  27     parser = optparse.OptionParser(usage=usage)
  28     parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
  29     parser.add_option("--cache", type="int", dest="cachePages")
  30     parser.add_option("--verbose", action="store_true", dest="doVerbose")
  31     parser.add_option("--minDist", type="int", dest="minDist")
  32     parser.add_option("--maxDist", type="int", dest="maxDist")
  33     parser.add_option("--minCount", type="int", dest="minCount")
  34     parser.add_option("--label", dest="label")
  35     parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None,
  36                         minDist=1000, maxDist=500000, minCount=2, label=None)
  37     (options, args) = parser.parse_args(argv[1:])
  38
  39     if len(args) < 3:
  40         print usage
  41         print "\tIs both slow and takes up large amount of RAM"
  42         sys.exit(1)
  43
  44     rdsfile = args[0]
  45     outfilename = args[1]
  46     outbedname = args[2]
  47
  48     farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
  49              options.cachePages, options.minDist, options.maxDist, options.minCount,
  50              options.label)
  51
  52
  53 def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
  54              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
  55
  56     doCache = False
  57     if cachePages is not None:
  58         doCache = True
  59     else:
  60         cachePages = 0
  61
  62     if label is None:
  63         label = rdsfile
  64
  65     RDS = readDataset(rdsfile, verbose=True, cache=doCache)
  66     rdsChromList = RDS.getChromosomes()
  67
  68     if doVerbose:
  69         print time.ctime()
  70
  71     total = 0
  72     outfile = open(outfilename, "w")
  73     outbed = open(outbedname, "w")
  74     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
  75
  76     readlen = RDS.getReadSize()
  77     flagDict = {}
  78     for chromosome in rdsChromList:
  79         if doNotProcessChromosome(chromosome):
  80             continue
  81
  82         print chromosome
  83         uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
  84         if doVerbose:
  85             print len(uniqDict), time.ctime()
  86
  87         for readID in uniqDict:
  88             readList = uniqDict[readID]
  89             if len(readList) == 2:
  90                 total += 1
  91                 (start1, flag1, pair1) = readList[0]
  92                 (start2, flag2, pair2) = readList[1]
  93
  94                 if flag1 != flag2:
  95                     dist = abs(start1 - start2)
  96                     startList = [start1, start2]
  97                     stopList = [start1 + readlen, start2 + readlen]
  98                     startList.sort()
  99                     stopList.sort()
 100                     if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
 101                         outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
 102                         outbed.write(outputLine)
 103                         if doVerbose:
 104                             print flag1, flag2, dist
 105
 106                         try:
 107                             flagDict[flag1].append((flag2, start1, start2))
 108                         except KeyError:
 109                             flagDict[flag1] = [(flag2, start1, start2)]
 110
 111                         try:
 112                             flagDict[flag2].append((flag1, start1, start2))
 113                         except KeyError:
 114                             flagDict[flag2] = [(flag2, start1, start2)]
 115
 116     print "%d connected regions" % len(flagDict)
 117
 118     for region in flagDict:
 119         flagDict[region].sort()
 120         regionConnections = {}
 121         for (region2, start1, start2) in flagDict[region]:
 122             try:
 123                 regionConnections[region2] += 1
 124             except KeyError:
 125                 regionConnections[region2] = 1
 126
 127         for region2 in regionConnections:
 128             if regionConnections[region2] >= minCount:
 129                 outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
 130                 if doVerbose:
 131                     print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
 132
 133     outfile.close()
 134     outbed.close()
 135     if doVerbose:
 136         print "finished: ", time.ctime()
 137
 138
 139 def doNotProcessChromosome(chrom):
 140     return chrom == "chrM"
 141
 142
 143 def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
 144     readSizes = "%d" % (stopList[0] - startList[0])
 145     readCoords = "0"
 146     leftStart = startList[0] - 1
 147     rightStop = stopList[-1]
 148     for index in range(1, numPieces):
 149         readSizes += ",%d" % (stopList[index] - startList[index] + 1)
 150         readCoords += ",%d" % (startList[index] - startList[0])
 151
 152     if rsense == "+":
 153         senseCode = plusSense
 154     else:
 155         senseCode = minusSense
 156
 157     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
 158     return outline
 159
 160
 161 if __name__ == "__main__":
 162     main(sys.argv)