altSpliceCounts.py

   1 try:
   2     import psyco
   3     psyco.full()
   4 except:
   5     print "psyco not running"
   6
   7 print "altSpliceCounts: version 3.7"
   8
   9 import sys
  10 import optparse
  11 import ReadDataset
  12 from commoncode import getConfigParser, getConfigOption
  13
  14 def main(argv=None):
  15     if not argv:
  16         argv = sys.argv
  17
  18     usage = "usage: python %s rdsfile outfilename [--cache pages]"
  19
  20     parser = makeParser(usage)
  21     (options, args) = parser.parse_args(argv[1:])
  22
  23     if len(args) < 2:
  24         print usage
  25         sys.exit(1)
  26
  27     hitfile =  args[0]
  28     outfilename = args[1]
  29
  30     if options.numCachePages is not None:
  31         doCache = True
  32         cachePages = options.numCachePages
  33     else:
  34         doCache = False
  35         cachePages = 100000
  36
  37     altSpliceCounts(hitfile, outfilename, doCache, cachePages)
  38
  39
  40 def makeParser(usage=""):
  41     parser = optparse.OptionParser(usage=usage)
  42     parser.add_option("--cache", type="int", dest="numCachePages",
  43                       help="number of cache pages to use [default: 100000]")
  44
  45     configParser = getConfigParser()
  46     section = "altSpliceCounts"
  47     numCachePages = getConfigOption(configParser, section, "numCachePages", None)
  48
  49     parser.set_defaults(numCachePages=numCachePages)
  50
  51     return parser
  52
  53
  54 def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
  55     startDict = {}
  56     stopDict = {}
  57     resultDict = {}
  58
  59     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
  60     if cachePages > hitRDS.getDefaultCacheSize():
  61         hitRDS.setDBcache(cachePages)
  62
  63     readlen = hitRDS.getReadSize()
  64     hitDict = hitRDS.getSplicesDict(noSense=True)
  65     outfile = open(outfilename, "w")
  66
  67     for chrom in hitDict:
  68         startDict[chrom] = []
  69         stopDict[chrom] = []
  70         resultDict[chrom] = []
  71
  72     index = 0
  73     for chrom in hitDict:
  74         for read in hitDict[chrom]:
  75             tagStart = read["startL"]
  76             tagStop = read["stopR"]
  77             index += 1
  78             length = tagStop - tagStart
  79             if length < readlen + 5:
  80                 continue
  81
  82             startDict[chrom].append((tagStart, length))
  83             stopDict[chrom].append((tagStop, length))
  84
  85         startDict[chrom].sort()
  86         stopDict[chrom].sort()
  87
  88     spliceEvent = 0
  89     altSpliceEvent = 0
  90     alternative = 1
  91     for chrom in startDict:
  92         firstIndex = 0
  93         maxIndex = len(startDict[chrom])
  94         while firstIndex < maxIndex:
  95             (fstart, flen) = startDict[chrom][firstIndex]
  96             (start, length) = (fstart, flen)
  97             secondIndex = firstIndex
  98             secondLengths = []
  99             while (start - fstart) < readlen:
 100                 if secondIndex >= maxIndex:
 101                     break
 102
 103                 (start, length) = startDict[chrom][secondIndex]
 104                 if (start - fstart) < readlen and abs(length - flen) > readlen:
 105                     line =  (chrom, fstart, fstart + flen, chrom, start, start + length)
 106                     alreadySeen = False
 107                     for slength in secondLengths:
 108                         if abs(slength - length) < readlen:
 109                             alreadySeen = True
 110
 111                     if len(resultDict[chrom]) == 0:
 112                         resultDict[chrom].append(line)
 113                     elif line != resultDict[chrom][-1] and not alreadySeen:
 114                         resultDict[chrom].append(line)
 115                         secondLengths.append(length)
 116                         altSpliceEvent += 1
 117                         spliceEvent += 1
 118
 119                 secondIndex += 1
 120
 121             firstIndex = secondIndex
 122             spliceEvent += 1
 123
 124         firstIndex = 0
 125         maxIndex = len(stopDict[chrom])
 126         while firstIndex < maxIndex:
 127             (fstop, flen) = stopDict[chrom][firstIndex]
 128             (stop, length) = (fstop, flen)
 129             secondIndex = firstIndex
 130             secondLengths = []
 131             while (stop - fstop) < readlen:
 132                 if secondIndex >= maxIndex:
 133                     break
 134                 (stop, length) = stopDict[chrom][secondIndex]
 135                 if (stop - fstop) < readlen and abs(length - flen) > readlen:
 136                     line = (chrom, fstop - flen, fstop, chrom, stop - length, stop)
 137                     alreadySeen = False
 138                     for slength in secondLengths:
 139                         if abs(slength - length) < readlen:
 140                             alreadySeen = True
 141
 142                     if len(resultDict[chrom]) == 0:
 143                         resultDict[chrom].append(line)
 144
 145                     if line != resultDict[chrom][-1] and not alreadySeen:
 146                         resultDict[chrom].append(line)
 147                         secondLengths.append(length)
 148                         altSpliceEvent += 1
 149                         spliceEvent += 1
 150
 151                 secondIndex += 1
 152
 153             firstIndex = secondIndex
 154             spliceEvent += 1
 155
 156         resultDict[chrom].sort()
 157         for line in resultDict[chrom]:
 158             outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n"  % line)
 159             alternative += 1
 160
 161         print chrom, maxIndex, spliceEvent, altSpliceEvent
 162
 163     print spliceEvent, altSpliceEvent
 164     outfile.close()
 165
 166 if __name__ == "__main__":
 167     main(sys.argv)