makerdsfromblat.py

   1 #
   2 #  makerdsfromblat.py
   3 #  ENRAGE
   4 #
   5 #  Created by Ali Mortazavi on 12/7/08.
   6 #
   7
   8 try:
   9     import psyco
  10     psyco.full()
  11 except:
  12     pass
  13
  14 import sys, string, optparse
  15 from commoncode import readDataset, writeLog
  16
  17 verstring = "%prog: version 3.9"
  18 print verstring
  19
  20 def main(argv=None):
  21     if not argv:
  22         argv = sys.argv
  23
  24     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
  25
  26     parser = optparse.OptionParser(usage=usage)
  27     parser.add_option("--append", action="store_false", dest="init")
  28     parser.add_option("--index", action="store_true", dest="doIndex")
  29     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
  30     parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
  31     parser.add_option("--flag", action="store_true", dest="flagReads")
  32     parser.add_option("--strict", type="int", dest="minSpliceLength",
  33                       help="min required bp on each side of a splice")
  34     parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
  35     parser.add_option("--verbose", action="store_true", dest="verbose")
  36     parser.add_option("--cache", type="int", dest="cachePages")
  37     parser.add_option("--RNA", dest="geneDataFileName")
  38     parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="")
  39     (options, args) = parser.parse_args(argv[1:])
  40
  41     if len(args) < 3:
  42         print usage
  43         sys.exit(1)
  44
  45     label = args[0]
  46     filename = args[1]
  47     outdbname = args[2]
  48
  49     if options.geneDataFileName:
  50         dataType = "RNA"
  51     else:
  52         dataType = "DNA"
  53
  54     theFlag = ""
  55     if options.flagReads:
  56         theFlag = "blat"
  57
  58     propertyList = []
  59     for arg in args:
  60         if "::" in arg:
  61             (pname, pvalue) = arg.strip().split("::")
  62             propertyList.append((pname, pvalue))
  63
  64     makerdsfromblat(label, filename, outdbname, dataType, options.init,
  65                    options.doIndex, options.trimReadID, options. minSpliceLength,
  66                    options.forceRNA, theFlag, options.spliceOnly, options.verbose,
  67                    options.cachePages, options.geneDataFileName, propertyList)
  68
  69
  70 def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
  71                     doIndex=False,trimReadID=True, minSpliceLength=0,
  72                     forceRNA=False, theFlag="", spliceOnly=False,
  73                     verbose=False, cachePages=100000, geneDataFileName="",
  74                     propertyList=[]):
  75
  76     delimiter = "|"
  77     minIntron = 10
  78     maxBorder = 0
  79     index = 0
  80     insertSize = 100000
  81
  82     if forceRNA:
  83         print "forcing datatype to RNA"
  84         dataType = "RNA"
  85
  86     if dataType == "RNA":
  87         genedatafile = open(geneDataFileName)
  88
  89     writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
  90
  91     geneDict = {}
  92     mapDict = {}
  93     if dataType == "RNA" and not forceRNA:
  94         for line in genedatafile:
  95             fields = line.strip().split("\t")
  96             blockCount = int(fields[7])
  97             if blockCount < 2:
  98                 continue
  99
 100             uname = fields[0]
 101             chrom = fields[1]
 102             sense = fields[2]
 103             chromstarts = fields[8][:-1].split(",")
 104             chromstops = fields[9][:-1].split(",")
 105             exonLengths = []
 106             totalLength = 0
 107             for index in range(blockCount):
 108                 chromstarts[index] = int(chromstarts[index])
 109                 chromstops[index] = int(chromstops[index])
 110                 exonLengths.append(chromstops[index] - chromstarts[index])
 111                 totalLength += exonLengths[index]
 112
 113             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
 114             mapDict[uname] = []
 115
 116         genedatafile.close()
 117
 118     rds = readDataset(outdbname, init, dataType, verbose=True)
 119
 120     #check that our cacheSize is better than the dataset's default cache size
 121     defaultCacheSize = rds.getDefaultCacheSize()
 122     if cachePages > defaultCacheSize:
 123         if init:
 124             rds.setDBcache(cachePages, default=True)
 125         else:
 126             rds.setDBcache(cachePages)
 127
 128     if not init and doIndex:
 129         try:
 130             if rds.hasIndex():
 131                 rds.dropIndex()
 132         except:
 133             if verbose:
 134                 print "couldn't drop Index"
 135
 136
 137     if len(propertyList) > 0:
 138         rds.insertMetadata(propertyList)
 139
 140     # make some assumptions based on first read
 141     infile = open(filename, "r")
 142     for arg in range(6):
 143         line = infile.readline()
 144
 145     fields = line.split()
 146     readsize = int(fields[10])
 147     pairedTest = fields[9][-2:]
 148     paired = False
 149     if pairedTest in ["/1", "/2"]:
 150         print "assuming reads are paired"
 151         paired = True
 152
 153     print "read size: %d bp" % readsize
 154     if init:
 155         rds.insertMetadata([("readsize", readsize)])
 156         if paired:
 157             rds.insertMetadata([("paired", "True")])
 158
 159     infile.close()
 160     if "blat_mapped" not in rds.getMetadata():
 161         rds.insertMetadata([("blat_mapped", "True")])
 162
 163     minReadScore = readsize - readsize/25 - 1
 164     trim = -4
 165     if dataType == "RNA":
 166         maxBorder = readsize + trim
 167
 168     infile = open(filename, "r")
 169     prevID = ""
 170     readList = []
 171     uInsertList = []
 172     mInsertList = []
 173     sInsertList = []
 174     index = uIndex = mIndex = sIndex = lIndex = 0
 175     bestScore = 0
 176     # skip headers
 177     for arg in range(5):
 178         line = infile.readline()
 179
 180     for line in infile:
 181         lIndex += 1
 182         fields = line.strip().split()
 183         readID = fields[9]
 184         if trimReadID:
 185             readID = string.join(readID.split(":")[1:], ":")
 186
 187         if readID != prevID:
 188             newReadList = []
 189             if bestScore > minReadScore:
 190                 for readData in readList:
 191                     if readData[1] == bestScore:
 192                         newReadList.append(readData)
 193
 194             if trimReadID:
 195                 prevID = label + "-" + prevID
 196
 197             listlen = len(newReadList)
 198             if listlen == 1:
 199                 parts = int(newReadList[0][0])
 200                 if parts == 1 and not spliceOnly:
 201                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
 202                     stop = start + readsize
 203                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
 204                     uIndex += 1
 205                 elif forceRNA and parts == 2:
 206                     (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
 207                     startL = int(startList[0])
 208                     stopL = startL + int(lengthList[0])
 209                     startR = int(startList[1])
 210                     stopR = startR + int(lengthList[1])
 211                     if stopL + minIntron < startR:
 212                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
 213                         sIndex += 1
 214                 elif parts == 2:
 215                     print newReadList
 216                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
 217                     currentSplice = chrom
 218                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
 219                     if model not in geneDict:
 220                         print fields
 221                         continue
 222
 223                     (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
 224                     spliceID = int(spliceID)
 225                     rstart = int(start) - 2
 226                     lefthalf = maxBorder - rstart
 227                     if lefthalf < 1 or lefthalf > maxBorder:
 228                         continue
 229
 230                     righthalf = readsize - lefthalf
 231                     startL = int(regionStart)  + rstart
 232                     stopL = startL + lefthalf
 233                     startR = chromstarts[spliceID + 1]
 234                     stopR = chromstarts[spliceID + 1] + righthalf
 235                     if stopL + minIntron < startR:
 236                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
 237                         sIndex += 1
 238             elif listlen > 1 and not spliceOnly:
 239                 prevID = prevID + "::" + str(listlen)
 240                 mIndex += 1
 241                 # ignore multireads that can also map across splices
 242                 skip = False
 243                 for readData in newReadList:
 244                     if readData[0] > 1:
 245                         skip = True
 246
 247                 if not skip:
 248                     for (part, score, sense, chrom, start, mismatches) in newReadList:
 249                         stop = start + readsize
 250                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
 251             else:
 252                 prevID = readID
 253
 254             if index % insertSize == 0:
 255                 rds.insertUniqs(uInsertList)
 256                 rds.insertMulti(mInsertList)
 257                 uInsertList = []
 258                 mInsertList = []
 259                 if dataType == "RNA":
 260                     rds.insertSplices(sInsertList)
 261                     sInsertList = []
 262
 263                 print ".",
 264                 sys.stdout.flush()
 265
 266             # start processing new read
 267             readList = []
 268             prevID = readID
 269             bestScore = 0
 270             index += 1
 271
 272         # add the new read
 273         score = int(fields[0])
 274         sense = fields[8]
 275         chrom = fields[13]
 276         parts = int(fields[17])
 277         passStrict = True
 278         if parts > 1:
 279             lengthList = fields[18][:-1].split(",")
 280             startList = fields[20][:-1].split(",")
 281             listlen = len(lengthList)
 282             for lpos in range(listlen):
 283                 if int(lengthList[lpos]) < minSpliceLength:
 284                     passStrict = False
 285
 286                 # throw out deletions, for now
 287                 if lpos > 0:
 288                     if int(lengthList[lpos - 1]) == int(startList[lpos]):
 289                         passStrict = False
 290             pass
 291         else:
 292             start = int(fields[15])
 293
 294         if passStrict:
 295             if score > bestScore:
 296                 bestScore = score
 297
 298             mismatches = ""
 299             if int(fields[1]) > 0:
 300                 try:
 301                     mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
 302                 except:
 303                     mismatches = ""
 304
 305             if parts == 1:
 306                 readList.append((parts, score, sense, chrom, start, mismatches))
 307             else:
 308                 readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
 309
 310         if lIndex % 1000000 == 0:
 311             print "processed %d lines" % lIndex
 312
 313     print "%d lines processed" % lIndex
 314
 315     if len(uInsertList) > 0:
 316         rds.insertUniqs(uInsertList)
 317     if len(mInsertList) > 0:
 318         rds.insertMulti(mInsertList)
 319     if len(sInsertList) > 0:
 320         rds.insertSplices(sInsertList)
 321
 322     combString = "%d unique reads" % uIndex
 323     combString += "\t%d multi reads" % mIndex
 324     if dataType == "RNA":
 325         combString += "\t%d spliced reads" % sIndex
 326
 327     print
 328     print combString.replace("\t", "\n")
 329
 330     writeLog(outdbname + ".log", verstring, combString)
 331
 332     if doIndex:
 333         print "building index...."
 334         if cachePages > defaultCacheSize:
 335             rds.setDBcache(cachePages)
 336             rds.buildIndex(cachePages)
 337         else:
 338             rds.buildIndex(defaultCacheSize)
 339
 340
 341 def decodeMismatches(gString, rString, rsense):
 342
 343     output = []
 344     rlen = len(gString)
 345     partIndex = 0
 346     for rindex in xrange(rlen):
 347         if gString == ",":
 348             partIndex += 1
 349
 350         if gString[rindex] == rString[rindex]:
 351             continue
 352
 353         genNT = gString[rindex]
 354         readNT = rString[rindex]
 355         # for eland-compatibility, we are 1-based
 356         output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
 357
 358     return string.join(output, ",")
 359
 360
 361 if __name__ == "__main__":
 362     main(sys.argv)