makerdsfromblat.py

   1 #
   2 #  makerdsfromblat.py
   3 #  ENRAGE
   4 #
   5 #  Created by Ali Mortazavi on 12/7/08.
   6 #
   7
   8 try:
   9     import psyco
  10     psyco.full()
  11 except:
  12     pass
  13
  14 import sys
  15 import string
  16 import optparse
  17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  18 import ReadDataset
  19
  20 verstring = "makerdsfromblat: version 3.10"
  21 print verstring
  22
  23 def main(argv=None):
  24     if not argv:
  25         argv = sys.argv
  26
  27     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
  28
  29     parser = getParser(usage)
  30     (options, args) = parser.parse_args(argv[1:])
  31
  32     if len(args) < 3:
  33         print usage
  34         sys.exit(1)
  35
  36     label = args[0]
  37     filename = args[1]
  38     outdbname = args[2]
  39
  40     if options.geneDataFileName:
  41         dataType = "RNA"
  42     else:
  43         dataType = "DNA"
  44
  45     theFlag = ""
  46     if options.flagReads:
  47         theFlag = "blat"
  48
  49     propertyList = []
  50     for arg in args:
  51         if "::" in arg:
  52             (pname, pvalue) = arg.strip().split("::")
  53             propertyList.append((pname, pvalue))
  54
  55     makerdsfromblat(label, filename, outdbname, dataType, options.init,
  56                    options.doIndex, options.trimReadID, options. minSpliceLength,
  57                    options.forceRNA, theFlag, options.spliceOnly, options.verbose,
  58                    options.cachePages, options.geneDataFileName, propertyList)
  59
  60
  61 def getParser(usage):
  62     parser = optparse.OptionParser(usage=usage)
  63     parser.add_option("--append", action="store_false", dest="init")
  64     parser.add_option("--index", action="store_true", dest="doIndex")
  65     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
  66     parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
  67     parser.add_option("--flag", action="store_true", dest="flagReads")
  68     parser.add_option("--strict", type="int", dest="minSpliceLength",
  69                       help="min required bp on each side of a splice")
  70     parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
  71     parser.add_option("--verbose", action="store_true", dest="verbose")
  72     parser.add_option("--cache", type="int", dest="cachePages")
  73     parser.add_option("--RNA", dest="geneDataFileName")
  74
  75     configParser = getConfigParser()
  76     section = "makerdsfromblat"
  77     init = getConfigBoolOption(configParser, section, "init", True)
  78     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
  79     trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
  80     minSpliceLength = getConfigIntOption(configParser, section, "minSpliceLength", 0)
  81     forceRNA = getConfigBoolOption(configParser, section, "forceRNA", False)
  82     flagReads = getConfigBoolOption(configParser, section, "flagReads", False)
  83     spliceOnly = getConfigBoolOption(configParser, section, "spliceOnly", False)
  84     verbose = getConfigBoolOption(configParser, section, "verbose", False)
  85     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
  86     geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", "")
  87
  88     parser.set_defaults(init=init, doIndex=doIndex, trimReadID=trimReadID, minSpliceLength=minSpliceLength, forceRNA=forceRNA,
  89                         flagReads=flagReads, spliceOnly=spliceOnly, verbose=verbose, cachePages=cachePages,
  90                         geneDataFileName=geneDataFileName)
  91
  92     return parser
  93
  94
  95 def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
  96                     doIndex=False,trimReadID=True, minSpliceLength=0,
  97                     forceRNA=False, theFlag="", spliceOnly=False,
  98                     verbose=False, cachePages=100000, geneDataFileName="",
  99                     propertyList=[]):
 100
 101     delimiter = "|"
 102     minIntron = 10
 103     maxBorder = 0
 104     index = 0
 105     insertSize = 100000
 106
 107     if forceRNA:
 108         print "forcing datatype to RNA"
 109         dataType = "RNA"
 110
 111     if dataType == "RNA":
 112         genedatafile = open(geneDataFileName)
 113
 114     writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
 115
 116     geneDict = {}
 117     mapDict = {}
 118     if dataType == "RNA" and not forceRNA:
 119         for line in genedatafile:
 120             fields = line.strip().split("\t")
 121             blockCount = int(fields[7])
 122             if blockCount < 2:
 123                 continue
 124
 125             uname = fields[0]
 126             chrom = fields[1]
 127             sense = fields[2]
 128             chromstarts = fields[8][:-1].split(",")
 129             chromstops = fields[9][:-1].split(",")
 130             exonLengths = []
 131             totalLength = 0
 132             for index in range(blockCount):
 133                 chromstarts[index] = int(chromstarts[index])
 134                 chromstops[index] = int(chromstops[index])
 135                 exonLengths.append(chromstops[index] - chromstarts[index])
 136                 totalLength += exonLengths[index]
 137
 138             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
 139             mapDict[uname] = []
 140
 141         genedatafile.close()
 142
 143     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
 144
 145     #check that our cacheSize is better than the dataset's default cache size
 146     defaultCacheSize = rds.getDefaultCacheSize()
 147     if cachePages > defaultCacheSize:
 148         if init:
 149             rds.setDBcache(cachePages, default=True)
 150         else:
 151             rds.setDBcache(cachePages)
 152
 153     if not init and doIndex:
 154         try:
 155             if rds.hasIndex():
 156                 rds.dropIndex()
 157         except:
 158             if verbose:
 159                 print "couldn't drop Index"
 160
 161
 162     if len(propertyList) > 0:
 163         rds.insertMetadata(propertyList)
 164
 165     # make some assumptions based on first read
 166     infile = open(filename, "r")
 167     for arg in range(6):
 168         line = infile.readline()
 169
 170     fields = line.split()
 171     readsize = int(fields[10])
 172     pairedTest = fields[9][-2:]
 173     paired = False
 174     if pairedTest in ["/1", "/2"]:
 175         print "assuming reads are paired"
 176         paired = True
 177
 178     print "read size: %d bp" % readsize
 179     if init:
 180         rds.insertMetadata([("readsize", readsize)])
 181         if paired:
 182             rds.insertMetadata([("paired", "True")])
 183
 184     infile.close()
 185     if "blat_mapped" not in rds.getMetadata():
 186         rds.insertMetadata([("blat_mapped", "True")])
 187
 188     minReadScore = readsize - readsize/25 - 1
 189     trim = -4
 190     if dataType == "RNA":
 191         maxBorder = readsize + trim
 192
 193     infile = open(filename, "r")
 194     prevID = ""
 195     readList = []
 196     uInsertList = []
 197     mInsertList = []
 198     sInsertList = []
 199     index = uIndex = mIndex = sIndex = lIndex = 0
 200     bestScore = 0
 201     # skip headers
 202     for arg in range(5):
 203         line = infile.readline()
 204
 205     for line in infile:
 206         lIndex += 1
 207         fields = line.strip().split()
 208         readID = fields[9]
 209         if trimReadID:
 210             readID = string.join(readID.split(":")[1:], ":")
 211
 212         if readID != prevID:
 213             newReadList = []
 214             if bestScore > minReadScore:
 215                 for readData in readList:
 216                     if readData[1] == bestScore:
 217                         newReadList.append(readData)
 218
 219             if trimReadID:
 220                 prevID = label + "-" + prevID
 221
 222             listlen = len(newReadList)
 223             if listlen == 1:
 224                 parts = int(newReadList[0][0])
 225                 if parts == 1 and not spliceOnly:
 226                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
 227                     stop = start + readsize
 228                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
 229                     uIndex += 1
 230                 elif forceRNA and parts == 2:
 231                     (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
 232                     startL = int(startList[0])
 233                     stopL = startL + int(lengthList[0])
 234                     startR = int(startList[1])
 235                     stopR = startR + int(lengthList[1])
 236                     if stopL + minIntron < startR:
 237                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
 238                         sIndex += 1
 239                 elif parts == 2:
 240                     print newReadList
 241                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
 242                     currentSplice = chrom
 243                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
 244                     if model not in geneDict:
 245                         print fields
 246                         continue
 247
 248                     (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
 249                     spliceID = int(spliceID)
 250                     rstart = int(start) - 2
 251                     lefthalf = maxBorder - rstart
 252                     if lefthalf < 1 or lefthalf > maxBorder:
 253                         continue
 254
 255                     righthalf = readsize - lefthalf
 256                     startL = int(regionStart)  + rstart
 257                     stopL = startL + lefthalf
 258                     startR = chromstarts[spliceID + 1]
 259                     stopR = chromstarts[spliceID + 1] + righthalf
 260                     if stopL + minIntron < startR:
 261                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
 262                         sIndex += 1
 263             elif listlen > 1 and not spliceOnly:
 264                 prevID = prevID + "::" + str(listlen)
 265                 mIndex += 1
 266                 # ignore multireads that can also map across splices
 267                 skip = False
 268                 for readData in newReadList:
 269                     if readData[0] > 1:
 270                         skip = True
 271
 272                 if not skip:
 273                     for (part, score, sense, chrom, start, mismatches) in newReadList:
 274                         stop = start + readsize
 275                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
 276             else:
 277                 prevID = readID
 278
 279             if index % insertSize == 0:
 280                 rds.insertUniqs(uInsertList)
 281                 rds.insertMulti(mInsertList)
 282                 uInsertList = []
 283                 mInsertList = []
 284                 if dataType == "RNA":
 285                     rds.insertSplices(sInsertList)
 286                     sInsertList = []
 287
 288                 print ".",
 289                 sys.stdout.flush()
 290
 291             # start processing new read
 292             readList = []
 293             prevID = readID
 294             bestScore = 0
 295             index += 1
 296
 297         # add the new read
 298         score = int(fields[0])
 299         sense = fields[8]
 300         chrom = fields[13]
 301         parts = int(fields[17])
 302         passStrict = True
 303         if parts > 1:
 304             lengthList = fields[18][:-1].split(",")
 305             startList = fields[20][:-1].split(",")
 306             listlen = len(lengthList)
 307             for lpos in range(listlen):
 308                 if int(lengthList[lpos]) < minSpliceLength:
 309                     passStrict = False
 310
 311                 # throw out deletions, for now
 312                 if lpos > 0:
 313                     if int(lengthList[lpos - 1]) == int(startList[lpos]):
 314                         passStrict = False
 315             pass
 316         else:
 317             start = int(fields[15])
 318
 319         if passStrict:
 320             if score > bestScore:
 321                 bestScore = score
 322
 323             mismatches = ""
 324             if int(fields[1]) > 0:
 325                 try:
 326                     mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
 327                 except:
 328                     mismatches = ""
 329
 330             if parts == 1:
 331                 readList.append((parts, score, sense, chrom, start, mismatches))
 332             else:
 333                 readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
 334
 335         if lIndex % 1000000 == 0:
 336             print "processed %d lines" % lIndex
 337
 338     print "%d lines processed" % lIndex
 339
 340     if len(uInsertList) > 0:
 341         rds.insertUniqs(uInsertList)
 342     if len(mInsertList) > 0:
 343         rds.insertMulti(mInsertList)
 344     if len(sInsertList) > 0:
 345         rds.insertSplices(sInsertList)
 346
 347     combString = "%d unique reads" % uIndex
 348     combString += "\t%d multi reads" % mIndex
 349     if dataType == "RNA":
 350         combString += "\t%d spliced reads" % sIndex
 351
 352     print
 353     print combString.replace("\t", "\n")
 354
 355     writeLog(outdbname + ".log", verstring, combString)
 356
 357     if doIndex:
 358         print "building index...."
 359         if cachePages > defaultCacheSize:
 360             rds.setDBcache(cachePages)
 361             rds.buildIndex(cachePages)
 362         else:
 363             rds.buildIndex(defaultCacheSize)
 364
 365
 366 def decodeMismatches(gString, rString, rsense):
 367
 368     output = []
 369     rlen = len(gString)
 370     partIndex = 0
 371     for rindex in xrange(rlen):
 372         if gString == ",":
 373             partIndex += 1
 374
 375         if gString[rindex] == rString[rindex]:
 376             continue
 377
 378         genNT = gString[rindex]
 379         readNT = rString[rindex]
 380         # for eland-compatibility, we are 1-based
 381         output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
 382
 383     return string.join(output, ",")
 384
 385
 386 if __name__ == "__main__":
 387     main(sys.argv)