makerdsfromblat.py

   1 #
   2 #  makerdsfromblat.py
   3 #  ENRAGE
   4 #
   5 #  Created by Ali Mortazavi on 12/7/08.
   6 #
   7
   8 try:
   9     import psyco
  10     psyco.full()
  11 except:
  12     pass
  13
  14 import sys
  15 import string
  16 import optparse
  17 from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  18 import ReadDataset
  19
  20 verstring = "makerdsfromblat: version 3.10"
  21 print verstring
  22
  23 NUM_HEADER_LINES = 5
  24
  25
  26 def main(argv=None):
  27     if not argv:
  28         argv = sys.argv
  29
  30     usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
  31
  32     parser = getParser(usage)
  33     (options, args) = parser.parse_args(argv[1:])
  34
  35     if len(args) < 3:
  36         print usage
  37         sys.exit(1)
  38
  39     label = args[0]
  40     filename = args[1]
  41     outdbname = args[2]
  42
  43     if options.geneDataFileName:
  44         dataType = "RNA"
  45     else:
  46         dataType = "DNA"
  47
  48     theFlag = ""
  49     if options.flagReads:
  50         theFlag = "blat"
  51
  52     propertyList = []
  53     for arg in args:
  54         if "::" in arg:
  55             (pname, pvalue) = arg.strip().split("::")
  56             propertyList.append((pname, pvalue))
  57
  58     makerdsfromblat(label, filename, outdbname, dataType, options.init,
  59                    options.doIndex, options.trimReadID, options. minSpliceLength,
  60                    options.forceRNA, theFlag, options.spliceOnly, options.verbose,
  61                    options.cachePages, options.geneDataFileName, propertyList)
  62
  63
  64 def getParser(usage):
  65     parser = optparse.OptionParser(usage=usage)
  66     parser.add_option("--append", action="store_false", dest="init")
  67     parser.add_option("--index", action="store_true", dest="doIndex")
  68     parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
  69     parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
  70     parser.add_option("--flag", action="store_true", dest="flagReads")
  71     parser.add_option("--strict", type="int", dest="minSpliceLength",
  72                       help="min required bp on each side of a splice")
  73     parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
  74     parser.add_option("--verbose", action="store_true", dest="verbose")
  75     parser.add_option("--cache", type="int", dest="cachePages")
  76     parser.add_option("--RNA", dest="geneDataFileName")
  77
  78     configParser = getConfigParser()
  79     section = "makerdsfromblat"
  80     init = getConfigBoolOption(configParser, section, "init", True)
  81     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
  82     trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
  83     minSpliceLength = getConfigIntOption(configParser, section, "minSpliceLength", 0)
  84     forceRNA = getConfigBoolOption(configParser, section, "forceRNA", False)
  85     flagReads = getConfigBoolOption(configParser, section, "flagReads", False)
  86     spliceOnly = getConfigBoolOption(configParser, section, "spliceOnly", False)
  87     verbose = getConfigBoolOption(configParser, section, "verbose", False)
  88     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
  89     geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", "")
  90
  91     parser.set_defaults(init=init, doIndex=doIndex, trimReadID=trimReadID, minSpliceLength=minSpliceLength, forceRNA=forceRNA,
  92                         flagReads=flagReads, spliceOnly=spliceOnly, verbose=verbose, cachePages=cachePages,
  93                         geneDataFileName=geneDataFileName)
  94
  95     return parser
  96
  97
  98 def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
  99                     doIndex=False,trimReadID=True, minSpliceLength=0,
 100                     forceRNA=False, theFlag="", spliceOnly=False,
 101                     verbose=False, cachePages=100000, geneDataFileName="",
 102                     propertyList=[]):
 103
 104     writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
 105     if forceRNA:
 106         print "forcing datatype to RNA"
 107         dataType = "RNA"
 108
 109     geneDict = {}
 110     mapDict = {}
 111     if dataType == "RNA" and not forceRNA:
 112         genedatafile = open(geneDataFileName)
 113         for line in genedatafile:
 114             fields = line.strip().split("\t")
 115             blockCount = int(fields[7])
 116             if blockCount < 2:
 117                 continue
 118
 119             uname = fields[0]
 120             chrom = fields[1]
 121             sense = fields[2]
 122             chromstarts = fields[8][:-1].split(",")
 123             chromstops = fields[9][:-1].split(",")
 124             exonLengths = []
 125             totalLength = 0
 126             for index in range(blockCount):
 127                 chromstarts[index] = int(chromstarts[index])
 128                 chromstops[index] = int(chromstops[index])
 129                 exonLengths.append(chromstops[index] - chromstarts[index])
 130                 totalLength += exonLengths[index]
 131
 132             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
 133             mapDict[uname] = []
 134
 135         genedatafile.close()
 136
 137     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
 138
 139     #check that our cacheSize is better than the dataset's default cache size
 140     defaultCacheSize = rds.getDefaultCacheSize()
 141     if cachePages > defaultCacheSize:
 142         if init:
 143             rds.setDBcache(cachePages, default=True)
 144         else:
 145             rds.setDBcache(cachePages)
 146
 147     if not init and doIndex:
 148         try:
 149             if rds.hasIndex():
 150                 rds.dropIndex()
 151         except:
 152             if verbose:
 153                 print "couldn't drop Index"
 154
 155
 156     if len(propertyList) > 0:
 157         rds.insertMetadata(propertyList)
 158
 159     # make some assumptions based on first read
 160     infile = open(filename, "r")
 161     for arg in range(NUM_HEADER_LINES):
 162         line = infile.readline()
 163
 164     line = infile.readline()
 165     fields = line.split()
 166     readsize = int(fields[10])
 167     pairedTest = fields[9][-2:]
 168     paired = False
 169     if pairedTest in ["/1", "/2"]:
 170         print "assuming reads are paired"
 171         paired = True
 172
 173     print "read size: %d bp" % readsize
 174     if init:
 175         rds.insertMetadata([("readsize", readsize)])
 176         if paired:
 177             rds.insertMetadata([("paired", "True")])
 178
 179     infile.close()
 180     if "blat_mapped" not in rds.getMetadata():
 181         rds.insertMetadata([("blat_mapped", "True")])
 182
 183     minReadScore = readsize - readsize/25 - 1
 184     maxBorder = 0
 185     if dataType == "RNA":
 186         trim = -4
 187         maxBorder = readsize + trim
 188
 189     infile = open(filename, "r")
 190     prevID = ""
 191     readList = []
 192     uInsertList = []
 193     mInsertList = []
 194     sInsertList = []
 195     index = uIndex = mIndex = sIndex = lIndex = 0
 196     bestScore = 0
 197     # skip headers
 198     for arg in range(NUM_HEADER_LINES):
 199         line = infile.readline()
 200
 201     insertSize = 100000
 202     delimiter = "|"
 203     minIntron = 10
 204     for line in infile:
 205         lIndex += 1
 206         fields = line.strip().split()
 207         readID = fields[9]
 208         if trimReadID:
 209             readID = string.join(readID.split(":")[1:], ":")
 210
 211         if readID != prevID:
 212             newReadList = []
 213             if bestScore > minReadScore:
 214                 for readData in readList:
 215                     if readData[1] == bestScore:
 216                         newReadList.append(readData)
 217
 218             if trimReadID:
 219                 prevID = label + "-" + prevID
 220
 221             listlen = len(newReadList)
 222             if listlen == 1:
 223                 parts = int(newReadList[0][0])
 224                 if parts == 1 and not spliceOnly:
 225                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
 226                     stop = start + readsize
 227                     uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
 228                     uIndex += 1
 229                 elif forceRNA and parts == 2:
 230                     (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
 231                     startL = int(startList[0])
 232                     stopL = startL + int(lengthList[0])
 233                     startR = int(startList[1])
 234                     stopR = startR + int(lengthList[1])
 235                     if stopL + minIntron < startR:
 236                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
 237                         sIndex += 1
 238                 elif parts == 2:
 239                     print newReadList
 240                     (part, score, sense, chrom, start, mismatches) = newReadList[0]
 241                     currentSplice = chrom
 242                     (model, spliceID, regionStart) = currentSplice.split(delimiter)
 243                     if model not in geneDict:
 244                         print fields
 245                         continue
 246
 247                     (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
 248                     spliceID = int(spliceID)
 249                     rstart = int(start) - 2
 250                     lefthalf = maxBorder - rstart
 251                     if lefthalf < 1 or lefthalf > maxBorder:
 252                         continue
 253
 254                     righthalf = readsize - lefthalf
 255                     startL = int(regionStart)  + rstart
 256                     stopL = startL + lefthalf
 257                     startR = chromstarts[spliceID + 1]
 258                     stopR = chromstarts[spliceID + 1] + righthalf
 259                     if stopL + minIntron < startR:
 260                         sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
 261                         sIndex += 1
 262             elif listlen > 1 and not spliceOnly:
 263                 prevID = prevID + "::" + str(listlen)
 264                 mIndex += 1
 265                 # ignore multireads that can also map across splices
 266                 skip = False
 267                 for readData in newReadList:
 268                     if readData[0] > 1:
 269                         skip = True
 270
 271                 if not skip:
 272                     for (part, score, sense, chrom, start, mismatches) in newReadList:
 273                         stop = start + readsize
 274                         mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
 275             else:
 276                 prevID = readID
 277
 278             if index % insertSize == 0:
 279                 rds.insertUniqs(uInsertList)
 280                 rds.insertMulti(mInsertList)
 281                 uInsertList = []
 282                 mInsertList = []
 283                 if dataType == "RNA":
 284                     rds.insertSplices(sInsertList)
 285                     sInsertList = []
 286
 287                 print ".",
 288                 sys.stdout.flush()
 289
 290             # start processing new read
 291             readList = []
 292             prevID = readID
 293             bestScore = 0
 294             index += 1
 295
 296         # add the new read
 297         score = int(fields[0])
 298         sense = fields[8]
 299         chrom = fields[13]
 300         parts = int(fields[17])
 301         passStrict = True
 302         if parts > 1:
 303             lengthList = fields[18][:-1].split(",")
 304             startList = fields[20][:-1].split(",")
 305             listlen = len(lengthList)
 306             for lpos in range(listlen):
 307                 if int(lengthList[lpos]) < minSpliceLength:
 308                     passStrict = False
 309
 310                 # throw out deletions, for now
 311                 if lpos > 0:
 312                     if int(lengthList[lpos - 1]) == int(startList[lpos]):
 313                         passStrict = False
 314             pass
 315         else:
 316             start = int(fields[15])
 317
 318         if passStrict:
 319             if score > bestScore:
 320                 bestScore = score
 321
 322             mismatches = ""
 323             if int(fields[1]) > 0:
 324                 try:
 325                     mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
 326                 except:
 327                     mismatches = ""
 328
 329             if parts == 1:
 330                 readList.append((parts, score, sense, chrom, start, mismatches))
 331             else:
 332                 readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
 333
 334         if lIndex % 1000000 == 0:
 335             print "processed %d lines" % lIndex
 336
 337     print "%d lines processed" % lIndex
 338
 339     if len(uInsertList) > 0:
 340         rds.insertUniqs(uInsertList)
 341     if len(mInsertList) > 0:
 342         rds.insertMulti(mInsertList)
 343     if len(sInsertList) > 0:
 344         rds.insertSplices(sInsertList)
 345
 346     combString = "%d unique reads" % uIndex
 347     combString += "\t%d multi reads" % mIndex
 348     if dataType == "RNA":
 349         combString += "\t%d spliced reads" % sIndex
 350
 351     print
 352     print combString.replace("\t", "\n")
 353
 354     writeLog(outdbname + ".log", verstring, combString)
 355
 356     if doIndex:
 357         print "building index...."
 358         if cachePages > defaultCacheSize:
 359             rds.setDBcache(cachePages)
 360             rds.buildIndex(cachePages)
 361         else:
 362             rds.buildIndex(defaultCacheSize)
 363
 364
 365 def decodeMismatches(gString, rString, rsense):
 366
 367     output = []
 368     rlen = len(gString)
 369     partIndex = 0
 370     for rindex in xrange(rlen):
 371         if gString == ",":
 372             partIndex += 1
 373
 374         if gString[rindex] == rString[rindex]:
 375             continue
 376
 377         genNT = gString[rindex]
 378         readNT = rString[rindex]
 379         # for eland-compatibility, we are 1-based
 380         output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
 381
 382     return string.join(output, ",")
 383
 384
 385 if __name__ == "__main__":
 386     main(sys.argv)