makerdsfromeland2.py

   1 #
   2 #  makerdsfromeland2.py
   3 #  ENRAGE
   4 #
   5 try:
   6     import psyco
   7     psyco.full()
   8 except:
   9     pass
  10
  11 import sys
  12 import string
  13 import optparse
  14 import ReadDataset
  15 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  16
  17 def main(argv=None):
  18     if not argv:
  19         argv = sys.argv
  20
  21     verstring = "makerdsfromeland2: version 3.5"
  22     print verstring
  23
  24     usage = "usage:  %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
  25             \ninput reads must be sorted to properly record multireads"
  26
  27     parser = getParser(usage)
  28     (options, args) = parser.parse_args(argv[1:])
  29
  30     if len(args) < 3:
  31         print usage
  32         sys.exit(1)
  33
  34     label = args[0]
  35     filename = args[1]
  36     outdbname = args[2]
  37
  38     delimiter = '|'
  39     if options.useOldDelimiter:
  40         delimiter = ':'
  41
  42     paired = False
  43     pairID = '1'
  44     if options.pairID is not None:
  45         paired = True
  46         if options.pairID not in ['1','2']:
  47             print 'pairID value must be 1 or 2'
  48             sys.exit(-1)
  49
  50         print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID)
  51
  52     dataType = 'DNA'
  53     if options.geneDataFileName is not None:
  54         dataType = 'RNA'
  55
  56     makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init,
  57                       options.pairID, dataType, options.geneDataFileName, options.cachePages,
  58                       options.maxLines, options.extended, options.verbose)
  59
  60
  61 def getParser(usage):
  62     parser = optparse.OptionParser(usage=usage)
  63     parser.add_option("--append", action="store_false", dest="init",
  64                       help="append to existing rds file [default: create new]")
  65     parser.add_option("--RNA", dest="geneDataFileName",
  66                       help="set data type to RNA [default: DNA]")
  67     parser.add_option("--index", action="store_true", dest="doIndex",
  68                       help="index the output rds file")
  69     parser.add_option("--cache", type="int", dest="cachePages",
  70                       help="number of cache pages to use [default: 100000")
  71     parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
  72                       help="use : as the delimiter")
  73     parser.add_option("--paired", dest="pairID",
  74                       help="pairID value")
  75     parser.add_option("--extended", action="store_true", dest="extended",
  76                       help="use eland_extended input")
  77     parser.add_option("--verbose", action="store_true", dest="verbose")
  78     parser.add_option("--maxlines", type="int", dest="maxLines",
  79                       help="[default: 1000000000")
  80
  81     configParser = getConfigParser()
  82     section = "makerdsfromeland2"
  83     init = getConfigBoolOption(configParser, section, "init", True)
  84     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
  85     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
  86     geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", None)
  87     useOldDelimiter = getConfigBoolOption(configParser, section, "useOldDelimiter", False)
  88     pairID = getConfigOption(configParser, section, "pairID", None)
  89     maxLines = getConfigIntOption(configParser, section, "maxLines", 1000000000)
  90     extended = getConfigBoolOption(configParser, section, "extended", False)
  91     verbose = getConfigBoolOption(configParser, section, "verbose", False)
  92
  93     parser.set_defaults(init=init, doIndex=doIndex, cachePages=cachePages,
  94                         geneDataFileName=geneDataFileName, useOldDelimiter=useOldDelimiter,
  95                         pairID=pairID, maxLines=maxLines, extended=extended, verbose=verbose)
  96
  97     return parser
  98
  99
 100 def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False,
 101                       init=True, pairID="1", dataType="DNA", geneDataFileName=None,
 102                       cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
 103
 104     maxBorder = 0
 105     index = 0
 106     insertSize = 100000
 107
 108     geneDict = {}
 109     if dataType == 'RNA':
 110         genedatafile = open(geneDataFileName)
 111         for line in genedatafile:
 112             fields = line.strip().split('\t')
 113             blockCount = int(fields[7])
 114             if blockCount < 2:
 115                 continue
 116
 117             uname = fields[0]
 118             chrom = fields[1]
 119             sense = fields[2]
 120             chromstarts = fields[8][:-1].split(',')
 121             for index in range(blockCount):
 122                 chromstarts[index] = int(chromstarts[index])
 123
 124             geneDict[uname] = (sense, blockCount, chrom, chromstarts)
 125
 126         genedatafile.close()
 127
 128     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
 129
 130     if cachePages > rds.getDefaultCacheSize():
 131         if init:
 132             rds.setDBcache(cachePages, default=True)
 133         else:
 134             rds.setDBcache(cachePages)
 135
 136     if not init and doIndex:
 137         try:
 138             if rds.hasIndex():
 139                 rds.dropIndex()
 140         except:
 141             if verbose:
 142                 print "couldn't drop Index"
 143
 144     propertyList = []
 145     for arg in sys.argv:
 146         if '::' in arg:
 147             (pname, pvalue) = arg.strip().split('::')
 148             if pname == 'flowcell' and paired:
 149                 pvalue = pvalue + '/' + pairID
 150
 151             propertyList.append((pname, pvalue))
 152
 153     if len(propertyList) > 0:
 154         rds.insertMetadata(propertyList)
 155
 156     infile = open(filename,'r')
 157     line = infile.readline()
 158     fields = line.split()
 159     readsize = len(fields[1])
 160     readsizeString = str(readsize)
 161     if dataType == 'RNA' and readsize > 32:
 162         splicesizeString = '32'
 163     else:
 164         splicesizeString = readsizeString
 165
 166     print 'read size: %d bp' % readsize
 167     if init:
 168         rds.insertMetadata([('readsize', readsize)])
 169         rds.insertMetadata([('eland_mapped', 'True')])
 170         if extended:
 171             rds.insertMetadata([('eland_extended', 'True')])
 172
 173         if paired:
 174             rds.insertMetadata([('paired', 'True')])
 175
 176     trim = -4
 177     if dataType == 'RNA':
 178         maxBorder = readsize + trim
 179
 180     insertList = []
 181     infile = open(filename,'r')
 182     print 'mapping unique reads...'
 183     lineIndex = 0
 184     for line in infile:
 185         lineIndex += 1
 186         if lineIndex > maxLines:
 187             break
 188
 189         fields = line.split()
 190         if fields[2] in  ['QC','NM']:
 191             continue
 192
 193         (matchType, bestMatch) = getUniqueMatch(fields[2])
 194         if matchType == -1:
 195             continue
 196
 197         bestpos = []
 198         try:
 199             pos = fields[3].split(',')
 200         except:
 201             if verbose:
 202                 print 'problem with line: %s' % line.strip()
 203             continue
 204
 205         matchDict = {0:[], 1:[], 2:[], 3:[]}
 206         if len(pos) == 1:
 207             if 'splice' in pos:
 208                 continue
 209
 210             bestpos = pos
 211         else:
 212             currentChr = ''
 213             for apos in pos:
 214                 if 'splice' in apos:
 215                     continue
 216
 217                 if ':' in apos:
 218                     (front, back) = apos.split(':')
 219                     currentChr = front
 220                 else:
 221                     back = apos
 222                     apos = currentChr + ':' + apos
 223
 224                 if extended:
 225                     matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
 226                     if matchType > 2:
 227                         matchType = 3
 228                 else:
 229                     matchType = int(apos[-1])
 230
 231                 matchDict[matchType].append(apos)
 232                 if bestMatch[matchType]:
 233                     bestpos.append(apos)
 234
 235         # for padded reads, mapped read might have more mismatches!
 236         if len(bestpos) == 0:
 237             # let's not worry about these yet.
 238             if 'splice' in line:
 239                 continue
 240
 241             for matchType in [1, 2, 3]:
 242                 if len(matchDict[matchType]) > 0:
 243                     if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
 244                         bestpos = matchDict[matchType]
 245                     break
 246
 247             if len(bestpos) == 0 and verbose:
 248                 print "couldn't pick best read from line: %s" % line
 249
 250         for apos in bestpos:
 251             try:
 252                 (chrom, back) = apos.split(':')
 253             except:
 254                 continue
 255
 256             if 'splice' in chrom:
 257                 continue
 258
 259             if '/' in chrom:
 260                 chromfields = chrom.split('/')
 261                 chrom = chromfields[-1]
 262
 263             if '.' in chrom:
 264                 try:
 265                     (chrom, fileExt) = chrom.split('.')
 266                 except:
 267                     if verbose:
 268                         print 'problem with chromosome on line %s' % line.strip()
 269
 270                     continue
 271
 272             if extended:
 273                 if 'F' in back:
 274                     sense = '+'
 275                     (start, matchPart) = back.split('F')
 276                 else:
 277                     sense = '-'
 278                     (start, matchPart) = back.split('R')
 279
 280                 start = int(start)
 281                 if matchPart == readsizeString:
 282                     matchType = ''
 283                 else:
 284                     matchType = decodeMismatches(fields[1], matchPart)
 285             else:
 286                 start = int(back[:-2])
 287                 if back[-2] == 'F':
 288                     sense = '+'
 289                 else:
 290                     sense = '-'
 291
 292             stop = int(start) + readsize - 1
 293             if paired:
 294                 readID = label + '-' + str(lineIndex) + '/' + pairID
 295             else:
 296                 readID = label + '-' + str(index)
 297
 298             if len(chrom) > 0:
 299                 insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))
 300
 301             if index % insertSize == 0:
 302                 rds.insertUniqs(insertList)
 303                 insertList = []
 304                 print '.',
 305                 sys.stdout.flush()
 306
 307             index += 1
 308
 309     if len(insertList) > 0:
 310         rds.insertUniqs(insertList)
 311         insertList = []
 312
 313     print
 314     print '%d unique reads' % index
 315     infile.close()
 316
 317     seenSpliceList = []
 318     if dataType == 'RNA':
 319         print 'mapping splices...'
 320         index = 0
 321         lineIndex = 0
 322         mapfile = open(filename,'r')
 323         for line in mapfile:
 324             lineIndex += 1
 325             if lineIndex > maxLines:
 326                 break
 327
 328             if 'splice' not in line:
 329                 continue
 330
 331             fields = line.strip().split()
 332             (matchType, bestMatch) = getUniqueMatch(fields[2])
 333             if matchType == -1:
 334                 continue
 335
 336             bestpos = []
 337             pos = fields[3].split(',')
 338             matchDict = {0:[], 1:[], 2:[], 3:[]}
 339             if len(pos) == 1:
 340                 if 'chr' in pos:
 341                     continue
 342
 343                 bestpos = pos
 344             else:
 345                 currentSplice = ''
 346                 for apos in pos:
 347                     if 'splice' not in apos:
 348                         continue
 349
 350                     if ':' in apos:
 351                         if delimiter == ':':
 352                             try:
 353                                 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
 354                             except:
 355                                 try:
 356                                     (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
 357                                     extmodel = extmodel1 + ':' + extmodel2
 358                                 except:
 359                                     print 'warning: could not process splice %s' % apos
 360                                     continue
 361
 362                             currentSplice = extmodel + ':' + spliceID + ':' + regionStart
 363                         else:
 364                             try:
 365                                 (currentSplice, thepos) = apos.split(':')
 366                             except:
 367                                 try:
 368                                     (extmodel1, restSplice, thepos) = apos.split(':')
 369                                     currentSplice = extmodel1 + ':' + restSplice
 370                                     (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
 371                                 except:
 372                                     print 'warning: could not process splice %s' % apos
 373                                     continue
 374                     else:
 375                         thepos = apos
 376                         apos = currentSplice + ':' + apos
 377
 378                     if extended:
 379                         matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
 380                         if matchType > 2:
 381                             matchType = 3
 382
 383                         # if readsize > 32, we risk loosing pefect matches that go beyond our expanded genome splices, so only ask for 32bp match
 384                         if thepos[:2] == splicesizeString:
 385                             matchType = 0
 386                     else:
 387                         matchType = int(apos[-1])
 388
 389                     if bestMatch[matchType]:
 390                         bestpos.append(apos)
 391
 392             # for padded reads, mapped read might have more mismatches!
 393             if len(bestpos) == 0:
 394                 for matchType in [1, 2, 3]:
 395                     if len(matchDict[matchType]) > 0:
 396                         if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
 397                             bestpos = matchDict[matchType]
 398
 399                         break
 400                 if len(bestpos) == 0 and verbose:
 401                     print "couldn't pick best read from line: %s" % line
 402
 403             for apos in bestpos:
 404                 if delimiter == ':':
 405                     try:
 406                         (extmodel, spliceID, regionStart, thepos) = apos.split(':')
 407                     except:
 408                         try:
 409                             (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
 410                             extmodel = extmodel1 + ':' + extmodel2
 411                         except:
 412                             print 'warning: could not process splice %s' % apos
 413                             continue
 414                 else:
 415                     try:
 416                         (currentSplice, thepos) = apos.split(':')
 417                     except:
 418                         try:
 419                             (extmodel1, restSplice, thepos) = apos.split(':')
 420                             currentSplice = extmodel1 + ':' + restSplice
 421                         except:
 422                             print 'warning: could not process splice %s' % apos
 423                             continue
 424
 425                     (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
 426
 427                 modelfields = extmodel.split('/')
 428                 if len(modelfields) > 2:
 429                     model = string.join(modelfields[1:],'/')
 430                 else:
 431                     model = modelfields[1]
 432
 433                 if model not in geneDict:
 434                     print fields
 435                     continue
 436
 437                 (sense, blockCount, chrom, chromstarts) = geneDict[model]
 438                 if extended:
 439                     if 'F' in thepos:
 440                         rsense = '+'
 441                         (start, matchPart) = thepos.split('F')
 442                     else:
 443                         rsense = '-'
 444                         (start, matchPart) = thepos.split('R')
 445
 446                     rstart = int(start) - 2
 447                     if matchPart == readsizeString:
 448                         matchType = ''
 449                     elif matchPart[:2] == splicesizeString:
 450                         matchType = ''
 451                     else:
 452                         matchType = decodeMismatches(fields[1], matchPart)
 453                 else:
 454                     rstart = int(thepos[:-2]) - 2
 455                     if thepos[-2] == 'F':
 456                         rsense = '+'
 457                     else:
 458                         rsense = '-'
 459
 460                 if trim <= rstart <= maxBorder:
 461                     pass
 462                 else:
 463                     print rstart
 464                     continue
 465
 466                 currentSplice = model + delimiter + spliceID + delimiter + regionStart
 467                 spliceID = int(spliceID)
 468                 lefthalf = maxBorder - rstart
 469                 if lefthalf < 1 or lefthalf > maxBorder:
 470                     continue
 471
 472                 righthalf = readsize - lefthalf
 473                 startL = int(regionStart)  + rstart
 474                 stopL = startL + lefthalf
 475                 startR = chromstarts[spliceID + 1]
 476                 stopR = chromstarts[spliceID + 1] + righthalf
 477                 if paired:
 478                     readName = label + '-' + str(lineIndex) + '/' + pairID
 479                 else:
 480                     readName = model + '-' + str(thepos)
 481
 482                 insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
 483                 index += 1
 484                 if index % insertSize == 0:
 485                     rds.insertSplices(insertList)
 486                     print '.',
 487                     sys.stdout.flush()
 488                     insertList = []
 489
 490                 if currentSplice not in seenSpliceList:
 491                     seenSpliceList.append(currentSplice)
 492
 493         mapfile.close()
 494         if len(insertList) > 0:
 495             rds.insertSplices(insertList)
 496             insertList = []
 497
 498         print
 499         print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList))
 500
 501     infile = open(filename,'r')
 502     print 'mapping multireads...'
 503     lineIndex = 0
 504     origReadid = rds.getMultiCount()
 505     try:
 506         readid = int(origReadid) + 1
 507     except:
 508         readid = 0
 509         origReadid = 0
 510
 511     print 'starting at %d' % (readid + 1)
 512
 513     for line in infile:
 514         lineIndex += 1
 515         if lineIndex > maxLines:
 516             break
 517
 518         fields = line.split()
 519         if len(fields) < 4:
 520             continue
 521
 522         if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
 523             continue
 524
 525         (zero, one, two) = fields[2].split(':')
 526         zero = int(zero)
 527         one = int(one)
 528         two = int(two)
 529
 530         bestMatch = [False] * readsize
 531         if zero > 1:
 532             bestMatch[0] = True
 533         elif zero == 0 and one > 1:
 534             bestMatch[1] = True
 535         elif zero == 0 and one == 0 and two > 1:
 536             bestMatch[2] = True
 537         else:
 538             continue
 539
 540         readcount = 0
 541         bestpos = []
 542         pos = fields[3].split(',')
 543         matchDict = {0:[], 1:[], 2:[], 3:[]}
 544         currentChr = ''
 545         for apos in pos:
 546             if ':' in apos:
 547                 try:
 548                     (front, back) = apos.split(':')
 549                 except:
 550                     if verbose:
 551                         print "problem splitting %s" % str(apos)
 552                     continue
 553
 554                 currentChr = front
 555             else:
 556                 back = apos
 557                 apos = currentChr + ':' + apos
 558
 559             if extended:
 560                 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
 561             else:
 562                 matchType = int(apos[-1])
 563
 564             try:
 565                 matchDict[matchType].append(apos)
 566             except:
 567                 matchDict[matchType] = [apos]
 568
 569             if bestMatch[matchType]:
 570                 bestpos.append(apos)
 571
 572         # for padded reads, mapped read might have more mismatches!
 573         if len(bestpos) == 0:
 574             for matchType in [1, 2, 3]:
 575                 if len(matchDict[matchType]) > 0:
 576                     if len(matchDict[matchType]) > 1:
 577                         noSplice = True
 578                         for arg in matchDict[matchType]:
 579                             if 'splice' in arg:
 580                                 noSplice = False
 581
 582                         if noSplice:
 583                             bestpos = matchDict[matchType]
 584                     break
 585
 586             if len(bestpos) == 0 and verbose:
 587                 print "couldn't pick best read from line: %s" % line
 588                 continue
 589
 590         hasSplice = False
 591         for apos in bestpos:
 592             if 'splice' in apos:
 593                 hasSplice = True
 594
 595         # do not allow multireads that can also map accross splices for now
 596         if hasSplice:
 597             if verbose:
 598                 print "throwing out multiread because of splice conflict"
 599             continue
 600
 601         if len(bestpos) > 0:
 602             readid += 1
 603
 604         for apos in bestpos:
 605             readcount += 1
 606             (front, back) = apos.split(':')
 607             chrom = front[:-3]
 608             if extended:
 609                 if 'F' in back:
 610                     sense = '+'
 611                     (start, matchPart) = back.split('F')
 612                 else:
 613                     sense = '-'
 614                     (start, matchPart) = back.split('R')
 615
 616                 start = int(start)
 617                 if matchPart == readsizeString:
 618                     matchType = ''
 619                 else:
 620                     matchType = decodeMismatches(fields[1], matchPart)
 621             else:
 622                 start = int(back[:-2])
 623                 if back[-2] == 'F':
 624                     sense = '+'
 625                 else:
 626                     sense = '-'
 627
 628             stop = int(start) + readsize
 629             readName = '%dx%d' % (readid, len(bestpos))
 630             if paired:
 631                 readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
 632
 633             insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))
 634             if index % insertSize == 0:
 635                 rds.insertMulti(insertList)
 636                 insertList = []
 637                 print '.',
 638                 sys.stdout.flush()
 639
 640             index += 1
 641
 642     if len(insertList) > 0:
 643         rds.insertMulti(insertList)
 644         insertList = []
 645
 646     print
 647     print '%d multireads' % (readid - origReadid)
 648
 649     if doIndex:
 650         print 'building index....'
 651         rds.buildIndex(cachePages)
 652
 653
 654 def getUniqueMatch(elandCode):
 655     (zero, one, two) = elandCode.split(':')
 656     zero = int(zero)
 657     one = int(one)
 658     two = int(two)
 659     bestMatch = [False, False, False, False]
 660     if zero == 1:
 661         bestMatch[0] = True
 662         matchType = 0
 663     elif zero == 0 and one == 1:
 664         bestMatch[1] = True
 665         matchType = 1
 666     elif zero == 0 and one == 0 and two == 1:
 667         bestMatch[2] = True
 668         matchType = 2
 669     else:
 670         matchType = -1
 671
 672     return (matchType, bestMatch)
 673
 674
 675 def decodeMismatches(origSeq, code):
 676     output = []
 677     number = '0'
 678     index = 0
 679     for pos in code:
 680         if pos.isdigit():
 681             number += pos
 682         else:
 683             index += int(number) + 1
 684             origNT = origSeq[index - 1]
 685             output.append('%s%d%s' % (origNT, index, pos))
 686             number = '0'
 687
 688     return string.join(output, ',')
 689
 690
 691 if __name__ == "__main__":
 692     main(sys.argv)