makerdsfromeland2.py

   1 #
   2 #  makerdsfromeland2.py
   3 #  ENRAGE
   4 #
   5 try:
   6     import psyco
   7     psyco.full()
   8 except:
   9     pass
  10
  11 import sys
  12 import string
  13 import optparse
  14 import ReadDataset
  15 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  16
  17 def main(argv=None):
  18     if not argv:
  19         argv = sys.argv
  20
  21     verstring = "makerdsfromeland2: version 3.5"
  22     print verstring
  23
  24     usage = "usage:  %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
  25             \ninput reads must be sorted to properly record multireads"
  26
  27     parser = getParser(usage)
  28     (options, args) = parser.parse_args(argv[1:])
  29
  30     if len(args) < 3:
  31         print usage
  32         sys.exit(1)
  33
  34     label = args[0]
  35     filename = args[1]
  36     outdbname = args[2]
  37
  38     delimiter = '|'
  39     if options.useOldDelimiter:
  40         delimiter = ':'
  41
  42     paired = False
  43     pairID = '1'
  44     if options.pairID is not None:
  45         paired = True
  46         if options.pairID not in ['1','2']:
  47             print 'pairID value must be 1 or 2'
  48             sys.exit(-1)
  49
  50         print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID)
  51
  52     dataType = 'DNA'
  53     if options.geneDataFileName is not None:
  54         dataType = 'RNA'
  55
  56     makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init,
  57                       options.pairID, dataType, options.geneDataFileName, options.cachePages,
  58                       options.maxLines, options.extended, options.verbose)
  59
  60
  61 def getParser(usage):
  62     parser = optparse.OptionParser(usage=usage)
  63     parser.add_option("--append", action="store_false", dest="init",
  64                       help="append to existing rds file [default: create new]")
  65     parser.add_option("--RNA", dest="geneDataFileName",
  66                       help="set data type to RNA [default: DNA]")
  67     parser.add_option("--index", action="store_true", dest="doIndex",
  68                       help="index the output rds file")
  69     parser.add_option("--cache", type="int", dest="cachePages",
  70                       help="number of cache pages to use [default: 100000")
  71     parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
  72                       help="use : as the delimiter")
  73     parser.add_option("--paired", dest="pairID",
  74                       help="pairID value")
  75     parser.add_option("--extended", action="store_true", dest="extended",
  76                       help="use eland_extended input")
  77     parser.add_option("--verbose", action="store_true", dest="verbose")
  78     parser.add_option("--maxlines", type="int", dest="maxLines",
  79                       help="[default: 1000000000")
  80
  81     configParser = getConfigParser()
  82     section = "makerdsfromeland2"
  83     init = getConfigBoolOption(configParser, section, "init", True)
  84     doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
  85     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
  86     geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", None)
  87     useOldDelimiter = getConfigBoolOption(configParser, section, "useOldDelimiter", False)
  88     pairID = getConfigOption(configParser, section, "pairID", None)
  89     maxLines = getConfigIntOption(configParser, section, "maxLines", 1000000000)
  90     extended = getConfigBoolOption(configParser, section, "extended", False)
  91     verbose = getConfigBoolOption(configParser, section, "verbose", False)
  92
  93     parser.set_defaults(init=init, doIndex=doIndex, cachePages=cachePages,
  94                         geneDataFileName=geneDataFileName, useOldDelimiter=useOldDelimiter,
  95                         pairID=pairID, maxLines=maxLines, extended=extended, verbose=verbose)
  96
  97     return parser
  98
  99
 100 def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False,
 101                       init=True, pairID="1", dataType="DNA", geneDataFileName=None,
 102                       cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
 103
 104     maxBorder = 0
 105     index = 0
 106     insertSize = 100000
 107
 108     geneDict = {}
 109     mapDict = {}
 110     seenSpliceList = []
 111     if dataType == 'RNA':
 112         genedatafile = open(geneDataFileName)
 113         for line in genedatafile:
 114             fields = line.strip().split('\t')
 115             blockCount = int(fields[7])
 116             if blockCount < 2:
 117                 continue
 118
 119             uname = fields[0]
 120             chrom = fields[1]
 121             sense = fields[2]
 122             chromstarts = fields[8][:-1].split(',')
 123             chromstops = fields[9][:-1].split(',')
 124             exonLengths = []
 125             totalLength = 0
 126             for index in range(blockCount):
 127                 chromstarts[index] = int(chromstarts[index])
 128                 chromstops[index] = int(chromstops[index])
 129                 exonLengths.append(chromstops[index] - chromstarts[index])
 130                 totalLength += exonLengths[index]
 131
 132             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
 133             mapDict[uname] = []
 134         genedatafile.close()
 135
 136     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
 137
 138     if cachePages > rds.getDefaultCacheSize():
 139         if init:
 140             rds.setDBcache(cachePages, default=True)
 141         else:
 142             rds.setDBcache(cachePages)
 143
 144     if not init and doIndex:
 145         try:
 146             if rds.hasIndex():
 147                 rds.dropIndex()
 148         except:
 149             if verbose:
 150                 print "couldn't drop Index"
 151
 152     propertyList = []
 153     for arg in sys.argv:
 154         if '::' in arg:
 155             (pname, pvalue) = arg.strip().split('::')
 156             if pname == 'flowcell' and paired:
 157                 pvalue = pvalue + '/' + pairID
 158
 159             propertyList.append((pname, pvalue))
 160
 161     if len(propertyList) > 0:
 162         rds.insertMetadata(propertyList)
 163
 164     infile = open(filename,'r')
 165     line = infile.readline()
 166     fields = line.split()
 167     readsize = len(fields[1])
 168     readsizeString = str(readsize)
 169     if dataType == 'RNA' and readsize > 32:
 170         splicesizeString = '32'
 171     else:
 172         splicesizeString = readsizeString
 173
 174     print 'read size: %d bp' % readsize
 175     if init:
 176         rds.insertMetadata([('readsize', readsize)])
 177         rds.insertMetadata([('eland_mapped', 'True')])
 178         if extended:
 179             rds.insertMetadata([('eland_extended', 'True')])
 180
 181         if paired:
 182             rds.insertMetadata([('paired', 'True')])
 183
 184     trim = -4
 185     if dataType == 'RNA':
 186         maxBorder = readsize + trim
 187
 188     insertList = []
 189     infile = open(filename,'r')
 190     print 'mapping unique reads...'
 191     lineIndex = 0
 192     for line in infile:
 193         lineIndex += 1
 194         if lineIndex > maxLines:
 195             break
 196
 197         fields = line.split()
 198         if fields[2] in  ['QC','NM']:
 199             continue
 200
 201         (matchType, bestMatch) = getUniqueMatch(fields[2])
 202         if matchType == -1:
 203             continue
 204
 205         bestpos = []
 206         try:
 207             pos = fields[3].split(',')
 208         except:
 209             if verbose:
 210                 print 'problem with line: %s' % line.strip()
 211             continue
 212
 213         matchDict = {0:[], 1:[], 2:[], 3:[]}
 214         if len(pos) == 1:
 215             if 'splice' in pos:
 216                 continue
 217
 218             bestpos = pos
 219         else:
 220             currentChr = ''
 221             for apos in pos:
 222                 if 'splice' in apos:
 223                     continue
 224
 225                 if ':' in apos:
 226                     (front, back) = apos.split(':')
 227                     currentChr = front
 228                 else:
 229                     back = apos
 230                     apos = currentChr + ':' + apos
 231
 232                 if extended:
 233                     matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
 234                     if matchType > 2:
 235                         matchType = 3
 236                 else:
 237                     matchType = int(apos[-1])
 238
 239                 matchDict[matchType].append(apos)
 240                 if bestMatch[matchType]:
 241                     bestpos.append(apos)
 242
 243         # for padded reads, mapped read might have more mismatches!
 244         if len(bestpos) == 0:
 245             # let's not worry about these yet.
 246             if 'splice' in line:
 247                 continue
 248
 249             for matchType in [1, 2, 3]:
 250                 if len(matchDict[matchType]) > 0:
 251                     if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
 252                         bestpos = matchDict[matchType]
 253                     break
 254
 255             if len(bestpos) == 0 and verbose:
 256                 print "couldn't pick best read from line: %s" % line
 257
 258         for apos in bestpos:
 259             try:
 260                 (chrom, back) = apos.split(':')
 261             except:
 262                 continue
 263
 264             if 'splice' in chrom:
 265                 continue
 266
 267             if '/' in chrom:
 268                 chromfields = chrom.split('/')
 269                 chrom = chromfields[-1]
 270
 271             if '.' in chrom:
 272                 try:
 273                     (chrom, fileExt) = chrom.split('.')
 274                 except:
 275                     if verbose:
 276                         print 'problem with chromosome on line %s' % line.strip()
 277
 278                     continue
 279
 280             if extended:
 281                 if 'F' in back:
 282                     sense = '+'
 283                     (start, matchPart) = back.split('F')
 284                 else:
 285                     sense = '-'
 286                     (start, matchPart) = back.split('R')
 287
 288                 start = int(start)
 289                 if matchPart == readsizeString:
 290                     matchType = ''
 291                 else:
 292                     matchType = decodeMismatches(fields[1], matchPart)
 293             else:
 294                 start = int(back[:-2])
 295                 if back[-2] == 'F':
 296                     sense = '+'
 297                 else:
 298                     sense = '-'
 299
 300             stop = int(start) + readsize - 1
 301             if paired:
 302                 readID = label + '-' + str(lineIndex) + '/' + pairID
 303             else:
 304                 readID = label + '-' + str(index)
 305
 306             if len(chrom) > 0:
 307                 insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))
 308
 309             if index % insertSize == 0:
 310                 rds.insertUniqs(insertList)
 311                 insertList = []
 312                 print '.',
 313                 sys.stdout.flush()
 314
 315             index += 1
 316
 317     if len(insertList) > 0:
 318         rds.insertUniqs(insertList)
 319         insertList = []
 320
 321     print
 322     print '%d unique reads' % index
 323     infile.close()
 324
 325     if dataType == 'RNA':
 326         print 'mapping splices...'
 327         index = 0
 328         lineIndex = 0
 329         mapfile = open(filename,'r')
 330         for line in mapfile:
 331             lineIndex += 1
 332             if lineIndex > maxLines:
 333                 break
 334
 335             if 'splice' not in line:
 336                 continue
 337
 338             fields = line.strip().split()
 339             (matchType, bestMatch) = getUniqueMatch(fields[2])
 340             if matchType == -1:
 341                 continue
 342
 343             bestpos = []
 344             pos = fields[3].split(',')
 345             matchDict = {0:[], 1:[], 2:[], 3:[]}
 346             if len(pos) == 1:
 347                 if 'chr' in pos:
 348                     continue
 349
 350                 bestpos = pos
 351             else:
 352                 currentSplice = ''
 353                 for apos in pos:
 354                     if 'splice' not in apos:
 355                         continue
 356
 357                     if ':' in apos:
 358                         if delimiter == ':':
 359                             try:
 360                                 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
 361                             except:
 362                                 try:
 363                                     (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
 364                                     extmodel = extmodel1 + ':' + extmodel2
 365                                 except:
 366                                     print 'warning: could not process splice %s' % apos
 367                                     continue
 368
 369                             currentSplice = extmodel + ':' + spliceID + ':' + regionStart
 370                         else:
 371                             try:
 372                                 (currentSplice, thepos) = apos.split(':')
 373                             except:
 374                                 try:
 375                                     (extmodel1, restSplice, thepos) = apos.split(':')
 376                                     currentSplice = extmodel1 + ':' + restSplice
 377                                     (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
 378                                 except:
 379                                     print 'warning: could not process splice %s' % apos
 380                                     continue
 381                     else:
 382                         thepos = apos
 383                         apos = currentSplice + ':' + apos
 384
 385                     if extended:
 386                         matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
 387                         if matchType > 2:
 388                             matchType = 3
 389
 390                         # if readsize > 32, we risk loosing pefect matches that go beyond our expanded genome splices, so only ask for 32bp match
 391                         if thepos[:2] == splicesizeString:
 392                             matchType = 0
 393                     else:
 394                         matchType = int(apos[-1])
 395
 396                     if bestMatch[matchType]:
 397                         bestpos.append(apos)
 398
 399             # for padded reads, mapped read might have more mismatches!
 400             if len(bestpos) == 0:
 401                 for matchType in [1, 2, 3]:
 402                     if len(matchDict[matchType]) > 0:
 403                         if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
 404                             bestpos = matchDict[matchType]
 405
 406                         break
 407                 if len(bestpos) == 0 and verbose:
 408                     print "couldn't pick best read from line: %s" % line
 409
 410             for apos in bestpos:
 411                 if delimiter == ':':
 412                     try:
 413                         (extmodel, spliceID, regionStart, thepos) = apos.split(':')
 414                     except:
 415                         try:
 416                             (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
 417                             extmodel = extmodel1 + ':' + extmodel2
 418                         except:
 419                             print 'warning: could not process splice %s' % apos
 420                             continue
 421                 else:
 422                     try:
 423                         (currentSplice, thepos) = apos.split(':')
 424                     except:
 425                         try:
 426                             (extmodel1, restSplice, thepos) = apos.split(':')
 427                             currentSplice = extmodel1 + ':' + restSplice
 428                         except:
 429                             print 'warning: could not process splice %s' % apos
 430                             continue
 431
 432                     (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
 433
 434                 modelfields = extmodel.split('/')
 435                 if len(modelfields) > 2:
 436                     model = string.join(modelfields[1:],'/')
 437                 else:
 438                     model = modelfields[1]
 439
 440                 if model not in geneDict:
 441                     print fields
 442                     continue
 443
 444                 (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
 445                 if extended:
 446                     if 'F' in thepos:
 447                         rsense = '+'
 448                         (start, matchPart) = thepos.split('F')
 449                     else:
 450                         rsense = '-'
 451                         (start, matchPart) = thepos.split('R')
 452
 453                     rstart = int(start) - 2
 454                     if matchPart == readsizeString:
 455                         matchType = ''
 456                     elif matchPart[:2] == splicesizeString:
 457                         matchType = ''
 458                     else:
 459                         matchType = decodeMismatches(fields[1], matchPart)
 460                 else:
 461                     rstart = int(thepos[:-2]) - 2
 462                     if thepos[-2] == 'F':
 463                         rsense = '+'
 464                     else:
 465                         rsense = '-'
 466
 467                 if trim <= rstart <= maxBorder:
 468                     pass
 469                 else:
 470                     print rstart
 471                     continue
 472
 473                 currentSplice = model + delimiter + spliceID + delimiter + regionStart
 474                 spliceID = int(spliceID)
 475                 lefthalf = maxBorder - rstart
 476                 if lefthalf < 1 or lefthalf > maxBorder:
 477                     continue
 478
 479                 righthalf = readsize - lefthalf
 480                 startL = int(regionStart)  + rstart
 481                 stopL = startL + lefthalf
 482                 startR = chromstarts[spliceID + 1]
 483                 stopR = chromstarts[spliceID + 1] + righthalf
 484                 if paired:
 485                     readName = label + '-' + str(lineIndex) + '/' + pairID
 486                 else:
 487                     readName = model + '-' + str(thepos)
 488
 489                 insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
 490                 index += 1
 491                 if index % insertSize == 0:
 492                     rds.insertSplices(insertList)
 493                     print '.',
 494                     sys.stdout.flush()
 495                     insertList = []
 496
 497                 if currentSplice not in seenSpliceList:
 498                     seenSpliceList.append(currentSplice)
 499
 500         mapfile.close()
 501         if len(insertList) > 0:
 502             rds.insertSplices(insertList)
 503             insertList = []
 504
 505         print
 506         print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList))
 507
 508     infile = open(filename,'r')
 509     print 'mapping multireads...'
 510     lineIndex = 0
 511     origReadid = rds.getMultiCount()
 512     try:
 513         readid = int(origReadid) + 1
 514     except:
 515         readid = 0
 516         origReadid = 0
 517
 518     print 'starting at %d' % (readid + 1)
 519
 520     for line in infile:
 521         lineIndex += 1
 522         if lineIndex > maxLines:
 523             break
 524
 525         fields = line.split()
 526         if len(fields) < 4:
 527             continue
 528
 529         if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
 530             continue
 531
 532         (zero, one, two) = fields[2].split(':')
 533         zero = int(zero)
 534         one = int(one)
 535         two = int(two)
 536
 537         bestMatch = [False] * readsize
 538         if zero > 1:
 539             bestMatch[0] = True
 540         elif zero == 0 and one > 1:
 541             bestMatch[1] = True
 542         elif zero == 0 and one == 0 and two > 1:
 543             bestMatch[2] = True
 544         else:
 545             continue
 546
 547         readcount = 0
 548         bestpos = []
 549         pos = fields[3].split(',')
 550         matchDict = {0:[], 1:[], 2:[], 3:[]}
 551         currentChr = ''
 552         for apos in pos:
 553             if ':' in apos:
 554                 try:
 555                     (front, back) = apos.split(':')
 556                 except:
 557                     if verbose:
 558                         print "problem splitting %s" % str(apos)
 559                     continue
 560
 561                 currentChr = front
 562             else:
 563                 back = apos
 564                 apos = currentChr + ':' + apos
 565
 566             if extended:
 567                 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
 568             else:
 569                 matchType = int(apos[-1])
 570
 571             try:
 572                 matchDict[matchType].append(apos)
 573             except:
 574                 matchDict[matchType] = [apos]
 575
 576             if bestMatch[matchType]:
 577                 bestpos.append(apos)
 578
 579         # for padded reads, mapped read might have more mismatches!
 580         if len(bestpos) == 0:
 581             for matchType in [1, 2, 3]:
 582                 if len(matchDict[matchType]) > 0:
 583                     if len(matchDict[matchType]) > 1:
 584                         noSplice = True
 585                         for arg in matchDict[matchType]:
 586                             if 'splice' in arg:
 587                                 noSplice = False
 588
 589                         if noSplice:
 590                             bestpos = matchDict[matchType]
 591                     break
 592
 593             if len(bestpos) == 0 and verbose:
 594                 print "couldn't pick best read from line: %s" % line
 595                 continue
 596
 597         hasSplice = False
 598         for apos in bestpos:
 599             if 'splice' in apos:
 600                 hasSplice = True
 601
 602         # do not allow multireads that can also map accross splices for now
 603         if hasSplice:
 604             if verbose:
 605                 print "throwing out multiread because of splice conflict"
 606             continue
 607
 608         if len(bestpos) > 0:
 609             readid += 1
 610
 611         for apos in bestpos:
 612             readcount += 1
 613             (front, back) = apos.split(':')
 614             chrom = front[:-3]
 615             if extended:
 616                 if 'F' in back:
 617                     sense = '+'
 618                     (start, matchPart) = back.split('F')
 619                 else:
 620                     sense = '-'
 621                     (start, matchPart) = back.split('R')
 622
 623                 start = int(start)
 624                 if matchPart == readsizeString:
 625                     matchType = ''
 626                 else:
 627                     matchType = decodeMismatches(fields[1], matchPart)
 628             else:
 629                 start = int(back[:-2])
 630                 if back[-2] == 'F':
 631                     sense = '+'
 632                 else:
 633                     sense = '-'
 634
 635             stop = int(start) + readsize
 636             readName = '%dx%d' % (readid, len(bestpos))
 637             if paired:
 638                 readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
 639
 640             insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))
 641             if index % insertSize == 0:
 642                 rds.insertMulti(insertList)
 643                 insertList = []
 644                 print '.',
 645                 sys.stdout.flush()
 646
 647             index += 1
 648
 649     if len(insertList) > 0:
 650         rds.insertMulti(insertList)
 651         insertList = []
 652
 653     print
 654     print '%d multireads' % (readid - origReadid)
 655
 656     if doIndex:
 657         print 'building index....'
 658         rds.buildIndex(cachePages)
 659
 660
 661 def getUniqueMatch(elandCode):
 662     (zero, one, two) = elandCode.split(':')
 663     zero = int(zero)
 664     one = int(one)
 665     two = int(two)
 666     bestMatch = [False, False, False, False]
 667     if zero == 1:
 668         bestMatch[0] = True
 669         matchType = 0
 670     elif zero == 0 and one == 1:
 671         bestMatch[1] = True
 672         matchType = 1
 673     elif zero == 0 and one == 0 and two == 1:
 674         bestMatch[2] = True
 675         matchType = 2
 676     else:
 677         matchType = -1
 678
 679     return (matchType, bestMatch)
 680
 681
 682 def decodeMismatches(origSeq, code):
 683     output = []
 684     number = '0'
 685     index = 0
 686     for pos in code:
 687         if pos.isdigit():
 688             number += pos
 689         else:
 690             index += int(number) + 1
 691             origNT = origSeq[index - 1]
 692             output.append('%s%d%s' % (origNT, index, pos))
 693             number = '0'
 694
 695     return string.join(output, ',')
 696
 697
 698 if __name__ == "__main__":
 699     main(sys.argv)