makerdsfromeland2.py

   1 #
   2 #  makerdsfromeland2.py
   3 #  ENRAGE
   4 #
   5 try:
   6     import psyco
   7     psyco.full()
   8 except:
   9     pass
  10
  11 import sys, string, optparse
  12 from commoncode import readDataset
  13
  14 def main(argv=None):
  15     if not argv:
  16         argv = sys.argv
  17
  18     verstring = "%prog: version 3.4"
  19     print verstring
  20
  21     usage = "usage:  %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
  22             \ninput reads must be sorted to properly record multireads"
  23
  24     parser = optparse.OptionParser(usage=usage)
  25     parser.add_option("--append", action="store_false", dest="init",
  26                       help="append to existing rds file [default: create new]")
  27     parser.add_option("--RNA", dest="geneDataFileName",
  28                       help="set data type to RNA [default: DNA]")
  29     parser.add_option("--index", action="store_true", dest="doIndex",
  30                       help="index the output rds file")
  31     parser.add_option("--cache", type="int", dest="cachePages",
  32                       help="number of cache pages to use [default: 100000")
  33     parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
  34                       help="use : as the delimiter")
  35     parser.add_option("--paired", dest="pairID",
  36                       help="pairID value")
  37     parser.add_option("--extended", action="store_true", dest="extended",
  38                       help="use eland_extended input")
  39     parser.add_option("--verbose", action="store_true", dest="verbose")
  40     parser.add_option("--maxlines", type="int", dest="maxLines",
  41                       help="[default: 1000000000")
  42     parser.set_defaults(init=True, doIndex=False, cachePages=100000, geneDataFileName=None, useOldDelimiter=False, pairID=None, maxLines=1000000000, extended=False, verbose=False)
  43     (options, args) = parser.parse_args(argv[1:])
  44
  45     if len(args) < 3:
  46         print usage
  47         sys.exit(1)
  48
  49     label = args[0]
  50     filename = args[1]
  51     outdbname = args[2]
  52
  53     delimiter = '|'
  54     if options.useOldDelimiter:
  55         delimiter = ':'
  56
  57     paired = False
  58     pairID = '1'
  59     if options.pairID is not None:
  60         paired = True
  61         if options.pairID not in ['1','2']:
  62             print 'pairID value must be 1 or 2'
  63             sys.exit(-1)
  64
  65         print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID)
  66
  67     dataType = 'DNA'
  68     if options.geneDataFileName is not None:
  69         dataType = 'RNA'
  70
  71     makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init, options.pairID, dataType, options.geneDataFileName, options.cachePages, options.maxLines, options.extended, options.verbose)
  72
  73
  74 def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False, init=True, pairID="1", dataType="DNA", geneDataFileName=None, cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
  75     maxBorder = 0
  76     index = 0
  77     insertSize = 100000
  78
  79     geneDict = {}
  80     mapDict = {}
  81     seenSpliceList = []
  82     if dataType == 'RNA':
  83         genedatafile = open(geneDataFileName)
  84         for line in genedatafile:
  85             fields = line.strip().split('\t')
  86             blockCount = int(fields[7])
  87             if blockCount < 2:
  88                 continue
  89
  90             uname = fields[0]
  91             chrom = fields[1]
  92             sense = fields[2]
  93             chromstarts = fields[8][:-1].split(',')
  94             chromstops = fields[9][:-1].split(',')
  95             exonLengths = []
  96             totalLength = 0
  97             for index in range(blockCount):
  98                 chromstarts[index] = int(chromstarts[index])
  99                 chromstops[index] = int(chromstops[index])
 100                 exonLengths.append(chromstops[index] - chromstarts[index])
 101                 totalLength += exonLengths[index]
 102
 103             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
 104             mapDict[uname] = []
 105         genedatafile.close()
 106
 107     rds = readDataset(outdbname, init, dataType, verbose=True)
 108
 109     if cachePages > rds.getDefaultCacheSize():
 110         if init:
 111             rds.setDBcache(cachePages, default=True)
 112         else:
 113             rds.setDBcache(cachePages)
 114
 115     if not init and doIndex:
 116         try:
 117             if rds.hasIndex():
 118                 rds.dropIndex()
 119         except:
 120             if verbose:
 121                 print "couldn't drop Index"
 122
 123     propertyList = []
 124     for arg in sys.argv:
 125         if '::' in arg:
 126             (pname, pvalue) = arg.strip().split('::')
 127             if pname == 'flowcell' and paired:
 128                 pvalue = pvalue + '/' + pairID
 129
 130             propertyList.append((pname, pvalue))
 131
 132     if len(propertyList) > 0:
 133         rds.insertMetadata(propertyList)
 134
 135     infile = open(filename,'r')
 136     line = infile.readline()
 137     fields = line.split()
 138     readsize = len(fields[1])
 139     readsizeString = str(readsize)
 140     if dataType == 'RNA' and readsize > 32:
 141         splicesizeString = '32'
 142     else:
 143         splicesizeString = readsizeString
 144
 145     print 'read size: %d bp' % readsize
 146     if init:
 147         rds.insertMetadata([('readsize', readsize)])
 148         rds.insertMetadata([('eland_mapped', 'True')])
 149         if extended:
 150             rds.insertMetadata([('eland_extended', 'True')])
 151
 152         if paired:
 153             rds.insertMetadata([('paired', 'True')])
 154
 155     trim = -4
 156     if dataType == 'RNA':
 157         maxBorder = readsize + trim
 158
 159     insertList = []
 160     infile = open(filename,'r')
 161     print 'mapping unique reads...'
 162     lineIndex = 0
 163     for line in infile:
 164         lineIndex += 1
 165         if lineIndex > maxLines:
 166             break
 167
 168         fields = line.split()
 169         if fields[2] in  ['QC','NM']:
 170             continue
 171
 172         (matchType, bestMatch) = getUniqueMatch(fields[2])
 173         if matchType == -1:
 174             continue
 175
 176         bestpos = []
 177         try:
 178             pos = fields[3].split(',')
 179         except:
 180             if verbose:
 181                 print 'problem with line: %s' % line.strip()
 182             continue
 183
 184         matchDict = {0:[], 1:[], 2:[], 3:[]}
 185         if len(pos) == 1:
 186             if 'splice' in pos:
 187                 continue
 188
 189             bestpos = pos
 190         else:
 191             currentChr = ''
 192             for apos in pos:
 193                 if 'splice' in apos:
 194                     continue
 195
 196                 if ':' in apos:
 197                     (front, back) = apos.split(':')
 198                     currentChr = front
 199                 else:
 200                     back = apos
 201                     apos = currentChr + ':' + apos
 202
 203                 if extended:
 204                     matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
 205                     if matchType > 2:
 206                         matchType = 3
 207                 else:
 208                     matchType = int(apos[-1])
 209
 210                 matchDict[matchType].append(apos)
 211                 if bestMatch[matchType]:
 212                     bestpos.append(apos)
 213
 214         # for padded reads, mapped read might have more mismatches!
 215         if len(bestpos) == 0:
 216             # let's not worry about these yet.
 217             if 'splice' in line:
 218                 continue
 219
 220             for matchType in [1, 2, 3]:
 221                 if len(matchDict[matchType]) > 0:
 222                     if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
 223                         bestpos = matchDict[matchType]
 224                     break
 225
 226             if len(bestpos) == 0 and verbose:
 227                 print "couldn't pick best read from line: %s" % line
 228
 229         for apos in bestpos:
 230             try:
 231                 (chrom, back) = apos.split(':')
 232             except:
 233                 continue
 234
 235             if 'splice' in chrom:
 236                 continue
 237
 238             if '/' in chrom:
 239                 chromfields = chrom.split('/')
 240                 chrom = chromfields[-1]
 241
 242             if '.' in chrom:
 243                 try:
 244                     (chrom, fileExt) = chrom.split('.')
 245                 except:
 246                     if verbose:
 247                         print 'problem with chromosome on line %s' % line.strip()
 248
 249                     continue
 250
 251             if extended:
 252                 if 'F' in back:
 253                     sense = '+'
 254                     (start, matchPart) = back.split('F')
 255                 else:
 256                     sense = '-'
 257                     (start, matchPart) = back.split('R')
 258
 259                 start = int(start)
 260                 if matchPart == readsizeString:
 261                     matchType = ''
 262                 else:
 263                     matchType = decodeMismatches(fields[1], matchPart)
 264             else:
 265                 start = int(back[:-2])
 266                 if back[-2] == 'F':
 267                     sense = '+'
 268                 else:
 269                     sense = '-'
 270
 271             stop = int(start) + readsize - 1
 272             if paired:
 273                 readID = label + '-' + str(lineIndex) + '/' + pairID
 274             else:
 275                 readID = label + '-' + str(index)
 276
 277             if len(chrom) > 0:
 278                 insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))
 279
 280             if index % insertSize == 0:
 281                 rds.insertUniqs(insertList)
 282                 insertList = []
 283                 print '.',
 284                 sys.stdout.flush()
 285
 286             index += 1
 287
 288     if len(insertList) > 0:
 289         rds.insertUniqs(insertList)
 290         insertList = []
 291
 292     print
 293     print '%d unique reads' % index
 294     infile.close()
 295
 296     if dataType == 'RNA':
 297         print 'mapping splices...'
 298         index = 0
 299         lineIndex = 0
 300         mapfile = open(filename,'r')
 301         for line in mapfile:
 302             lineIndex += 1
 303             if lineIndex > maxLines:
 304                 break
 305
 306             if 'splice' not in line:
 307                 continue
 308
 309             fields = line.strip().split()
 310             (matchType, bestMatch) = getUniqueMatch(fields[2])
 311             if matchType == -1:
 312                 continue
 313
 314             bestpos = []
 315             pos = fields[3].split(',')
 316             matchDict = {0:[], 1:[], 2:[], 3:[]}
 317             if len(pos) == 1:
 318                 if 'chr' in pos:
 319                     continue
 320
 321                 bestpos = pos
 322             else:
 323                 currentSplice = ''
 324                 for apos in pos:
 325                     if 'splice' not in apos:
 326                         continue
 327
 328                     if ':' in apos:
 329                         if delimiter == ':':
 330                             try:
 331                                 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
 332                             except:
 333                                 try:
 334                                     (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
 335                                     extmodel = extmodel1 + ':' + extmodel2
 336                                 except:
 337                                     print 'warning: could not process splice %s' % apos
 338                                     continue
 339
 340                             currentSplice = extmodel + ':' + spliceID + ':' + regionStart
 341                         else:
 342                             try:
 343                                 (currentSplice, thepos) = apos.split(':')
 344                             except:
 345                                 try:
 346                                     (extmodel1, restSplice, thepos) = apos.split(':')
 347                                     currentSplice = extmodel1 + ':' + restSplice
 348                                     (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
 349                                 except:
 350                                     print 'warning: could not process splice %s' % apos
 351                                     continue
 352                     else:
 353                         thepos = apos
 354                         apos = currentSplice + ':' + apos
 355
 356                     if extended:
 357                         matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
 358                         if matchType > 2:
 359                             matchType = 3
 360
 361                         # if readsize > 32, we risk loosing pefect matches that go beyond our expanded genome splices, so only ask for 32bp match
 362                         if thepos[:2] == splicesizeString:
 363                             matchType = 0
 364                     else:
 365                         matchType = int(apos[-1])
 366
 367                     if bestMatch[matchType]:
 368                         bestpos.append(apos)
 369
 370             # for padded reads, mapped read might have more mismatches!
 371             if len(bestpos) == 0:
 372                 for matchType in [1, 2, 3]:
 373                     if len(matchDict[matchType]) > 0:
 374                         if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
 375                             bestpos = matchDict[matchType]
 376
 377                         break
 378                 if len(bestpos) == 0 and verbose:
 379                     print "couldn't pick best read from line: %s" % line
 380
 381             for apos in bestpos:
 382                 if delimiter == ':':
 383                     try:
 384                         (extmodel, spliceID, regionStart, thepos) = apos.split(':')
 385                     except:
 386                         try:
 387                             (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
 388                             extmodel = extmodel1 + ':' + extmodel2
 389                         except:
 390                             print 'warning: could not process splice %s' % apos
 391                             continue
 392                 else:
 393                     try:
 394                         (currentSplice, thepos) = apos.split(':')
 395                     except:
 396                         try:
 397                             (extmodel1, restSplice, thepos) = apos.split(':')
 398                             currentSplice = extmodel1 + ':' + restSplice
 399                         except:
 400                             print 'warning: could not process splice %s' % apos
 401                             continue
 402
 403                     (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
 404
 405                 modelfields = extmodel.split('/')
 406                 if len(modelfields) > 2:
 407                     model = string.join(modelfields[1:],'/')
 408                 else:
 409                     model = modelfields[1]
 410
 411                 if model not in geneDict:
 412                     print fields
 413                     continue
 414
 415                 (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
 416                 if extended:
 417                     if 'F' in thepos:
 418                         rsense = '+'
 419                         (start, matchPart) = thepos.split('F')
 420                     else:
 421                         rsense = '-'
 422                         (start, matchPart) = thepos.split('R')
 423
 424                     rstart = int(start) - 2
 425                     if matchPart == readsizeString:
 426                         matchType = ''
 427                     elif matchPart[:2] == splicesizeString:
 428                         matchType = ''
 429                     else:
 430                         matchType = decodeMismatches(fields[1], matchPart)
 431                 else:
 432                     rstart = int(thepos[:-2]) - 2
 433                     if thepos[-2] == 'F':
 434                         rsense = '+'
 435                     else:
 436                         rsense = '-'
 437
 438                 if trim <= rstart <= maxBorder:
 439                     pass
 440                 else:
 441                     print rstart
 442                     continue
 443
 444                 currentSplice = model + delimiter + spliceID + delimiter + regionStart
 445                 spliceID = int(spliceID)
 446                 lefthalf = maxBorder - rstart
 447                 if lefthalf < 1 or lefthalf > maxBorder:
 448                     continue
 449
 450                 righthalf = readsize - lefthalf
 451                 startL = int(regionStart)  + rstart
 452                 stopL = startL + lefthalf
 453                 startR = chromstarts[spliceID + 1]
 454                 stopR = chromstarts[spliceID + 1] + righthalf
 455                 if paired:
 456                     readName = label + '-' + str(lineIndex) + '/' + pairID
 457                 else:
 458                     readName = model + '-' + str(thepos)
 459
 460                 insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
 461                 index += 1
 462                 if index % insertSize == 0:
 463                     rds.insertSplices(insertList)
 464                     print '.',
 465                     sys.stdout.flush()
 466                     insertList = []
 467
 468                 if currentSplice not in seenSpliceList:
 469                     seenSpliceList.append(currentSplice)
 470
 471         mapfile.close()
 472         if len(insertList) > 0:
 473             rds.insertSplices(insertList)
 474             insertList = []
 475
 476         print
 477         print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList))
 478
 479     infile = open(filename,'r')
 480     print 'mapping multireads...'
 481     lineIndex = 0
 482     origReadid = rds.getMultiCount()
 483     try:
 484         readid = int(origReadid) + 1
 485     except:
 486         readid = 0
 487         origReadid = 0
 488
 489     print 'starting at %d' % (readid + 1)
 490
 491     for line in infile:
 492         lineIndex += 1
 493         if lineIndex > maxLines:
 494             break
 495
 496         fields = line.split()
 497         if len(fields) < 4:
 498             continue
 499
 500         if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
 501             continue
 502
 503         (zero, one, two) = fields[2].split(':')
 504         zero = int(zero)
 505         one = int(one)
 506         two = int(two)
 507
 508         bestMatch = [False] * readsize
 509         if zero > 1:
 510             bestMatch[0] = True
 511         elif zero == 0 and one > 1:
 512             bestMatch[1] = True
 513         elif zero == 0 and one == 0 and two > 1:
 514             bestMatch[2] = True
 515         else:
 516             continue
 517
 518         readcount = 0
 519         bestpos = []
 520         pos = fields[3].split(',')
 521         matchDict = {0:[], 1:[], 2:[], 3:[]}
 522         currentChr = ''
 523         for apos in pos:
 524             if ':' in apos:
 525                 try:
 526                     (front, back) = apos.split(':')
 527                 except:
 528                     if verbose:
 529                         print "problem splitting %s" % str(apos)
 530                     continue
 531
 532                 currentChr = front
 533             else:
 534                 back = apos
 535                 apos = currentChr + ':' + apos
 536
 537             if extended:
 538                 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
 539             else:
 540                 matchType = int(apos[-1])
 541
 542             try:
 543                 matchDict[matchType].append(apos)
 544             except:
 545                 matchDict[matchType] = [apos]
 546
 547             if bestMatch[matchType]:
 548                 bestpos.append(apos)
 549
 550         # for padded reads, mapped read might have more mismatches!
 551         if len(bestpos) == 0:
 552             for matchType in [1, 2, 3]:
 553                 if len(matchDict[matchType]) > 0:
 554                     if len(matchDict[matchType]) > 1:
 555                         noSplice = True
 556                         for arg in matchDict[matchType]:
 557                             if 'splice' in arg:
 558                                 noSplice = False
 559
 560                         if noSplice:
 561                             bestpos = matchDict[matchType]
 562                     break
 563
 564             if len(bestpos) == 0 and verbose:
 565                 print "couldn't pick best read from line: %s" % line
 566                 continue
 567
 568         hasSplice = False
 569         for apos in bestpos:
 570             if 'splice' in apos:
 571                 hasSplice = True
 572
 573         # do not allow multireads that can also map accross splices for now
 574         if hasSplice:
 575             if verbose:
 576                 print "throwing out multiread because of splice conflict"
 577             continue
 578
 579         if len(bestpos) > 0:
 580             readid += 1
 581
 582         for apos in bestpos:
 583             readcount += 1
 584             (front, back) = apos.split(':')
 585             chrom = front[:-3]
 586             if extended:
 587                 if 'F' in back:
 588                     sense = '+'
 589                     (start, matchPart) = back.split('F')
 590                 else:
 591                     sense = '-'
 592                     (start, matchPart) = back.split('R')
 593
 594                 start = int(start)
 595                 if matchPart == readsizeString:
 596                     matchType = ''
 597                 else:
 598                     matchType = decodeMismatches(fields[1], matchPart)
 599             else:
 600                 start = int(back[:-2])
 601                 if back[-2] == 'F':
 602                     sense = '+'
 603                 else:
 604                     sense = '-'
 605
 606             stop = int(start) + readsize
 607             readName = '%dx%d' % (readid, len(bestpos))
 608             if paired:
 609                 readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
 610
 611             insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))
 612             if index % insertSize == 0:
 613                 rds.insertMulti(insertList)
 614                 insertList = []
 615                 print '.',
 616                 sys.stdout.flush()
 617
 618             index += 1
 619
 620     if len(insertList) > 0:
 621         rds.insertMulti(insertList)
 622         insertList = []
 623
 624     print
 625     print '%d multireads' % (readid - origReadid)
 626
 627     if doIndex:
 628         print 'building index....'
 629         rds.buildIndex(cachePages)
 630
 631
 632 def getUniqueMatch(elandCode):
 633     (zero, one, two) = elandCode.split(':')
 634     zero = int(zero)
 635     one = int(one)
 636     two = int(two)
 637     bestMatch = [False, False, False, False]
 638     if zero == 1:
 639         bestMatch[0] = True
 640         matchType = 0
 641     elif zero == 0 and one == 1:
 642         bestMatch[1] = True
 643         matchType = 1
 644     elif zero == 0 and one == 0 and two == 1:
 645         bestMatch[2] = True
 646         matchType = 2
 647     else:
 648         matchType = -1
 649
 650     return (matchType, bestMatch)
 651
 652
 653 def decodeMismatches(origSeq, code):
 654     output = []
 655     number = '0'
 656     index = 0
 657     for pos in code:
 658         if pos.isdigit():
 659             number += pos
 660         else:
 661             index += int(number) + 1
 662             origNT = origSeq[index - 1]
 663             output.append('%s%d%s' % (origNT, index, pos))
 664             number = '0'
 665
 666     return string.join(output, ',')
 667
 668
 669 if __name__ == "__main__":
 670     main(sys.argv)