15 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
# --- main() fragment (incomplete view: many original lines missing) ---
# Parses CLI args, validates the optional pairID, then dispatches to
# makeRDSFromEland2.  NOTE(review): indentation and intervening lines were
# lost in this dump; bodies of the `if` statements below are not visible.
21 verstring = "makerdsfromeland2: version 3.5"
24 usage = "usage: %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
25 \ninput reads must be sorted to properly record multireads"
27 parser = getParser(usage)
28 (options, args) = parser.parse_args(argv[1:])
# presumably switches the splice-name delimiter to ':' — body not visible; verify
39 if options.useOldDelimiter:
44 if options.pairID is not None:
# pairID must be the string '1' or '2' (Illumina paired-end read number)
46 if options.pairID not in ['1','2']:
47 print 'pairID value must be 1 or 2'
50 print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID)
# a gene-model file implies RNA data — dataType assignment not visible here
53 if options.geneDataFileName is not None:
# hand everything off to the worker; argument order mirrors the function signature
56 makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init,
57 options.pairID, dataType, options.geneDataFileName, options.cachePages,
58 options.maxLines, options.extended, options.verbose)
# --- getParser() fragment: build the optparse parser, then load defaults ---
# Defaults come from a shared config file (section "makerdsfromeland2") so
# command-line flags override config values, which override hard-coded ones.
62 parser = optparse.OptionParser(usage=usage)
# --append stores False into `init`, i.e. do NOT re-create the rds file
63 parser.add_option("--append", action="store_false", dest="init",
64 help="append to existing rds file [default: create new]")
65 parser.add_option("--RNA", dest="geneDataFileName",
66 help="set data type to RNA [default: DNA]")
67 parser.add_option("--index", action="store_true", dest="doIndex",
68 help="index the output rds file")
69 parser.add_option("--cache", type="int", dest="cachePages",
# NOTE(review): help text missing closing ']' — should read "[default: 100000]"
70 help="number of cache pages to use [default: 100000")
71 parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
72 help="use : as the delimiter")
# NOTE(review): this add_option call appears truncated in this dump — its
# help= line (original line 74) is not visible
73 parser.add_option("--paired", dest="pairID",
75 parser.add_option("--extended", action="store_true", dest="extended",
76 help="use eland_extended input")
77 parser.add_option("--verbose", action="store_true", dest="verbose")
78 parser.add_option("--maxlines", type="int", dest="maxLines",
# NOTE(review): help text missing closing ']' — should read "[default: 1000000000]"
79 help="[default: 1000000000")
# pull per-section defaults from the shared configuration file
81 configParser = getConfigParser()
82 section = "makerdsfromeland2"
83 init = getConfigBoolOption(configParser, section, "init", True)
84 doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
85 cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
86 geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", None)
87 useOldDelimiter = getConfigBoolOption(configParser, section, "useOldDelimiter", False)
88 pairID = getConfigOption(configParser, section, "pairID", None)
89 maxLines = getConfigIntOption(configParser, section, "maxLines", 1000000000)
90 extended = getConfigBoolOption(configParser, section, "extended", False)
91 verbose = getConfigBoolOption(configParser, section, "verbose", False)
# register the config-derived values as optparse defaults
93 parser.set_defaults(init=init, doIndex=doIndex, cachePages=cachePages,
94 geneDataFileName=geneDataFileName, useOldDelimiter=useOldDelimiter,
95 pairID=pairID, maxLines=maxLines, extended=extended, verbose=verbose)
# Convert an ELAND (or eland_extended) alignment file into an RDS read
# database.  Three passes over the input: unique reads, spliced reads
# (RNA only), and multireads.  Incomplete view: many original lines are
# missing from this dump, so several branches/loops appear without bodies.
100 def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False,
101 init=True, pairID="1", dataType="DNA", geneDataFileName=None,
102 cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
# ---- RNA only: load gene models so splice coordinates can be mapped ----
111 if dataType == 'RNA':
112 genedatafile = open(geneDataFileName)
# gene-model file is tab-delimited; field 7 = exon count,
# fields 8/9 = comma-terminated exon start/stop lists (hence the [:-1] strip)
113 for line in genedatafile:
114 fields = line.strip().split('\t')
115 blockCount = int(fields[7])
122 chromstarts = fields[8][:-1].split(',')
123 chromstops = fields[9][:-1].split(',')
126 for index in range(blockCount):
127 chromstarts[index] = int(chromstarts[index])
128 chromstops[index] = int(chromstops[index])
129 exonLengths.append(chromstops[index] - chromstarts[index])
130 totalLength += exonLengths[index]
# keyed by unique gene name; consumed later when resolving splice hits
132 geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
# ---- open/create the output RDS database and tune its page cache ----
136 rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
138 if cachePages > rds.getDefaultCacheSize():
140 rds.setDBcache(cachePages, default=True)
142 rds.setDBcache(cachePages)
# when appending with indexing requested, drop the old index first
144 if not init and doIndex:
150 print "couldn't drop Index"
# ---- record any label::value metadata properties passed on the CLI ----
155 (pname, pvalue) = arg.strip().split('::')
# paired runs get the pair number appended to the flowcell property
156 if pname == 'flowcell' and paired:
157 pvalue = pvalue + '/' + pairID
159 propertyList.append((pname, pvalue))
161 if len(propertyList) > 0:
162 rds.insertMetadata(propertyList)
# ---- derive read size from the first input line's sequence field ----
164 infile = open(filename,'r')
165 line = infile.readline()
166 fields = line.split()
167 readsize = len(fields[1])
168 readsizeString = str(readsize)
# RNA reads longer than 32 bp match splices on only their first 32 bp
169 if dataType == 'RNA' and readsize > 32:
170 splicesizeString = '32'
172 splicesizeString = readsizeString
174 print 'read size: %d bp' % readsize
176 rds.insertMetadata([('readsize', readsize)])
177 rds.insertMetadata([('eland_mapped', 'True')])
179 rds.insertMetadata([('eland_extended', 'True')])
182 rds.insertMetadata([('paired', 'True')])
185 if dataType == 'RNA':
# maxBorder bounds how far into a splice-spanning region a read may start
186 maxBorder = readsize + trim
# ================= pass 1: unique reads =================
189 infile = open(filename,'r')
190 print 'mapping unique reads...'
194 if lineIndex > maxLines:
197 fields = line.split()
# QC = quality filtered, NM = no match: skip
198 if fields[2] in ['QC','NM']:
201 (matchType, bestMatch) = getUniqueMatch(fields[2])
# field 3 is the comma-separated list of candidate positions
207 pos = fields[3].split(',')
210 print 'problem with line: %s' % line.strip()
# bucket candidate positions by mismatch count 0-3
213 matchDict = {0:[], 1:[], 2:[], 3:[]}
226 (front, back) = apos.split(':')
# positions without an explicit chromosome inherit the previous one
230 apos = currentChr + ':' + apos
# extended format: mismatch count = number of substituted bases in the suffix
233 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
# classic format: mismatch count is the trailing digit
237 matchType = int(apos[-1])
239 matchDict[matchType].append(apos)
240 if bestMatch[matchType]:
243 # for padded reads, mapped read might have more mismatches!
244 if len(bestpos) == 0:
245 # let's not worry about these yet.
# fall back: accept a lone non-splice hit at the lowest mismatch level
249 for matchType in [1, 2, 3]:
250 if len(matchDict[matchType]) > 0:
251 if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
252 bestpos = matchDict[matchType]
255 if len(bestpos) == 0 and verbose:
256 print "couldn't pick best read from line: %s" % line
260 (chrom, back) = apos.split(':')
# splice hits are handled in pass 2, not here
264 if 'splice' in chrom:
# strip any path components, then the file extension, from the chrom name
268 chromfields = chrom.split('/')
269 chrom = chromfields[-1]
273 (chrom, fileExt) = chrom.split('.')
276 print 'problem with chromosome on line %s' % line.strip()
# F/R suffix encodes strand; the remainder describes the match
283 (start, matchPart) = back.split('F')
286 (start, matchPart) = back.split('R')
289 if matchPart == readsizeString:
292 matchType = decodeMismatches(fields[1], matchPart)
294 start = int(back[:-2])
# stop is inclusive here (start + readsize - 1); contrast multiread pass
300 stop = int(start) + readsize - 1
# paired reads get the line index + pair number as their ID
302 readID = label + '-' + str(lineIndex) + '/' + pairID
304 readID = label + '-' + str(index)
307 insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))
# flush the batch every insertSize rows
309 if index % insertSize == 0:
310 rds.insertUniqs(insertList)
317 if len(insertList) > 0:
318 rds.insertUniqs(insertList)
322 print '%d unique reads' % index
# ================= pass 2 (RNA only): spliced reads =================
325 if dataType == 'RNA':
326 print 'mapping splices...'
329 mapfile = open(filename,'r')
332 if lineIndex > maxLines:
335 if 'splice' not in line:
338 fields = line.strip().split()
339 (matchType, bestMatch) = getUniqueMatch(fields[2])
344 pos = fields[3].split(',')
345 matchDict = {0:[], 1:[], 2:[], 3:[]}
354 if 'splice' not in apos:
# splice names may carry 4 or 5 ':'-separated parts (extended model names)
360 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
363 (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
364 extmodel = extmodel1 + ':' + extmodel2
366 print 'warning: could not process splice %s' % apos
369 currentSplice = extmodel + ':' + spliceID + ':' + regionStart
# short form: position relative to the previously seen splice
372 (currentSplice, thepos) = apos.split(':')
375 (extmodel1, restSplice, thepos) = apos.split(':')
376 currentSplice = extmodel1 + ':' + restSplice
377 (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
379 print 'warning: could not process splice %s' % apos
383 apos = currentSplice + ':' + apos
386 matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
390 # if readsize > 32, we risk loosing pefect matches that go beyond our expanded genome splices, so only ask for 32bp match
# NOTE(review): thepos[:2] compares only 2 chars against e.g. '32' — looks
# intentional per the comment above, but verify against original source
391 if thepos[:2] == splicesizeString:
394 matchType = int(apos[-1])
396 if bestMatch[matchType]:
399 # for padded reads, mapped read might have more mismatches!
# fall back: accept a lone splice hit at the lowest mismatch level
400 if len(bestpos) == 0:
401 for matchType in [1, 2, 3]:
402 if len(matchDict[matchType]) > 0:
403 if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
404 bestpos = matchDict[matchType]
407 if len(bestpos) == 0 and verbose:
408 print "couldn't pick best read from line: %s" % line
# re-parse the chosen position the same way as above
413 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
416 (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
417 extmodel = extmodel1 + ':' + extmodel2
419 print 'warning: could not process splice %s' % apos
423 (currentSplice, thepos) = apos.split(':')
426 (extmodel1, restSplice, thepos) = apos.split(':')
427 currentSplice = extmodel1 + ':' + restSplice
429 print 'warning: could not process splice %s' % apos
432 (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
# model name is the path tail; rejoin if it contained '/' itself
434 modelfields = extmodel.split('/')
435 if len(modelfields) > 2:
436 model = string.join(modelfields[1:],'/')
438 model = modelfields[1]
440 if model not in geneDict:
444 (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
448 (start, matchPart) = thepos.split('F')
451 (start, matchPart) = thepos.split('R')
# -2 offset — presumably converts to 0-based splice-region coords; verify
453 rstart = int(start) - 2
454 if matchPart == readsizeString:
456 elif matchPart[:2] == splicesizeString:
459 matchType = decodeMismatches(fields[1], matchPart)
461 rstart = int(thepos[:-2]) - 2
462 if thepos[-2] == 'F':
# only reads actually spanning the junction (within the border) count
467 if trim <= rstart <= maxBorder:
473 currentSplice = model + delimiter + spliceID + delimiter + regionStart
474 spliceID = int(spliceID)
# split the read into left/right genomic halves around the junction
475 lefthalf = maxBorder - rstart
476 if lefthalf < 1 or lefthalf > maxBorder:
479 righthalf = readsize - lefthalf
480 startL = int(regionStart) + rstart
481 stopL = startL + lefthalf
482 startR = chromstarts[spliceID + 1]
483 stopR = chromstarts[spliceID + 1] + righthalf
485 readName = label + '-' + str(lineIndex) + '/' + pairID
487 readName = model + '-' + str(thepos)
489 insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
491 if index % insertSize == 0:
492 rds.insertSplices(insertList)
# track distinct splices seen, for the summary line below
497 if currentSplice not in seenSpliceList:
498 seenSpliceList.append(currentSplice)
501 if len(insertList) > 0:
502 rds.insertSplices(insertList)
506 print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList))
# ================= pass 3: multireads =================
508 infile = open(filename,'r')
509 print 'mapping multireads...'
# continue multiread IDs from whatever is already in the database
511 origReadid = rds.getMultiCount()
513 readid = int(origReadid) + 1
518 print 'starting at %d' % (readid + 1)
522 if lineIndex > maxLines:
525 fields = line.split()
529 if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
# eland code "zero:one:two" = hit counts at 0/1/2 mismatches
532 (zero, one, two) = fields[2].split(':')
537 bestMatch = [False] * readsize
# NOTE(review): zero/one/two compared to ints — the int() conversions are
# presumably on lines not visible in this dump; confirm
540 elif zero == 0 and one > 1:
542 elif zero == 0 and one == 0 and two > 1:
549 pos = fields[3].split(',')
550 matchDict = {0:[], 1:[], 2:[], 3:[]}
555 (front, back) = apos.split(':')
558 print "problem splitting %s" % str(apos)
564 apos = currentChr + ':' + apos
567 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
569 matchType = int(apos[-1])
572 matchDict[matchType].append(apos)
574 matchDict[matchType] = [apos]
576 if bestMatch[matchType]:
579 # for padded reads, mapped read might have more mismatches!
580 if len(bestpos) == 0:
581 for matchType in [1, 2, 3]:
582 if len(matchDict[matchType]) > 0:
583 if len(matchDict[matchType]) > 1:
585 for arg in matchDict[matchType]:
590 bestpos = matchDict[matchType]
593 if len(bestpos) == 0 and verbose:
594 print "couldn't pick best read from line: %s" % line
602 # do not allow multireads that can also map accross splices for now
605 print "throwing out multiread because of splice conflict"
613 (front, back) = apos.split(':')
618 (start, matchPart) = back.split('F')
621 (start, matchPart) = back.split('R')
624 if matchPart == readsizeString:
627 matchType = decodeMismatches(fields[1], matchPart)
629 start = int(back[:-2])
# NOTE(review): stop here is start + readsize (exclusive?) while the unique
# pass used start + readsize - 1 — confirm this asymmetry is intentional
635 stop = int(start) + readsize
# e.g. "17x3" = multiread id 17 with 3 placements; weight split evenly below
636 readName = '%dx%d' % (readid, len(bestpos))
638 readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
640 insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))
641 if index % insertSize == 0:
642 rds.insertMulti(insertList)
649 if len(insertList) > 0:
650 rds.insertMulti(insertList)
654 print '%d multireads' % (readid - origReadid)
# ---- optionally (re)build the index with the same cache budget ----
657 print 'building index....'
658 rds.buildIndex(cachePages)
# Classify an eland match-count code "zero:one:two" (hits at 0/1/2
# mismatches).  Returns (matchType, bestMatch) where bestMatch is a
# 4-slot flag list indexed by mismatch count.  Incomplete view: the
# branches' bodies and (presumably) the int() conversions of the split
# strings are on lines missing from this dump — confirm against original.
661 def getUniqueMatch(elandCode):
662 (zero, one, two) = elandCode.split(':')
663 bestMatch = [False, False, False, False]
# exactly one hit at 1 mismatch (and none better) → unique 1-mismatch match
670 elif zero == 0 and one == 1:
# exactly one hit at 2 mismatches (and none better) → unique 2-mismatch match
673 elif zero == 0 and one == 0 and two == 1:
679 return (matchType, bestMatch)
# Decode an eland_extended-style mismatch string into a comma-joined list
# of descriptors "<origNT><1-based position><newNT>" (e.g. "A10G").
# Incomplete view: the loop header and the parsing of `number`/`pos` are
# on lines missing from this dump.
682 def decodeMismatches(origSeq, code):
# advance past <number> matching bases, landing on the mismatched base
690 index += int(number) + 1
691 origNT = origSeq[index - 1]
692 output.append('%s%d%s' % (origNT, index, pos))
# Python 2 idiom; equivalent to ','.join(output)
695 return string.join(output, ',')
# Script entry guard — the body (presumably main(sys.argv)) is not visible
# in this dump.
698 if __name__ == "__main__":