15 from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
21 verstring = "makerdsfromeland2: version 3.5"
24 usage = "usage: %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
25 \ninput reads must be sorted to properly record multireads"
27 parser = getParser(usage)
28 (options, args) = parser.parse_args(argv[1:])
39 if options.useOldDelimiter:
44 if options.pairID is not None:
46 if options.pairID not in ['1','2']:
47 print 'pairID value must be 1 or 2'
50 print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID)
53 if options.geneDataFileName is not None:
56 makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init,
57 options.pairID, dataType, options.geneDataFileName, options.cachePages,
58 options.maxLines, options.extended, options.verbose)
62 parser = optparse.OptionParser(usage=usage)
63 parser.add_option("--append", action="store_false", dest="init",
64 help="append to existing rds file [default: create new]")
65 parser.add_option("--RNA", dest="geneDataFileName",
66 help="set data type to RNA [default: DNA]")
67 parser.add_option("--index", action="store_true", dest="doIndex",
68 help="index the output rds file")
69 parser.add_option("--cache", type="int", dest="cachePages",
70 help="number of cache pages to use [default: 100000")
71 parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
72 help="use : as the delimiter")
73 parser.add_option("--paired", dest="pairID",
75 parser.add_option("--extended", action="store_true", dest="extended",
76 help="use eland_extended input")
77 parser.add_option("--verbose", action="store_true", dest="verbose")
78 parser.add_option("--maxlines", type="int", dest="maxLines",
79 help="[default: 1000000000")
81 configParser = getConfigParser()
82 section = "makerdsfromeland2"
83 init = getConfigBoolOption(configParser, section, "init", True)
84 doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
85 cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
86 geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", None)
87 useOldDelimiter = getConfigBoolOption(configParser, section, "useOldDelimiter", False)
88 pairID = getConfigOption(configParser, section, "pairID", None)
89 maxLines = getConfigIntOption(configParser, section, "maxLines", 1000000000)
90 extended = getConfigBoolOption(configParser, section, "extended", False)
91 verbose = getConfigBoolOption(configParser, section, "verbose", False)
93 parser.set_defaults(init=init, doIndex=doIndex, cachePages=cachePages,
94 geneDataFileName=geneDataFileName, useOldDelimiter=useOldDelimiter,
95 pairID=pairID, maxLines=maxLines, extended=extended, verbose=verbose)
100 def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False,
101 init=True, pairID="1", dataType="DNA", geneDataFileName=None,
102 cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
109 if dataType == 'RNA':
110 genedatafile = open(geneDataFileName)
111 for line in genedatafile:
112 fields = line.strip().split('\t')
113 blockCount = int(fields[7])
120 chromstarts = fields[8][:-1].split(',')
121 for index in range(blockCount):
122 chromstarts[index] = int(chromstarts[index])
124 geneDict[uname] = (sense, blockCount, chrom, chromstarts)
128 rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
130 if cachePages > rds.getDefaultCacheSize():
132 rds.setDBcache(cachePages, default=True)
134 rds.setDBcache(cachePages)
136 if not init and doIndex:
142 print "couldn't drop Index"
147 (pname, pvalue) = arg.strip().split('::')
148 if pname == 'flowcell' and paired:
149 pvalue = pvalue + '/' + pairID
151 propertyList.append((pname, pvalue))
153 if len(propertyList) > 0:
154 rds.insertMetadata(propertyList)
156 infile = open(filename,'r')
157 line = infile.readline()
158 fields = line.split()
159 readsize = len(fields[1])
160 readsizeString = str(readsize)
161 if dataType == 'RNA' and readsize > 32:
162 splicesizeString = '32'
164 splicesizeString = readsizeString
166 print 'read size: %d bp' % readsize
168 rds.insertMetadata([('readsize', readsize)])
169 rds.insertMetadata([('eland_mapped', 'True')])
171 rds.insertMetadata([('eland_extended', 'True')])
174 rds.insertMetadata([('paired', 'True')])
177 if dataType == 'RNA':
178 maxBorder = readsize + trim
181 infile = open(filename,'r')
182 print 'mapping unique reads...'
186 if lineIndex > maxLines:
189 fields = line.split()
190 if fields[2] in ['QC','NM']:
193 (matchType, bestMatch) = getUniqueMatch(fields[2])
199 pos = fields[3].split(',')
202 print 'problem with line: %s' % line.strip()
205 matchDict = {0:[], 1:[], 2:[], 3:[]}
218 (front, back) = apos.split(':')
222 apos = currentChr + ':' + apos
225 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
229 matchType = int(apos[-1])
231 matchDict[matchType].append(apos)
232 if bestMatch[matchType]:
235 # for padded reads, mapped read might have more mismatches!
236 if len(bestpos) == 0:
237 # let's not worry about these yet.
241 for matchType in [1, 2, 3]:
242 if len(matchDict[matchType]) > 0:
243 if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
244 bestpos = matchDict[matchType]
247 if len(bestpos) == 0 and verbose:
248 print "couldn't pick best read from line: %s" % line
252 (chrom, back) = apos.split(':')
256 if 'splice' in chrom:
260 chromfields = chrom.split('/')
261 chrom = chromfields[-1]
265 (chrom, fileExt) = chrom.split('.')
268 print 'problem with chromosome on line %s' % line.strip()
275 (start, matchPart) = back.split('F')
278 (start, matchPart) = back.split('R')
281 if matchPart == readsizeString:
284 matchType = decodeMismatches(fields[1], matchPart)
286 start = int(back[:-2])
292 stop = int(start) + readsize - 1
294 readID = label + '-' + str(lineIndex) + '/' + pairID
296 readID = label + '-' + str(index)
299 insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))
301 if index % insertSize == 0:
302 rds.insertUniqs(insertList)
309 if len(insertList) > 0:
310 rds.insertUniqs(insertList)
314 print '%d unique reads' % index
318 if dataType == 'RNA':
319 print 'mapping splices...'
322 mapfile = open(filename,'r')
325 if lineIndex > maxLines:
328 if 'splice' not in line:
331 fields = line.strip().split()
332 (matchType, bestMatch) = getUniqueMatch(fields[2])
337 pos = fields[3].split(',')
338 matchDict = {0:[], 1:[], 2:[], 3:[]}
347 if 'splice' not in apos:
353 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
356 (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
357 extmodel = extmodel1 + ':' + extmodel2
359 print 'warning: could not process splice %s' % apos
362 currentSplice = extmodel + ':' + spliceID + ':' + regionStart
365 (currentSplice, thepos) = apos.split(':')
368 (extmodel1, restSplice, thepos) = apos.split(':')
369 currentSplice = extmodel1 + ':' + restSplice
370 (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
372 print 'warning: could not process splice %s' % apos
376 apos = currentSplice + ':' + apos
379 matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
383 # if readsize > 32, we risk loosing pefect matches that go beyond our expanded genome splices, so only ask for 32bp match
384 if thepos[:2] == splicesizeString:
387 matchType = int(apos[-1])
389 if bestMatch[matchType]:
392 # for padded reads, mapped read might have more mismatches!
393 if len(bestpos) == 0:
394 for matchType in [1, 2, 3]:
395 if len(matchDict[matchType]) > 0:
396 if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
397 bestpos = matchDict[matchType]
400 if len(bestpos) == 0 and verbose:
401 print "couldn't pick best read from line: %s" % line
406 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
409 (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
410 extmodel = extmodel1 + ':' + extmodel2
412 print 'warning: could not process splice %s' % apos
416 (currentSplice, thepos) = apos.split(':')
419 (extmodel1, restSplice, thepos) = apos.split(':')
420 currentSplice = extmodel1 + ':' + restSplice
422 print 'warning: could not process splice %s' % apos
425 (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
427 modelfields = extmodel.split('/')
428 if len(modelfields) > 2:
429 model = string.join(modelfields[1:],'/')
431 model = modelfields[1]
433 if model not in geneDict:
437 (sense, blockCount, chrom, chromstarts) = geneDict[model]
441 (start, matchPart) = thepos.split('F')
444 (start, matchPart) = thepos.split('R')
446 rstart = int(start) - 2
447 if matchPart == readsizeString:
449 elif matchPart[:2] == splicesizeString:
452 matchType = decodeMismatches(fields[1], matchPart)
454 rstart = int(thepos[:-2]) - 2
455 if thepos[-2] == 'F':
460 if trim <= rstart <= maxBorder:
466 currentSplice = model + delimiter + spliceID + delimiter + regionStart
467 spliceID = int(spliceID)
468 lefthalf = maxBorder - rstart
469 if lefthalf < 1 or lefthalf > maxBorder:
472 righthalf = readsize - lefthalf
473 startL = int(regionStart) + rstart
474 stopL = startL + lefthalf
475 startR = chromstarts[spliceID + 1]
476 stopR = chromstarts[spliceID + 1] + righthalf
478 readName = label + '-' + str(lineIndex) + '/' + pairID
480 readName = model + '-' + str(thepos)
482 insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
484 if index % insertSize == 0:
485 rds.insertSplices(insertList)
490 if currentSplice not in seenSpliceList:
491 seenSpliceList.append(currentSplice)
494 if len(insertList) > 0:
495 rds.insertSplices(insertList)
499 print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList))
501 infile = open(filename,'r')
502 print 'mapping multireads...'
504 origReadid = rds.getMultiCount()
506 readid = int(origReadid) + 1
511 print 'starting at %d' % (readid + 1)
515 if lineIndex > maxLines:
518 fields = line.split()
522 if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
525 (zero, one, two) = fields[2].split(':')
530 bestMatch = [False] * readsize
533 elif zero == 0 and one > 1:
535 elif zero == 0 and one == 0 and two > 1:
542 pos = fields[3].split(',')
543 matchDict = {0:[], 1:[], 2:[], 3:[]}
548 (front, back) = apos.split(':')
551 print "problem splitting %s" % str(apos)
557 apos = currentChr + ':' + apos
560 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
562 matchType = int(apos[-1])
565 matchDict[matchType].append(apos)
567 matchDict[matchType] = [apos]
569 if bestMatch[matchType]:
572 # for padded reads, mapped read might have more mismatches!
573 if len(bestpos) == 0:
574 for matchType in [1, 2, 3]:
575 if len(matchDict[matchType]) > 0:
576 if len(matchDict[matchType]) > 1:
578 for arg in matchDict[matchType]:
583 bestpos = matchDict[matchType]
586 if len(bestpos) == 0 and verbose:
587 print "couldn't pick best read from line: %s" % line
595 # do not allow multireads that can also map accross splices for now
598 print "throwing out multiread because of splice conflict"
606 (front, back) = apos.split(':')
611 (start, matchPart) = back.split('F')
614 (start, matchPart) = back.split('R')
617 if matchPart == readsizeString:
620 matchType = decodeMismatches(fields[1], matchPart)
622 start = int(back[:-2])
628 stop = int(start) + readsize
629 readName = '%dx%d' % (readid, len(bestpos))
631 readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
633 insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))
634 if index % insertSize == 0:
635 rds.insertMulti(insertList)
642 if len(insertList) > 0:
643 rds.insertMulti(insertList)
647 print '%d multireads' % (readid - origReadid)
650 print 'building index....'
651 rds.buildIndex(cachePages)
654 def getUniqueMatch(elandCode):
655 (zero, one, two) = elandCode.split(':')
659 bestMatch = [False, False, False, False]
663 elif zero == 0 and one == 1:
666 elif zero == 0 and one == 0 and two == 1:
672 return (matchType, bestMatch)
675 def decodeMismatches(origSeq, code):
683 index += int(number) + 1
684 origNT = origSeq[index - 1]
685 output.append('%s%d%s' % (origNT, index, pos))
688 return string.join(output, ',')
691 if __name__ == "__main__":