11 import sys, string, optparse
12 from commoncode import readDataset
# ---------------------------------------------------------------------------
# Top-level command-line handling (Python 2 / optparse) for building an RDS
# read-dataset from ELAND2 mapper output.
# NOTE(review): this extract is a sampled, line-number-prefixed dump of the
# original file — many lines and all indentation are missing, so the exact
# nesting of the statements below cannot be confirmed from this view.
# NOTE(review): `argv` is presumably the parameter of an elided enclosing
# main(argv) function — TODO confirm against the full file.
# ---------------------------------------------------------------------------
18 verstring = "%prog: version 3.4"
21 usage = "usage: %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
22 \ninput reads must be sorted to properly record multireads"
24 parser = optparse.OptionParser(usage=usage)
# --append flips init to False so the existing RDS file is extended, not recreated.
25 parser.add_option("--append", action="store_false", dest="init",
26 help="append to existing rds file [default: create new]")
# --RNA takes a gene-model file name and implicitly switches dataType to RNA
# (presumably in an elided line; the dispatch below tests geneDataFileName).
27 parser.add_option("--RNA", dest="geneDataFileName",
28 help="set data type to RNA [default: DNA]")
29 parser.add_option("--index", action="store_true", dest="doIndex",
30 help="index the output rds file")
31 parser.add_option("--cache", type="int", dest="cachePages",
# NOTE(review): help text below is missing its closing ']' — it is a runtime
# string, so only flagged here, not changed.
32 help="number of cache pages to use [default: 100000")
33 parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
34 help="use : as the delimiter")
# NOTE(review): the "--paired" add_option call is truncated in this extract —
# its help argument / closing paren (original line 36) was elided.
35 parser.add_option("--paired", dest="pairID",
37 parser.add_option("--extended", action="store_true", dest="extended",
38 help="use eland_extended input")
39 parser.add_option("--verbose", action="store_true", dest="verbose")
40 parser.add_option("--maxlines", type="int", dest="maxLines",
# NOTE(review): help text below is also missing its closing ']'.
41 help="[default: 1000000000")
42 parser.set_defaults(init=True, doIndex=False, cachePages=100000, geneDataFileName=None, useOldDelimiter=False, pairID=None, maxLines=1000000000, extended=False, verbose=False)
43 (options, args) = parser.parse_args(argv[1:])
# Positional-arg unpacking (label/filename/outdbname) happens in elided lines.
54 if options.useOldDelimiter:
# Paired-end validation: pairID must be the string '1' or '2'; the error path
# presumably exits in an elided line.
59 if options.pairID is not None:
61 if options.pairID not in ['1','2']:
62 print 'pairID value must be 1 or 2'
65 print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID)
# A gene-model file implies RNA mode (dataType presumably set in elided lines).
68 if options.geneDataFileName is not None:
# NOTE(review): label, filename, outdbname, delimiter, paired and dataType are
# presumably bound in elided lines before this dispatch — TODO confirm.
71 makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init, options.pairID, dataType, options.geneDataFileName, options.cachePages, options.maxLines, options.extended, options.verbose)
74 def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False, init=True, pairID="1", dataType="DNA", geneDataFileName=None, cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
# Purpose: read an ELAND2 mapping file and load unique reads, spliced reads
# (RNA mode only) and multireads into an RDS dataset (readDataset) at
# outdbname, batching inserts and optionally building an index at the end.
# NOTE(review): this extract is a sampled, indentation-flattened dump — many
# statements (loop headers, try/except arms, counters such as lineIndex,
# index, insertSize, trim, bestpos, insertList, sense/rsense, currentChr,
# seenSpliceList, geneDict initialization) are elided, so the nesting shown
# below is NOT the original structure.
#
# --- RNA mode: parse the gene-model file into geneDict ---------------------
# Each tab-separated line presumably follows a UCSC-style gene table:
# field 7 = exon count, fields 8/9 = comma-terminated exon start/stop lists.
83 genedatafile = open(geneDataFileName)
84 for line in genedatafile:
85 fields = line.strip().split('\t')
86 blockCount = int(fields[7])
# [:-1] drops the trailing empty element left by the terminal comma.
93 chromstarts = fields[8][:-1].split(',')
94 chromstops = fields[9][:-1].split(',')
97 for index in range(blockCount):
98 chromstarts[index] = int(chromstarts[index])
99 chromstops[index] = int(chromstops[index])
100 exonLengths.append(chromstops[index] - chromstarts[index])
101 totalLength += exonLengths[index]
# uname/sense/chrom are presumably extracted from other fields in elided lines.
103 geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
# --- Open/create the RDS dataset and tune its page cache -------------------
107 rds = readDataset(outdbname, init, dataType, verbose=True)
109 if cachePages > rds.getDefaultCacheSize():
111 rds.setDBcache(cachePages, default=True)
113 rds.setDBcache(cachePages)
# When appending with indexing requested, the old index is dropped first
# (inside an elided try/except — the message below is the except arm).
115 if not init and doIndex:
121 print "couldn't drop Index"
# --- Record propertyName::propertyValue metadata pairs ---------------------
126 (pname, pvalue) = arg.strip().split('::')
# Paired runs get the pairID appended to the flowcell property value.
127 if pname == 'flowcell' and paired:
128 pvalue = pvalue + '/' + pairID
130 propertyList.append((pname, pvalue))
132 if len(propertyList) > 0:
133 rds.insertMetadata(propertyList)
# --- Derive read size from the first input line ----------------------------
135 infile = open(filename,'r')
136 line = infile.readline()
137 fields = line.split()
# Field 1 is the read sequence; its length is the (uniform) read size.
138 readsize = len(fields[1])
139 readsizeString = str(readsize)
# RNA splice matching is capped at a 32 bp prefix for long reads.
140 if dataType == 'RNA' and readsize > 32:
141 splicesizeString = '32'
143 splicesizeString = readsizeString
145 print 'read size: %d bp' % readsize
147 rds.insertMetadata([('readsize', readsize)])
148 rds.insertMetadata([('eland_mapped', 'True')])
150 rds.insertMetadata([('eland_extended', 'True')])
153 rds.insertMetadata([('paired', 'True')])
# NOTE(review): trim is assigned in elided lines — presumably the padding
# added around splice junctions in the expanded genome; TODO confirm.
156 if dataType == 'RNA':
157 maxBorder = readsize + trim
# --- Pass 1: unique reads --------------------------------------------------
160 infile = open(filename,'r')
161 print 'mapping unique reads...'
165 if lineIndex > maxLines:
168 fields = line.split()
# QC / NM codes mean no usable alignment for this read.
169 if fields[2] in ['QC','NM']:
172 (matchType, bestMatch) = getUniqueMatch(fields[2])
# Field 3 holds comma-separated candidate positions.
178 pos = fields[3].split(',')
181 print 'problem with line: %s' % line.strip()
# Buckets of candidate positions keyed by mismatch count 0-3.
184 matchDict = {0:[], 1:[], 2:[], 3:[]}
197 (front, back) = apos.split(':')
# Positions lacking a chromosome inherit the most recent one seen.
201 apos = currentChr + ':' + apos
# Non-extended format: mismatch count = number of substituted bases (ACGT)
# spelled out in the suffix.
204 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
# Extended format: mismatch count is the trailing digit.
208 matchType = int(apos[-1])
210 matchDict[matchType].append(apos)
211 if bestMatch[matchType]:
214 # for padded reads, mapped read might have more mismatches!
215 if len(bestpos) == 0:
216 # let's not worry about these yet.
# Fallback: accept a sole non-splice hit at the lowest nonzero mismatch count.
220 for matchType in [1, 2, 3]:
221 if len(matchDict[matchType]) > 0:
222 if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
223 bestpos = matchDict[matchType]
226 if len(bestpos) == 0 and verbose:
227 print "couldn't pick best read from line: %s" % line
231 (chrom, back) = apos.split(':')
# Splice hits are handled in pass 2; skip them here (elided continue).
235 if 'splice' in chrom:
# Strip any directory path, keep the file name, then drop its extension.
239 chromfields = chrom.split('/')
240 chrom = chromfields[-1]
244 (chrom, fileExt) = chrom.split('.')
247 print 'problem with chromosome on line %s' % line.strip()
# F/R suffix encodes strand; the remainder is the mismatch descriptor.
254 (start, matchPart) = back.split('F')
257 (start, matchPart) = back.split('R')
260 if matchPart == readsizeString:
263 matchType = decodeMismatches(fields[1], matchPart)
265 start = int(back[:-2])
# NOTE(review): unique reads use an inclusive stop (readsize - 1) while the
# multiread pass below uses start + readsize — inconsistent span convention;
# verify against the RDS schema before changing either.
271 stop = int(start) + readsize - 1
# Paired reads are named label-lineIndex/pairID; unpaired use a running index.
273 readID = label + '-' + str(lineIndex) + '/' + pairID
275 readID = label + '-' + str(index)
278 insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))
# Flush the batch every insertSize rows (insertSize set in elided lines).
280 if index % insertSize == 0:
281 rds.insertUniqs(insertList)
288 if len(insertList) > 0:
289 rds.insertUniqs(insertList)
293 print '%d unique reads' % index
# --- Pass 2 (RNA only): spliced reads --------------------------------------
296 if dataType == 'RNA':
297 print 'mapping splices...'
300 mapfile = open(filename,'r')
303 if lineIndex > maxLines:
306 if 'splice' not in line:
309 fields = line.strip().split()
310 (matchType, bestMatch) = getUniqueMatch(fields[2])
315 pos = fields[3].split(',')
316 matchDict = {0:[], 1:[], 2:[], 3:[]}
325 if 'splice' not in apos:
# Splice position formats (colon-delimited), tried in elided try/except arms:
# 4 parts: extmodel:spliceID:regionStart:thepos
# 5 parts: the model itself contained a ':' and is re-joined below.
331 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
334 (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
335 extmodel = extmodel1 + ':' + extmodel2
337 print 'warning: could not process splice %s' % apos
340 currentSplice = extmodel + ':' + spliceID + ':' + regionStart
# Short form: position only; splice identity carried over from currentSplice.
343 (currentSplice, thepos) = apos.split(':')
346 (extmodel1, restSplice, thepos) = apos.split(':')
347 currentSplice = extmodel1 + ':' + restSplice
348 (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
350 print 'warning: could not process splice %s' % apos
354 apos = currentSplice + ':' + apos
357 matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
361 # if readsize > 32, we risk losing perfect matches that go beyond our expanded genome splices, so only ask for a 32bp match
# NOTE(review): thepos[:2] compares only the FIRST TWO characters against
# splicesizeString — correct for '32', but suspect when splicesizeString is
# the full readsizeString with more than two digits; TODO confirm intent.
362 if thepos[:2] == splicesizeString:
365 matchType = int(apos[-1])
367 if bestMatch[matchType]:
370 # for padded reads, mapped read might have more mismatches!
# Fallback mirrors pass 1 but requires the sole hit to BE a splice.
371 if len(bestpos) == 0:
372 for matchType in [1, 2, 3]:
373 if len(matchDict[matchType]) > 0:
374 if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
375 bestpos = matchDict[matchType]
378 if len(bestpos) == 0 and verbose:
379 print "couldn't pick best read from line: %s" % line
# Re-parse the chosen best position (same format variants as above).
384 (extmodel, spliceID, regionStart, thepos) = apos.split(':')
387 (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
388 extmodel = extmodel1 + ':' + extmodel2
390 print 'warning: could not process splice %s' % apos
394 (currentSplice, thepos) = apos.split(':')
397 (extmodel1, restSplice, thepos) = apos.split(':')
398 currentSplice = extmodel1 + ':' + restSplice
400 print 'warning: could not process splice %s' % apos
403 (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
# extmodel is path-like; the gene-model name is everything after the first '/'.
405 modelfields = extmodel.split('/')
406 if len(modelfields) > 2:
407 model = string.join(modelfields[1:],'/')
409 model = modelfields[1]
411 if model not in geneDict:
415 (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
419 (start, matchPart) = thepos.split('F')
422 (start, matchPart) = thepos.split('R')
# -2 presumably compensates for the F/R strand suffix offset — TODO confirm.
424 rstart = int(start) - 2
425 if matchPart == readsizeString:
427 elif matchPart[:2] == splicesizeString:
430 matchType = decodeMismatches(fields[1], matchPart)
432 rstart = int(thepos[:-2]) - 2
433 if thepos[-2] == 'F':
# Only reads actually straddling the junction (within the padded window) count.
438 if trim <= rstart <= maxBorder:
444 currentSplice = model + delimiter + spliceID + delimiter + regionStart
445 spliceID = int(spliceID)
# Split the read across the junction: lefthalf bp in the upstream exon,
# the remainder starting at the next exon's chromStart.
446 lefthalf = maxBorder - rstart
447 if lefthalf < 1 or lefthalf > maxBorder:
450 righthalf = readsize - lefthalf
451 startL = int(regionStart) + rstart
452 stopL = startL + lefthalf
453 startR = chromstarts[spliceID + 1]
454 stopR = chromstarts[spliceID + 1] + righthalf
456 readName = label + '-' + str(lineIndex) + '/' + pairID
458 readName = model + '-' + str(thepos)
460 insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
462 if index % insertSize == 0:
463 rds.insertSplices(insertList)
# Track distinct junctions for the summary line below.
468 if currentSplice not in seenSpliceList:
469 seenSpliceList.append(currentSplice)
472 if len(insertList) > 0:
473 rds.insertSplices(insertList)
# NOTE(review): 'accross' typo in this user-facing message — runtime string,
# flagged only.
477 print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList))
# --- Pass 3: multireads ----------------------------------------------------
479 infile = open(filename,'r')
480 print 'mapping multireads...'
# Continue multiread numbering after whatever the dataset already holds.
482 origReadid = rds.getMultiCount()
484 readid = int(origReadid) + 1
489 print 'starting at %d' % (readid + 1)
493 if lineIndex > maxLines:
496 fields = line.split()
500 if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
# zero:one:two = hit counts at 0, 1 and 2 mismatches (ELAND match descriptor).
503 (zero, one, two) = fields[2].split(':')
508 bestMatch = [False] * readsize
511 elif zero == 0 and one > 1:
513 elif zero == 0 and one == 0 and two > 1:
520 pos = fields[3].split(',')
521 matchDict = {0:[], 1:[], 2:[], 3:[]}
526 (front, back) = apos.split(':')
529 print "problem splitting %s" % str(apos)
535 apos = currentChr + ':' + apos
538 matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
540 matchType = int(apos[-1])
543 matchDict[matchType].append(apos)
545 matchDict[matchType] = [apos]
547 if bestMatch[matchType]:
550 # for padded reads, mapped read might have more mismatches!
551 if len(bestpos) == 0:
552 for matchType in [1, 2, 3]:
553 if len(matchDict[matchType]) > 0:
554 if len(matchDict[matchType]) > 1:
556 for arg in matchDict[matchType]:
561 bestpos = matchDict[matchType]
564 if len(bestpos) == 0 and verbose:
565 print "couldn't pick best read from line: %s" % line
573 # do not allow multireads that can also map across splices for now
576 print "throwing out multiread because of splice conflict"
584 (front, back) = apos.split(':')
589 (start, matchPart) = back.split('F')
592 (start, matchPart) = back.split('R')
595 if matchPart == readsizeString:
598 matchType = decodeMismatches(fields[1], matchPart)
600 start = int(back[:-2])
# NOTE(review): no '- 1' here, unlike the unique-read stop above — see note
# at line 271.
606 stop = int(start) + readsize
# Multiread name encodes the shared id and its multiplicity, e.g. '17x3'.
607 readName = '%dx%d' % (readid, len(bestpos))
609 readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
# Weight 1/len(bestpos) spreads one read's count evenly over its placements.
611 insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))
612 if index % insertSize == 0:
613 rds.insertMulti(insertList)
620 if len(insertList) > 0:
621 rds.insertMulti(insertList)
625 print '%d multireads' % (readid - origReadid)
# --- Optional final indexing ----------------------------------------------
628 print 'building index....'
629 rds.buildIndex(cachePages)
632 def getUniqueMatch(elandCode):
# Purpose: decode an ELAND match descriptor of the form "zero:one:two"
# (hit counts at 0, 1 and 2 mismatches) into (matchType, bestMatch), where
# bestMatch is a 4-slot boolean list indexed by mismatch count.
# NOTE(review): this extract is elided — zero/one/two are split out as
# strings but compared as ints below, so the missing lines presumably
# int()-convert them and set matchType / the bestMatch flags; TODO confirm
# against the full file.
633 (zero, one, two) = elandCode.split(':')
637 bestMatch = [False, False, False, False]
# Exactly one hit at the lowest available mismatch count marks a unique match.
641 elif zero == 0 and one == 1:
644 elif zero == 0 and one == 0 and two == 1:
650 return (matchType, bestMatch)
653 def decodeMismatches(origSeq, code):
# Purpose: expand an ELAND mismatch descriptor (`code`, runs of match-lengths
# interleaved with substituted bases) against the original read sequence into
# a comma-joined list of "<origNT><position><newNT>" tokens.
# NOTE(review): elided extract — the loop header over `code`, and the
# index/output initializations, are missing; `number` is presumably the
# matched-run length preceding each substitution and `pos` the substituted
# base. TODO confirm against the full file.
661 index += int(number) + 1
# index is 1-based here, hence the -1 when reading the original base.
662 origNT = origSeq[index - 1]
663 output.append('%s%d%s' % (origNT, index, pos))
# Python 2 idiom; string.join(list, sep) == sep.join(list).
666 return string.join(output, ',')
669 if __name__ == "__main__":