8 from array import array
9 from commoncode import getReverseComplement, isSpliceEntry
# Version stamp recorded into the dataset metadata at construction time.
currentRDSVersion = "3.0"


class ReadDatasetError(Exception):
    """Raised for ReadDataset failures, e.g. an unsupported datasetType
    on initialization.
    """
    # Fix: the class body was lost in this listing; a minimal body is
    # restored so the module parses.
    pass
22 def __init__(self, multiReadIDCount={}, sense="both"):
23 self.multiReadIDCount = multiReadIDCount
def __call__(self, read):
    """Classify one read and bump the matching counter.
    NOTE(review): the branch condition on the elided line of the original
    is missing; reconstructed as a multiplicity > 1 test - confirm.
    """
    multiplicity = self.getReadMultiplicity(read)
    if multiplicity > 1:
        # fractional weight so an N-way mapper contributes 1.0 in total
        self.multiReadCount += float(1.0 / multiplicity)
    elif isSpliceEntry(read.cigar):
        self.spliceReadCount += 1
    else:
        self.uniqReadCount += 1
def getReadMultiplicity(self, read):
    """Return how many times this read's ID was seen (1 for unique reads).
    NOTE(review): the lookup-fallback lines are missing from this listing;
    a KeyError fallback to 1 is reconstructed - confirm against history.
    """
    pairReadSuffix = getPairedReadNumberSuffix(read)
    readID = "%s%s" % (read.qname, pairReadSuffix)
    try:
        multiplicity = self.multiReadIDCount[readID]
    except KeyError:
        multiplicity = 1

    return multiplicity
def isMulti(self, read):
    """A read is a multiread when its qname plus pair suffix appears in
    the precomputed multiread ID table.
    """
    suffix = getPairedReadNumberSuffix(read)
    fullReadID = "%s%s" % (read.qname, suffix)
    return fullReadID in self.multiReadIDCount
class MaxCoordFinder(ReadCounter):
    """Fetch callback that tracks the maximum start coordinate seen,
    kept separately for unique, multi and spliced reads.
    """

    def __init__(self, multiReadIDCount=None):
        # NOTE(review): the original initializer lines are not visible in
        # this listing; zero starts and delegation to ReadCounter (which
        # supplies multiReadIDCount for isMulti) are reconstructed.
        ReadCounter.__init__(self, multiReadIDCount if multiReadIDCount is not None else {})
        self.maxUniqueStart = 0
        self.maxMultiStart = 0
        self.maxSpliceStart = 0

    def __call__(self, read):
        if self.isMulti(read):
            self.maxMultiStart = max(self.maxMultiStart, read.pos)
        elif isSpliceEntry(read.cigar):
            # Bug fix: original compared against maxMultiStart here.
            self.maxSpliceStart = max(self.maxSpliceStart, getSpliceRightStart(read.pos, read.cigar))
        else:
            # Bug fix: original compared against maxMultiStart here.
            self.maxUniqueStart = max(self.maxUniqueStart, read.pos)
71 """ Class for storing reads from experiments. Assumes that custom scripts
72 will translate incoming data into a format that can be inserted into the
73 class using the insert* methods. Default class subtype ('DNA') includes
74 tables for unique and multireads, whereas 'RNA' subtype also includes a
# Constructor: wraps a bam file plus a precomputed multiread-ID count table,
# in-memory metadata, and cached full read counts.
79 def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
80 cache=False, reportCount=True):
81 """ creates an rds datafile if initialize is set to true, otherwise
82 will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
# guard: only DNA/RNA datasets are supported when initializing
85 if initialize and datasetType not in ["DNA", "RNA"]:
86 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
89 self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
90 self.bamfile = pysam.Samfile(datafile, "rb")
# lazily computed total; NOTE(review): this attribute shares its name with the
# totalReadWeight method defined below - confirm the intended access pattern
92 self.totalReadWeight = None
94 self.metadata = {"dataType": datasetType}
# NOTE(review): the surrounding branch lines are missing from this listing;
# the two dataType assignments presumably sit on initialize/else branches
97 self.dataType = datasetType
99 self.dataType = self.metadata["dataType"]
102 self.rdsVersion = self.metadata["rdsVersion"]
104 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
106 self.fullReadCounts = self.getFullReadCount()
108 self.printRDSInfo(datafile, reportCount, initialize)
# NOTE(review): the def line of this method is missing from the listing; per
# its docstring it totals reads across every reference via bamfile.count().
112 """ return the number of reads in the bam file
113 This is not the same as the original as the original returned total weight.
116 references = self.bamfile.references
117 for reference in references:
# count accumulates across all reference sequences
118 count += self.bamfile.count(reference)
# Scan the bam file once and keep only readIDs seen more than once
# (multireads), mapped to their float occurrence count.
123 def getMultiReadIDCounts(self, samFileName, fileMode):
125 samfile = pysam.Samfile(samFileName, fileMode)
# presumably inside an except branch for a missing index - lines elided
127 print "samfile index not found"
131 for read in samfile.fetch(until_eof=True):
132 pairReadSuffix = getPairedReadNumberSuffix(read)
133 readName = "%s%s" % (read.qname, pairReadSuffix)
# increment-or-initialize; the try/except lines are elided in this listing
135 readIDCounts[readName] += 1.0
137 readIDCounts[readName] = 1.0
# drop unique reads; under Python 2 .keys() is a list snapshot, so the
# in-loop deletes are safe
139 for readID in readIDCounts.keys():
140 if int(readIDCounts[readID]) == 1:
141 del readIDCounts[readID]
# NOTE(review): this method shares its name with the instance attribute set
# in __init__ (self.totalReadWeight = None). Unless a decorator on an elided
# line turns it into a property, the attribute assignment shadows the bound
# method - confirm against history.
146 def totalReadWeight(self):
147 """ return the total weight of usable reads in the bam file.
149 if self.totalReadWeight is None:
150 total = self.fullReadCounts["uniq"]
151 total += self.fullReadCounts["multi"]
# splice weight only applies to RNA datasets
153 if self.dataType == "RNA":
154 total += self.fullReadCounts["splice"]
# cache the computed total for subsequent calls
156 self.totalReadWeight = int(total)
158 return self.totalReadWeight
162 """ cleanup copy in local cache, if present.
# Print a summary of the dataset: filename, metadata key/values, and
# optionally the read counts.
167 def printRDSInfo(self, datafile, reportCount, initialize):
169 print "INITIALIZED dataset %s" % datafile
171 print "dataset %s" % datafile
174 pnameList = self.metadata.keys()
176 for pname in pnameList:
177 print "\t" + pname + "\t" + str(self.metadata[pname])
# counts are only meaningful for an existing (non-initialized) dataset
179 if reportCount and not initialize:
180 self.printReadCounts()
182 print "default cache size is %d pages" % self.getDefaultCacheSize()
185 def printReadCounts(self):
186 ucount = self.fullReadCounts["uniq"]
187 mcount = self.fullReadCounts["multi"]
188 if self.dataType == "DNA":
189 print "\n%d unique reads and %d multireads" % (ucount, mcount)
190 elif self.dataType == "RNA":
191 scount = self.fullReadCounts["splice"]
192 print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
# The methods below are holdovers from the sqlite-backed RDS implementation;
# their bodies are elided in this listing.
195 def cacheDB(self, filename):
199 def saveCacheDB(self, filename):
207 def attachDB(self, filename, dbName):
211 def detachDB(self, dbName):
215 def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
219 def getTables(self, dbName=""):
223 def getSqlCursor(self):
227 def getMemCursor(self):
231 def getFileCursor(self):
235 def initializeTables(self, dbConnection, cache=100000):
239 def getMetadata(self, valueName=""):
# Determine the common read size from a read's CIGAR, caching the result
# in self.readsize.
243 def getReadSize(self):
244 """ This is following the same model as the original where it assumes that all
245 read have the same readsize and that this is reflected by the size of the
# computed once; the loop-exit/return lines are elided in this listing
248 if self.readsize is None:
249 bamFileIterator = self.bamfile.fetch(until_eof=True)
250 for read in bamFileIterator:
251 self.calculateReadsizeFromCigar(read.cigar)
def calculateReadsizeFromCigar(self, cigar):
    """Set self.readsize from a CIGAR tuple list: the sum of the lengths
    of M (op 0, match) and I (op 1, insertion) operations.
    """
    countedOps = (0, 1)  # CIGAR ops contributing to read length: M and I
    total = 0
    for operation, length in cigar:
        if operation in countedOps:
            total += length

    self.readsize = total
def getDefaultCacheSize(self):
    """ returns 0 as cache is going to be deprecated
    """
    # Fix: the return statement was lost in this listing; the docstring
    # pins the behavior to a constant 0.
    return 0
# List chromosomes: full bam reference names or short (3-char-stripped)
# names; sorting/return lines are elided in this listing.
268 def getChromosomes(self, fullChrom=True):
269 """ returns a sorted list of distinct chromosomes in bam file reference list.
272 results = list(self.getFullReferenceNames())
274 results = list(self.getShortReferenceNames())
def getFullReferenceNames(self):
    """Return the bam file's reference (chromosome) names as a set."""
    return {name for name in self.bamfile.references}
def getShortReferenceNames(self):
    """ returns a set of bam file reference names after removing first 3 characters.
    NOTE(review): the set initialization and return lines are missing from
    this listing and have been reconstructed.
    """
    results = set()
    for chromosome in self.bamfile.references:
        shortName = chromosome[3:]
        # skip names that become empty/whitespace after stripping the prefix
        if len(shortName.strip()) > 0:
            results.add(shortName)

    return results
# Highest read start coordinate on a chromosome, honoring the
# uniq/multi/splice selection flags; guard/return lines elided in listing.
300 def getMaxCoordinate(self, chrom, doUniqs=True,
301 doMulti=False, doSplices=False):
302 """ returns the maximum coordinate for reads on a given chromosome.
# a MaxCoordFinder callback accumulates the maxima during fetch()
305 coordFinder = MaxCoordFinder()
306 self.bamfile.fetch(reference=chrom, callback=coordFinder)
309 maxCoord = coordFinder.maxUniqueStart
312 maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
315 maxCoord = max(coordFinder.maxMultiStart, maxCoord)
# Build a dict of non-splice read rows keyed per readIDDict/withChrom flags;
# the sql construction below is dead legacy code superseded by the bam fetch
# loop that follows. Many control-flow lines are elided in this listing.
320 def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
321 flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
322 withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
323 readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
324 flagLike=False, strand='', combine5p=False):
325 """ First pass of rewrite
326 1) Leave as much in place as possible
327 2) For now treat multis just like uniqs
330 whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
332 selectQuery = "select start, sense, sum(weight)"
334 selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
336 groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
338 stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
340 stmt.append("UNION ALL")
341 stmt.append(selectQuery)
342 stmt.append("from multi")
343 stmt.append(whereQuery)
344 stmt.append(groupQuery)
346 stmt = [selectQuery, "from multi", whereQuery]
350 selectQuery = "select start, sense, weight, chrom"
353 subSelect = [selectQuery, "from uniqs", whereQuery]
355 subSelect.append("union all")
356 subSelect.append(selectQuery)
357 subSelect.append("from multi")
358 subSelect.append(whereQuery)
360 subSelect = [selectQuery, "from multi", whereQuery]
362 sqlStmt = string.join(subSelect)
364 selectQuery = "select start, sense, sum(weight)"
366 stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
367 selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
371 self.memcon.row_factory = None
373 self.dbcon.row_factory = None
375 stmt.append("order by start")
377 stmt.append("order by readID, start")
379 stmt.append("order by chrom, start")
381 sql = self.getSqlCursor()
382 sqlQuery = string.join(stmt)
383 sql.execute(sqlQuery)
386 # Edits are starting here - trying to ignore the entire sql construct
387 bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
389 if findallOptimize and chrom != "":
400 #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
# main loop: skip splices and filtered reads, then assemble one row dict per
# read with the optional fields requested by the keyword flags
403 for read in bamFileIterator:
404 pairReadSuffix = getPairedReadNumberSuffix(read)
405 readID = "%s%s" % (read.qname, pairReadSuffix)
406 if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
409 sense = getReadSense(read.is_reverse)
411 chrom = self.bamfile.getrname(read.rname)
413 chrom = self.bamfile.getrname(read.rname)[3:]
420 newrow = {"start": int(read.pos)}
# readsize is computed lazily from the first read's CIGAR
422 if self.readsize is None:
423 self.calculateReadsizeFromCigar(read.cigar)
425 newrow["stop"] = int(read.pos + self.readsize)
428 newrow["sense"] = sense
# multireads carry fractional weight 1/multiplicity
432 newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
434 newrow["weight"] = 1.0
437 #newrow["flag"] = row["flag"]
442 mismatchTag = read.opt("MD")
443 mismatches = getMismatches(mismatchTag, read.seq, sense)
447 newrow["mismatch"] = mismatches
450 newrow["readID"] = readID
453 newrow["chrom"] = chrom
456 newrow["pairID"] = pairReadSuffix[1:]
459 resultsDict[dictKey].append(newrow)
461 resultsDict[dictKey] = [newrow]
# Filter predicate for getReadsDict: True when a read should be excluded
# (over-threshold multiread, flag mismatch, missing mismatch tag, or wrong
# strand). Several branch lines are elided in this listing.
466 def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
467 readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
470 #TODO: need to pick a tag code for the erange flag - using ZF for now
471 # although it looks like the flag is really just a label on a region.
472 # since BAM can retrieve based on location if we store the regions then
473 # we can just retrieve it directly from the BAM and we do not need to
474 # flag the individual reads. The location will be sufficient.
476 if flag != read.opt("ZF"):
477 readDoesNotMeetCriteria = True
479 readDoesNotMeetCriteria = True
484 mismatchTag = read.opt("MD")
486 readDoesNotMeetCriteria = True
# strand filter only applies when an explicit '+'/'-' was requested
488 if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
489 readDoesNotMeetCriteria = True
491 return readDoesNotMeetCriteria
def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
    """Return a truthy value when a multiread must be excluded: its
    multiplicity exceeds threshold, or multireads were not requested.
    NOTE(review): the initialization/return lines are missing from this
    listing; a plain boolean result is reconstructed - confirm.
    """
    counts = self.multiReadIDCounts
    if readID not in counts:
        return False

    return counts[readID] > threshold or not doMulti
# Build a dict of spliced-read rows (startL/stopL/startR/stopR per read);
# the sql construction is dead legacy code superseded by the bam fetch loop.
# Many control-flow lines are elided in this listing.
503 def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
504 flag="", withWeight=False, withFlag=False, withMismatch=False,
505 withID=False, withChrom=False, withPairID=False, readIDDict=False,
506 splitRead=False, hasMismatch=False, flagLike=False, start=None,
507 stop=None, strand=""):
508 """ First pass of rewrite
509 1) Leave as much in place as possible
510 2) For now treat multis just like regular splices
513 whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
514 selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
515 selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
516 sql = self.getSqlCursor()
518 stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
522 # Edits are starting here - trying to ignore the entire sql construct
523 bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
# only spliced reads (N in CIGAR) are considered here
526 for read in bamFileIterator:
527 if not isSpliceEntry(read.cigar):
530 sense = getReadSense(read.is_reverse)
531 pairReadSuffix = getPairedReadNumberSuffix(read)
532 readID = "%s%s" % (read.qname, pairReadSuffix)
534 chrom = self.bamfile.getrname(read.rname)
536 chrom = self.bamfile.getrname(read.rname)[3:]
# readsize is computed lazily from the first read's CIGAR
543 if self.readsize is None:
544 self.calculateReadsizeFromCigar(read.cigar)
546 startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
547 newrow = {"startL": int(startL)}
548 newrow["stopL"] = int(stopL)
549 newrow["startR"] = int(startR)
550 newrow["stopR"] = int(stopR)
552 newrow["sense"] = sense
555 newrow["weight"] = 1.0
558 #newrow["flag"] = row["flag"]
563 mismatchTag = read.opt("MD")
564 mismatches = getMismatches(mismatchTag, read.seq, sense)
568 newrow["mismatch"] = mismatches
571 newrow["readID"] = readID
574 newrow["chrom"] = chrom
577 newrow["pairID"] = pairID
# splitRead mode: emit the left and right halves as separate row dicts
580 leftDict = newrow.copy()
581 del leftDict["startR"]
582 del leftDict["stopR"]
584 del rightDict["startL"]
585 del rightDict["stopL"]
587 resultsDict[dictKey].append(leftDict)
589 resultsDict[dictKey] = [leftDict]
591 resultsDict[dictKey].append(rightDict)
594 resultsDict[dictKey].append(newrow)
596 resultsDict[dictKey] = [newrow]
def getFullReadCount(self):
    """Tally uniq/multi/splice read counts across every chromosome.
    NOTE(review): the dict initialization and return lines are partly
    missing from this listing and have been reconstructed.
    """
    fullReadCount = {"uniq": 0, "multi": 0, "splice": 0}
    for chromosome in self.getChromosomes():
        uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
        fullReadCount["uniq"] += uniqCount
        fullReadCount["multi"] += multiCount
        fullReadCount["splice"] += spliceCount

    return fullReadCount
# Region read counts via a ReadCounter fetch; the uniqs/multi/splices guard
# lines and the combined-total return are elided in this listing.
616 def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
617 splices=False, reportCombined=True, sense="both"):
618 """ return read counts for a given region.
623 readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
626 ucount = readCounter.uniqReadCount
629 mcount = readCounter.multiReadCount
632 scount = readCounter.spliceReadCount
# reportCombined=True presumably returns the total; otherwise the tuple
635 total = ucount + mcount + scount
638 return (ucount, mcount, scount)
def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
    """Count spliced reads in the region; 'distinct' is accepted for
    interface compatibility but unused here.
    """
    counter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
    return counter.spliceReadCount
def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
    """Count unique reads in the region; 'distinct' is accepted for
    interface compatibility but unused here.
    """
    counter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
    return counter.uniqReadCount
def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """Count multireads in the region; 'distinct' is accepted for
    interface compatibility but unused here.
    """
    counter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
    return counter.multiReadCount
def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
    """Run a ReadCounter callback over a fetch of the requested region and
    return it with its counters populated.
    NOTE(review): the trailing return line is missing from this listing
    and has been reconstructed.
    """
    counter = ReadCounter(self.multiReadIDCounts, sense)
    self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=counter)
    return counter
# sqlite-era readID listing; the statement-list initialization and several
# guard lines are elided in this listing.
667 def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
672 stmt.append("select readID from uniqs")
675 stmt.append("select readID from multi")
678 stmt.append("select readID from splices")
681 selectPart = string.join(stmt, " union ")
687 limitPart = "LIMIT %d" % limit
689 sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
690 sql = self.getSqlCursor()
691 sql.execute(sqlQuery)
692 result = sql.fetchall()
# paired mode strips the /1 or /2 suffix from each readID
695 return [x[0].split("/")[0] for x in result]
697 return [x[0] for x in result]
# Collect SNP candidates per chromosome from uniq (and optionally spliced)
# read mismatch strings; each entry is [start, change_at, change_base,
# change_from]. Several guard/continue lines are elided in this listing.
700 def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
701 """ returns the uniq and spliced mismatches in a dictionary.
703 readlen = self.getReadSize()
705 hitChromList = [mischrom]
707 hitChromList = self.getChromosomes()
711 for chromosome in hitChromList:
713 print "getting mismatches from chromosome %s" % (chromosome)
715 snpDict[chromosome] = []
# spliced mismatches only apply to RNA datasets
716 if useSplices and self.dataType == "RNA":
717 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
718 spliceIDList = spliceDict.keys()
719 for spliceID in spliceIDList:
720 spliceEntry = spliceDict[spliceID][0]
721 startpos = spliceEntry["startL"]
722 lefthalf = spliceEntry["stopL"]
723 rightstart = spliceEntry["startR"]
724 sense = spliceEntry["sense"]
725 mismatches = spliceEntry["mismatch"]
726 spMismatchList = mismatches.split(",")
727 for mismatch in spMismatchList:
# mismatch strings look like "A123G": from-base, 1-based position, to-base
731 change_len = len(mismatch)
733 change_from = mismatch[0]
734 change_base = mismatch[change_len-1]
735 change_pos = int(mismatch[1:change_len-1])
# minus-strand entries are reverse-complemented and re-anchored
737 change_from = getReverseComplement(mismatch[0])
738 change_base = getReverseComplement(mismatch[change_len-1])
739 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
741 firsthalf = int(lefthalf)-int(startpos)+1
# place the change on the left or right exon half of the splice
743 if int(change_pos) <= int(firsthalf):
744 change_at = startpos + change_pos - 1
746 secondhalf = change_pos - firsthalf
747 change_at = rightstart + secondhalf
749 snpDict[chromosome].append([startpos, change_at, change_base, change_from])
751 hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
752 if chromosome not in hitDict.keys():
755 for readEntry in hitDict[chromosome]:
756 start = readEntry["start"]
757 sense = readEntry["sense"]
758 mismatches = readEntry["mismatch"]
759 mismatchList = mismatches.split(",")
760 for mismatch in mismatchList:
764 change_len = len(mismatch)
766 change_from = mismatch[0]
767 change_base = mismatch[change_len-1]
768 change_pos = int(mismatch[1:change_len-1])
770 change_from = getReverseComplement(mismatch[0])
771 change_base = getReverseComplement(mismatch[change_len-1])
772 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
774 change_at = start + change_pos - 1
775 snpDict[chromosome].append([start, change_at, change_base, change_from])
# Per-base coverage profile for one chromosome as a float array; several
# branch/guard lines (shift dict init, cstart branches, bounds checks,
# return) are elided in this listing.
780 def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
781 useSplices=False, normalizationFactor=1.0, trackStrand=False,
782 keepStrand="both", shiftValue=0):
783 """return a profile of the chromosome as an array of per-base read coverage....
784 keepStrand = 'both', 'plusOnly', or 'minusOnly'.
785 Will also shift position of unique and multireads (but not splices) if shift is a natural number
787 metadata = self.getMetadata()
789 readlen = int(metadata["readsize"])
793 dataType = metadata["dataType"]
794 scale = 1. / normalizationFactor
# per-strand shift: + reads move right, - reads move left
796 shift["+"] = int(shiftValue)
797 shift["-"] = -1 * int(shiftValue)
800 lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
802 lastNT = cstop - cstart + readlen + shift["+"]
804 chromModel = array("f",[0.] * lastNT)
805 hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
809 for readEntry in hitDict[chromosome]:
810 hstart = readEntry["start"]
811 sense = readEntry ["sense"]
812 weight = readEntry["weight"]
813 hstart = hstart - cstart + shift[sense]
814 for currentpos in range(hstart,hstart+readlen):
# trackStrand mode encodes strand as signed coverage
816 if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
817 chromModel[currentpos] += scale * weight
818 elif sense == "-" and keepStrand != "plusOnly":
819 chromModel[currentpos] -= scale * weight
824 if useSplices and dataType == "RNA":
826 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
828 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
830 if chromosome in spliceDict:
831 for spliceEntry in spliceDict[chromosome]:
832 Lstart = spliceEntry["startL"]
833 Lstop = spliceEntry["stopL"]
834 Rstart = spliceEntry["startR"]
835 Rstop = spliceEntry["stopR"]
836 rsense = spliceEntry["sense"]
837 if (Rstop - cstart) < lastNT:
838 for index in range(abs(Lstop - Lstart)):
839 currentpos = Lstart - cstart + index
840 # we only track unique splices
841 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
842 chromModel[currentpos] += scale
843 elif rsense == "-" and keepStrand != "plusOnly":
844 chromModel[currentpos] -= scale
846 for index in range(abs(Rstop - Rstart)):
847 currentpos = Rstart - cstart + index
848 # we only track unique splices
849 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
850 chromModel[currentpos] += scale
851 elif rsense == "-" and keepStrand != "plusOnly":
852 chromModel[currentpos] -= scale
def insertMetadata(self, valuesList):
    """Store each (name, value) pair from valuesList into the in-memory
    metadata dict, overwriting existing keys.
    """
    for name, value in valuesList:
        self.metadata[name] = value
def updateMetadata(self, pname, newValue, originalValue=""):
    """Set metadata[pname] to newValue. When originalValue is given, only
    update if the current value matches it (compare-and-swap style).
    NOTE(review): the else branch line is missing from this listing; an
    unconditional update is reconstructed - confirm against history.
    """
    if originalValue == "":
        self.metadata[pname] = newValue
    elif self.metadata[pname] == originalValue:
        self.metadata[pname] = newValue
# sqlite-era bulk flag update over region lists; the uniqs/multi guard lines
# and the non-sense-restricted branch are elided in this listing.
873 def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
874 """ update reads on file database in a list region of regions for a chromosome to have a new flag.
875 regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
876 sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
880 restrict = " and sense = ? "
883 self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
886 self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
# splices get separate left/right flags keyed by startL/startR
888 if self.dataType == "RNA" and splices:
889 self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
890 self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
# Set the flag field across the dataset tables; the uniqs/multi guard lines
# are elided in this listing.
896 def setFlags(self, flag, uniqs=True, multi=True, splices=True):
897 """ set the flag fields in the entire dataset.
900 self.setFlagsInDB("uniqs", flag)
903 self.setFlagsInDB("multi", flag)
# splices table only exists for RNA datasets
905 if self.dataType == "RNA" and splices:
906 self.setFlagsInDB("splices", flag)
#TODO: Secondary to setFlags()
def setFlagsInDB(self, table, flag):
    """ set the flag field for every entry in a table.
    """
    # Fix: interpolating flag into the SQL text invites quoting/injection
    # bugs; bind it as a parameter instead. The table name cannot be bound,
    # but it is only supplied internally ("uniqs"/"multi"/"splices").
    self.dbcon.execute("UPDATE %s SET flag = ?" % table, (flag,))
def resetFlags(self, uniqs=True, multi=True, splices=True):
    """Clear the flag fields across the dataset by setting them to the
    empty string - handy when rerunning an analysis from scratch.
    """
    clearValue = ""
    self.setFlags(clearValue, uniqs, multi, splices)
def reweighMultireads(self, readList):
    """Apply new weights to multi rows; readList rows are
    (weight, chrom, start, readID) tuples.
    """
    updateSQL = "UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? "
    self.dbcon.executemany(updateSQL, readList)
# Further sqlite-era holdovers; their bodies are elided in this listing.
930 def setSynchronousPragma(self, value="ON"):
934 def setDBcache(self, cache, default=False):
938 def execute(self, statement, returnResults=False):
942 def executeCommit(self, statement):
946 def buildIndex(self, cache=100000):
954 def memSync(self, chrom="", index=False):
958 def copyDBEntriesToMemory(self, dbName, whereClause=""):
962 def copySpliceDBEntriesToMemory(self, whereClause=""):
def getReadSense(reverse):
    """Map pysam's is_reverse flag onto a strand string: '-' when the read
    is reverse-complemented, '+' otherwise.
    NOTE(review): the body lines are missing from this listing; the mapping
    is reconstructed from call sites that compare the result to '+'/'-'.
    """
    if reverse:
        return "-"

    return "+"
def getSpliceRightStart(start, cigarTupleList):
    """Return the genomic start of the right exon half of a spliced read:
    the position just past the first N (op 3, skipped region) in the CIGAR.
    NOTE(review): interior lines are missing from this listing; the offset
    accumulation and return are reconstructed - confirm against history.
    """
    offset = 0
    for operation, length in cigarTupleList:
        if operation == 3:
            stopL = int(start + offset)
            startR = int(stopL + length)
            return startR
        else:
            offset += length
def getSpliceBounds(start, readsize, cigarTupleList):
    """Return (start, startR, stopL, stopR) for a spliced read: left exon
    ends at stopL, right exon spans startR..stopR.
    NOTE(review): several interior lines are missing from this listing; the
    operation handling is reconstructed - confirm against history.
    """
    offset = 0
    #TODO: check operations here - we want to make sure we are adding the correct ones
    for operation, length in cigarTupleList:
        if operation == 3:
            # N (skipped region): close the left half, open the right half
            stopL = int(start + offset)
            startR = int(stopL + length)
            offset = 0
        elif operation in (0, 1):
            # M/I operations consume read bases
            offset += length

    stopR = startR + offset
    return start, startR, stopL, stopR
def getPairedReadNumberSuffix(read):
    """Return '/1' or '/2' for properly paired reads, '' otherwise.
    NOTE(review): interior lines are missing from this listing; the suffix
    values are inferred from the pairID = pairReadSuffix[1:] usage elsewhere
    in the file - confirm against history.
    """
    if not isPairedRead(read):
        return ""

    if read.is_read1:
        return "/1"

    return "/2"
def isPairedRead(read):
    """True only for reads flagged as properly paired AND numbered as
    read1 or read2.
    """
    properlyPaired = read.is_proper_pair
    numbered = read.is_read1 or read.is_read2
    return properlyPaired and numbered
# Parse a SAM MD tag into erange's comma-joined "ref pos base" mismatch
# strings (e.g. "A123G"); several guard/continue lines (deletion skipping,
# position bounds, minus-strand branch) are elided in this listing.
1021 def getMismatches(mismatchTag, querySequence="", sense="+"):
1023 deletionMarker = "^"
# MD tag alternates match lengths with mismatched ref bases or ^deletions
1026 lengths = re.findall("\d+", mismatchTag)
1027 mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
1029 for mismatchEntry in range(len(mismatchSequences)):
1030 mismatch = mismatchSequences[mismatchEntry]
1031 position = position + int(lengths[mismatchEntry])
# deletions from the reference are detected by the leading caret
1032 if string.find(mismatch, deletionMarker) == 0:
1037 genomicNucleotide = querySequence[position]
1039 genomicNucleotide = "N"
# minus-strand mismatches are reverse-complemented
1042 mismatch = getReverseComplement(mismatch)
1043 genomicNucleotide = getReverseComplement(genomicNucleotide)
1045 erange1BasedElandCompatiblePosition = int(position + 1)
1046 output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
1051 return string.join(output, ",")