8 from array import array
9 from commoncode import getReverseComplement
# Schema/version stamp written into the metadata table on initialization
# (see the insertMetadata call in BamReadDataset.__init__).
currentRDSVersion = "3.0"
class ReadDatasetError(Exception):
    """Module-specific error raised for read-dataset usage problems,
    e.g. an unknown datasetType passed to BamReadDataset.__init__.
    """
    pass
def __init__(self, multiReadIDCount=None, sense="both"):
    """pysam fetch() callback that tallies unique, multi and spliced reads.

    multiReadIDCount maps readID -> alignment count (see
    getMultiReadIDCounts); sense is kept for caller compatibility
    (see getBamReadCounter).
    """
    # A literal {} default would be one dict shared by every instance
    # (classic mutable-default pitfall); use a None sentinel instead.
    if multiReadIDCount is None:
        multiReadIDCount = {}
    self.multiReadIDCount = multiReadIDCount
    self.sense = sense
    # Running tallies read back by getCounts()/getFullReadCount().
    self.uniqReadCount = 0
    self.multiReadCount = 0.0
    self.spliceReadCount = 0
def __call__(self, read):
    """Per-read callback: classify the read as multi, splice, or unique.

    NOTE(review): the leading `if self.isMulti(read):` guard and the final
    `else:` were elided from this fragment; the dangling elif below
    belongs to them.
    """
    multiplicity = self.getReadMultiplicity(read)
    # Multireads contribute fractional weight: 1 / alignment count.
    self.multiReadCount += float(1.0/multiplicity)
    elif isSpliceEntry(read.cigar):
        self.spliceReadCount += 1
    self.uniqReadCount += 1
def getReadMultiplicity(self, read):
    """Return the number of alignments recorded for this read's ID.

    NOTE(review): the surrounding try/except (presumably returning 1 for
    reads absent from multiReadIDCount) was elided from this fragment.
    """
    pairReadSuffix = getPairedReadNumberSuffix(read)
    readID = "%s%s" % (read.qname, pairReadSuffix)
    multiplicity = self.multiReadIDCount[readID]
def isMulti(self, read):
    """Return True when the pair-suffixed read ID is a known multiread."""
    suffix = getPairedReadNumberSuffix(read)
    fullReadID = "%s%s" % (read.qname, suffix)
    return fullReadID in self.multiReadIDCount
class MaxCoordFinder(ReadCounter):
    """pysam fetch() callback tracking the maximum start coordinate seen
    for unique, multi and spliced reads (used by getMaxCoordinate).
    """

    def __init__(self):
        # Constructed with no arguments at the call site; inherit the
        # counter defaults from ReadCounter.
        ReadCounter.__init__(self)
        self.maxUniqueStart = 0
        self.maxMultiStart = 0
        self.maxSpliceStart = 0

    def __call__(self, read):
        if self.isMulti(read):
            self.maxMultiStart = max(self.maxMultiStart, read.pos)
        elif isSpliceEntry(read.cigar):
            # BUGFIX: previously max(self.maxMultiStart, ...), which let the
            # multiread maximum leak into the splice maximum.
            self.maxSpliceStart = max(self.maxSpliceStart, getSpliceRightStart(read.pos, read.cigar))
        else:
            # BUGFIX: previously max(self.maxMultiStart, read.pos).
            self.maxUniqueStart = max(self.maxUniqueStart, read.pos)
class BamReadDataset():
    """ Class for storing reads from experiments. Assumes that custom scripts
    will translate incoming data into a format that can be inserted into the
    class using the insert* methods. Default class subtype ('DNA') includes
    tables for unique and multireads, whereas 'RNA' subtype also includes a
    splices table. (tail of original docstring reconstructed -- verify)
    """
def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
             cache=False, reportCount=True):
    """ creates an rds datafile if initialize is set to true, otherwise
    will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
    """
    if initialize and datasetType not in ["DNA", "RNA"]:
        raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")

    # Pre-scan the BAM once to learn which read IDs align multiple times.
    self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
    self.bamfile = pysam.Samfile(datafile, "rb")
    # NOTE(review): if the totalReadWeight() accessor below is a @property,
    # this attribute assignment conflicts with it -- verify.
    self.totalReadWeight = None
    self.metadata = {"dataType": datasetType}
    self.dataType = datasetType
    # NOTE(review): the if/else guards around these alternative assignments
    # were elided from this fragment; both kept verbatim.
    self.dataType = self.metadata["dataType"]
    self.rdsVersion = self.metadata["rdsVersion"]
    self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
    self.fullReadCounts = self.getFullReadCount()
    self.printRDSInfo(datafile, reportCount, initialize)
# NOTE(review): the def line for this method was lost in extraction;
# judging by the body it sums per-reference read counts over the BAM.
    """ return the number of reads in the bam file
    This is not the same as the original as the original returned total weight.
    """
    references = self.bamfile.references
    for reference in references:
        # count() tallies alignments overlapping the named reference.
        count += self.bamfile.count(reference)
def getMultiReadIDCounts(self, samFileName, fileMode):
    """Scan the whole BAM once and return {readID: alignmentCount} for reads
    aligned more than once; single-hit IDs are pruned before returning.

    NOTE(review): the try/except around opening/fetching and the final
    return were elided from this fragment.
    """
    samfile = pysam.Samfile(samFileName, fileMode)
    print "samfile index not found"
    for read in samfile.fetch(until_eof=True):
        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s" % (read.qname, pairReadSuffix)
        # NOTE(review): the if readName in readIDCounts / else guard around
        # these two alternatives was elided.
        readIDCounts[readName] += 1.0
        readIDCounts[readName] = 1.0

    # .keys() snapshot (a list in Python 2) makes deleting during the loop safe.
    for readID in readIDCounts.keys():
        if int(readIDCounts[readID]) == 1:
            del readIDCounts[readID]
def totalReadWeight(self):
    """ return the total weight of usable reads in the bam file.
    """
    # NOTE(review): likely decorated with @property in the original (the
    # decorator line is elided); if so, the cached-value assignment below
    # shadows this accessor's name -- verify against the full source.
    if self.totalReadWeight is None:
        total = self.fullReadCounts["uniq"]
        total += self.fullReadCounts["multi"]
        # Spliced reads only exist for RNA datasets.
        if self.dataType == "RNA":
            total += self.fullReadCounts["splice"]
        self.totalReadWeight = int(total)

    return self.totalReadWeight
    # NOTE(review): orphaned docstring fragment -- its def line (a cache
    # cleanup helper) was lost in extraction.
    """ cleanup copy in local cache, if present.
def printRDSInfo(self, datafile, reportCount, initialize):
    """Print a summary of the dataset: banner, metadata, optional counts.

    NOTE(review): the if initialize / else guard around the two banner
    prints was elided from this fragment.
    """
    print "INITIALIZED dataset %s" % datafile
    print "dataset %s" % datafile
    # Dump every metadata key/value pair, one per line.
    pnameList = self.metadata.keys()
    for pname in pnameList:
        print "\t" + pname + "\t" + str(self.metadata[pname])

    # Counts are only meaningful for an existing (non-fresh) dataset.
    if reportCount and not initialize:
        self.printReadCounts()

    print "default cache size is %d pages" % self.getDefaultCacheSize()
def printReadCounts(self):
    """Report unique/multi (and, for RNA, spliced) read totals to stdout."""
    uniqueTotal = self.fullReadCounts["uniq"]
    multiTotal = self.fullReadCounts["multi"]
    if self.dataType == "RNA":
        splicedTotal = self.fullReadCounts["splice"]
        print("\n%d unique reads, %d spliced reads and %d multireads" % (uniqueTotal, splicedTotal, multiTotal))
    elif self.dataType == "DNA":
        print("\n%d unique reads and %d multireads" % (uniqueTotal, multiTotal))
# ----------------------------------------------------------------------
# NOTE(review): the bodies of the following sqlite/cache plumbing helpers
# were elided from this fragment; headers preserved verbatim.
# ----------------------------------------------------------------------
def cacheDB(self, filename):

def saveCacheDB(self, filename):

def attachDB(self, filename, dbName):

def detachDB(self, dbName):

def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):

def getTables(self, dbName=""):

def getSqlCursor(self):

def getMemCursor(self):

def getFileCursor(self):

def initializeTables(self, dbConnection, cache=100000):

def getMetadata(self, valueName=""):
def getReadSize(self):
    """ This is following the same model as the original where it assumes that all
    read have the same readsize and that this is reflected by the size of the
    (remainder of original docstring elided)
    """
    # Lazily derive the read size from the first read's CIGAR.
    if self.readsize is None:
        bamFileIterator = self.bamfile.fetch(until_eof=True)
        for read in bamFileIterator:
            self.calculateReadsizeFromCigar(read.cigar)
            # NOTE(review): the loop break and the final return were elided.
def calculateReadsizeFromCigar(self, cigar):
    """Set self.readsize to the read length implied by a CIGAR tuple list,
    counting only M (op 0, match) and I (op 1, insertion) operations.
    """
    countedOperations = (0, 1)
    self.readsize = sum(length for operation, length in cigar if operation in countedOperations)
def getDefaultCacheSize(self):
    """ returns 0 as cache is going to be deprecated
    """
    # NOTE(review): the `return 0` body was elided from this fragment.
def getChromosomes(self, fullChrom=True):
    """ returns a sorted list of distinct chromosomes in bam file reference list.
    """
    # Full names ("chr1") vs 3-char-stripped short names ("1").
    # NOTE(review): the if fullChrom / else guard and the sort/return
    # were elided from this fragment.
    results = list(self.getFullReferenceNames())
    results = list(self.getShortReferenceNames())
def getFullReferenceNames(self):
    """Return the BAM reference (chromosome) names as a set."""
    referenceNames = self.bamfile.references
    return set(referenceNames)
def getShortReferenceNames(self):
    """ returns a set of bam file reference names after removing first 3 characters.
    """
    # e.g. "chr1" -> "1"; names that become empty/whitespace are skipped.
    # NOTE(review): the `results = set()` initialization and the return
    # were elided from this fragment.
    references = self.bamfile.references
    for chromosome in references:
        shortName = chromosome[3:]
        if len(shortName.strip()) > 0:
            results.add(shortName)
def getMaxCoordinate(self, chrom, doUniqs=True,
                     doMulti=False, doSplices=False):
    """ returns the maximum coordinate for reads on a given chromosome.
    """
    # Stream every read on the chromosome through the MaxCoordFinder callback.
    coordFinder = MaxCoordFinder()
    self.bamfile.fetch(reference=chrom, callback=coordFinder)
    maxCoord = coordFinder.maxUniqueStart
    # NOTE(review): the doSplices/doMulti guard lines and the return were
    # elided from this fragment.
    maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
    maxCoord = max(coordFinder.maxMultiStart, maxCoord)
def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                 flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                 withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
                 readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
                 flagLike=False, strand='', combine5p=False):
    """ First pass of rewrite
    1) Leave as much in place as possible
    2) For now treat multis just like uniqs
    """
    # ------------------------------------------------------------------
    # Legacy sqlite query construction from the RDS implementation; it is
    # superseded by the BAM loop below. Many guarding if/elif/else lines
    # were elided from this fragment -- NOTE(review).
    # ------------------------------------------------------------------
    whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
    selectQuery = "select start, sense, sum(weight)"
    selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
    groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
    stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
    stmt.append("UNION ALL")
    stmt.append(selectQuery)
    stmt.append("from multi")
    stmt.append(whereQuery)
    stmt.append(groupQuery)
    stmt = [selectQuery, "from multi", whereQuery]
    selectQuery = "select start, sense, weight, chrom"
    subSelect = [selectQuery, "from uniqs", whereQuery]
    subSelect.append("union all")
    subSelect.append(selectQuery)
    subSelect.append("from multi")
    subSelect.append(whereQuery)
    subSelect = [selectQuery, "from multi", whereQuery]
    sqlStmt = string.join(subSelect)
    selectQuery = "select start, sense, sum(weight)"
    stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
            selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
    self.memcon.row_factory = None
    self.dbcon.row_factory = None
    stmt.append("order by start")
    stmt.append("order by readID, start")
    stmt.append("order by chrom, start")
    sql = self.getSqlCursor()
    sqlQuery = string.join(stmt)
    sql.execute(sqlQuery)

    # Edits are starting here - trying to ignore the entire sql construct
    bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
    if findallOptimize and chrom != "":
        #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
    for read in bamFileIterator:
        pairReadSuffix = getPairedReadNumberSuffix(read)
        readID = "%s%s" % (read.qname, pairReadSuffix)
        # Skip spliced alignments and anything failing the multi/flag/strand
        # criteria. NOTE(review): the `continue` body is elided.
        if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
        sense = getReadSense(read.is_reverse)
        # Full vs 3-char-stripped chromosome name (fullChrom guard elided).
        chrom = self.bamfile.getrname(read.rname)
        chrom = self.bamfile.getrname(read.rname)[3:]
        newrow = {"start": int(read.pos)}
        if self.readsize is None:
            self.calculateReadsizeFromCigar(read.cigar)
        newrow["stop"] = int(read.pos + self.readsize)
        newrow["sense"] = sense
        # Multireads carry fractional weight 1/alignment-count; uniques 1.0
        # (the selecting if/else around these two lines was elided).
        newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
        newrow["weight"] = 1.0
        #newrow["flag"] = row["flag"]
        mismatchTag = read.opt("MD")
        mismatches = getMismatches(mismatchTag, read.seq, sense)
        newrow["mismatch"] = mismatches
        newrow["readID"] = readID
        newrow["chrom"] = chrom
        newrow["pairID"] = pairID
        # Accumulate rows keyed by readID or chromosome (key selection and
        # the existence check around append-vs-create were elided).
        resultsDict[dictKey].append(newrow)
        resultsDict[dictKey] = [newrow]
def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
    """Return True when a read fails the multi/flag/mismatch/strand filters.

    NOTE(review): several guard lines (flag/readLike/hasMismatch branches)
    were elided from this fragment; orphaned assignments kept verbatim.
    """
    readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
    #TODO: need to pick a tag code for the erange flag - using ZF for now
    # although it looks like the flag is really just a label on a region.
    # since BAM can retrieve based on location if we store the regions then
    # we can just retrieve it directly from the BAM and we do not need to
    # flag the individual reads. The location will be sufficient.
    if flag != read.opt("ZF"):
        readDoesNotMeetCriteria = True
    readDoesNotMeetCriteria = True
    mismatchTag = read.opt("MD")
    readDoesNotMeetCriteria = True
    # Strand restriction: reject reads on the opposite strand.
    if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
        readDoesNotMeetCriteria = True

    return readDoesNotMeetCriteria
def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
    # Reject reads aligning more than `threshold` times, or any multiread
    # when multireads were not requested.
    # NOTE(review): the reject-flag initialization/assignment and the
    # return were elided from this fragment.
    if readID in self.multiReadIDCounts.keys():
        if self.multiReadIDCounts[readID] > threshold or not doMulti:
def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
                   flag="", withWeight=False, withFlag=False, withMismatch=False,
                   withID=False, withChrom=False, withPairID=False, readIDDict=False,
                   splitRead=False, hasMismatch=False, flagLike=False, start=None,
                   stop=None, strand=""):
    """ First pass of rewrite
    1) Leave as much in place as possible
    2) For now treat multis just like regular splices
    """
    # Legacy sqlite construction (superseded by the BAM loop below).
    # NOTE(review): many guard lines were elided from this fragment.
    whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
    selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
    selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
    sql = self.getSqlCursor()
    stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)

    # Edits are starting here - trying to ignore the entire sql construct
    bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
    for read in bamFileIterator:
        # Only spliced alignments matter here (the `continue` body is elided).
        if not isSpliceEntry(read.cigar):
        sense = getReadSense(read.is_reverse)
        pairReadSuffix = getPairedReadNumberSuffix(read)
        readID = "%s%s" % (read.qname, pairReadSuffix)
        # Full vs 3-char-stripped chromosome name (fullChrom guard elided).
        chrom = self.bamfile.getrname(read.rname)
        chrom = self.bamfile.getrname(read.rname)[3:]
        if self.readsize is None:
            self.calculateReadsizeFromCigar(read.cigar)
        startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
        newrow = {"startL": int(startL)}
        newrow["stopL"] = int(stopL)
        newrow["startR"] = int(startR)
        newrow["stopR"] = int(stopR)
        newrow["sense"] = sense
        newrow["weight"] = 1.0
        #newrow["flag"] = row["flag"]
        mismatchTag = read.opt("MD")
        mismatches = getMismatches(mismatchTag, read.seq, sense)
        newrow["mismatch"] = mismatches
        newrow["readID"] = readID
        newrow["chrom"] = chrom
        newrow["pairID"] = pairID
        # splitRead=True emits the two exon halves as separate rows
        # (the selecting if/else structure was elided).
        leftDict = newrow.copy()
        del leftDict["startR"]
        del leftDict["stopR"]
        del rightDict["startL"]
        del rightDict["stopL"]
        resultsDict[dictKey].append(leftDict)
        resultsDict[dictKey] = [leftDict]
        resultsDict[dictKey].append(rightDict)
        resultsDict[dictKey].append(newrow)
        resultsDict[dictKey] = [newrow]
def getFullReadCount(self):
    """Accumulate uniq/multi/splice totals across all chromosomes."""
    fullReadCount = {"uniq": 0,
    # NOTE(review): the remaining dict entries ("multi", "splice") and the
    # closing brace were elided from this fragment.
    for chromosome in self.getChromosomes():
        uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
        fullReadCount["uniq"] += uniqCount
        fullReadCount["multi"] += multiCount
        fullReadCount["splice"] += spliceCount
    # NOTE(review): `return fullReadCount` elided.
def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
              splices=False, reportCombined=True, sense="both"):
    """ return read counts for a given region.
    """
    readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
    # NOTE(review): the uniqs/multi/splices guards around these reads and
    # the reportCombined branch/returns were elided from this fragment.
    ucount = readCounter.uniqReadCount
    mcount = readCounter.multiReadCount
    scount = readCounter.spliceReadCount
    total = ucount + mcount + scount
    return (ucount, mcount, scount)
def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
    """Return the spliced-read count for a region; restrict selects strand."""
    regionCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
    return regionCounter.spliceReadCount
def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
    """Return the unique-read count for a region; restrict selects strand."""
    regionCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
    return regionCounter.uniqReadCount
def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """Return the multiread count for a region; restrict selects strand."""
    regionCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
    return regionCounter.multiReadCount
def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
    # Build a ReadCounter callback and stream the region's reads through it.
    readCounter = ReadCounter(self.multiReadIDCounts, sense)
    self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
    # NOTE(review): `return readCounter` elided from this fragment.
def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
    """Legacy sqlite path: union readID across the requested tables.

    NOTE(review): the stmt initialization, the table-selection guards, the
    limit guard and the paired/unpaired return selection were elided.
    """
    stmt.append("select readID from uniqs")
    stmt.append("select readID from multi")
    stmt.append("select readID from splices")
    selectPart = string.join(stmt, " union ")
    limitPart = "LIMIT %d" % limit
    sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
    sql = self.getSqlCursor()
    sql.execute(sqlQuery)
    result = sql.fetchall()
    # paired=True strips the "/1"-style mate suffix from each ID.
    return [x[0].split("/")[0] for x in result]
    return [x[0] for x in result]
def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
    """ returns the uniq and spliced mismatches in a dictionary.

    NOTE(review): several guard lines (single-chromosome vs all, verbose,
    sense +/-, splice-half selection) and the final return were elided
    from this fragment.
    """
    readlen = self.getReadSize()
    # Either the single requested chromosome or all of them.
    hitChromList = [mischrom]
    hitChromList = self.getChromosomes()
    for chromosome in hitChromList:
        print "getting mismatches from chromosome %s" % (chromosome)
        snpDict[chromosome] = []
        if useSplices and self.dataType == "RNA":
            spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
            spliceIDList = spliceDict.keys()
            for spliceID in spliceIDList:
                spliceEntry = spliceDict[spliceID][0]
                startpos = spliceEntry["startL"]
                lefthalf = spliceEntry["stopL"]
                rightstart = spliceEntry["startR"]
                sense = spliceEntry["sense"]
                mismatches = spliceEntry["mismatch"]
                spMismatchList = mismatches.split(",")
                for mismatch in spMismatchList:
                    # Mismatch entries look like "<from><pos><to>".
                    change_len = len(mismatch)
                    change_from = mismatch[0]
                    change_base = mismatch[change_len-1]
                    change_pos = int(mismatch[1:change_len-1])
                    # Reverse-strand variant of the three assignments above
                    # (the selecting if sense guard was elided).
                    change_from = getReverseComplement(mismatch[0])
                    change_base = getReverseComplement(mismatch[change_len-1])
                    change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                    # Map the read-relative position onto the left or right
                    # exon half of the splice.
                    firsthalf = int(lefthalf)-int(startpos)+1
                    if int(change_pos) <= int(firsthalf):
                        change_at = startpos + change_pos - 1
                    secondhalf = change_pos - firsthalf
                    change_at = rightstart + secondhalf
                    snpDict[chromosome].append([startpos, change_at, change_base, change_from])

        hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
        if chromosome not in hitDict.keys():
        for readEntry in hitDict[chromosome]:
            start = readEntry["start"]
            sense = readEntry["sense"]
            mismatches = readEntry["mismatch"]
            mismatchList = mismatches.split(",")
            for mismatch in mismatchList:
                change_len = len(mismatch)
                change_from = mismatch[0]
                change_base = mismatch[change_len-1]
                change_pos = int(mismatch[1:change_len-1])
                # Reverse-strand variant (sense guard elided).
                change_from = getReverseComplement(mismatch[0])
                change_base = getReverseComplement(mismatch[change_len-1])
                change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                change_at = start + change_pos - 1
                snpDict[chromosome].append([start, change_at, change_base, change_from])
def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
                    useSplices=False, normalizationFactor=1.0, trackStrand=False,
                    keepStrand="both", shiftValue=0):
    """return a profile of the chromosome as an array of per-base read coverage....
    keepStrand = 'both', 'plusOnly', or 'minusOnly'.
    Will also shift position of unique and multireads (but not splices) if shift is a natural number

    NOTE(review): several guard lines (whole-chromosome vs windowed lastNT,
    shift-dict initialization, windowed vs full splice fetch, bounds checks)
    and the final return were elided from this fragment.
    """
    metadata = self.getMetadata()
    readlen = int(metadata["readsize"])
    dataType = metadata["dataType"]
    scale = 1. / normalizationFactor
    # Strand-specific shift: forward reads move +shift, reverse reads -shift.
    shift["+"] = int(shiftValue)
    shift["-"] = -1 * int(shiftValue)
    # Size of the profiled window (whole-chromosome vs cstart..cstop).
    lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
    lastNT = cstop - cstart + readlen + shift["+"]
    # Per-base float accumulator over the profiled window.
    chromModel = array("f",[0.] * lastNT)
    hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
    for readEntry in hitDict[chromosome]:
        hstart = readEntry["start"]
        sense = readEntry ["sense"]
        weight = readEntry["weight"]
        hstart = hstart - cstart + shift[sense]
        for currentpos in range(hstart,hstart+readlen):
            # Stranded tracking: plus-strand adds, minus-strand subtracts.
            if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
                chromModel[currentpos] += scale * weight
            elif sense == "-" and keepStrand != "plusOnly":
                chromModel[currentpos] -= scale * weight

    if useSplices and dataType == "RNA":
        # Windowed vs whole-chromosome splice fetch (selecting guard elided).
        spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
        spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
        if chromosome in spliceDict:
            for spliceEntry in spliceDict[chromosome]:
                Lstart = spliceEntry["startL"]
                Lstop = spliceEntry["stopL"]
                Rstart = spliceEntry["startR"]
                Rstop = spliceEntry["stopR"]
                rsense = spliceEntry["sense"]
                if (Rstop - cstart) < lastNT:
                    # Left exon half of the splice.
                    for index in range(abs(Lstop - Lstart)):
                        currentpos = Lstart - cstart + index
                        # we only track unique splices
                        if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                            chromModel[currentpos] += scale
                        elif rsense == "-" and keepStrand != "plusOnly":
                            chromModel[currentpos] -= scale

                    # Right exon half of the splice.
                    for index in range(abs(Rstop - Rstart)):
                        currentpos = Rstart - cstart + index
                        # we only track unique splices
                        if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                            chromModel[currentpos] += scale
                        elif rsense == "-" and keepStrand != "plusOnly":
                            chromModel[currentpos] -= scale
def insertMetadata(self, valuesList):
    """Store each (name, value) pair from valuesList in the metadata dict."""
    self.metadata.update(valuesList)
def updateMetadata(self, pname, newValue, originalValue=""):
    # With originalValue given, only overwrite when the current value matches
    # (compare-and-swap style); otherwise overwrite unconditionally.
    # NOTE(review): the `else:` line before the final assignment was elided
    # from this fragment; lines kept verbatim.
    if originalValue != "":
        if self.metadata[pname] == originalValue:
            self.metadata[pname] = newValue
    self.metadata[pname] = newValue
def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
    """ update reads on file database in a list region of regions for a chromosome to have a new flag.
    regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
    sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
    """
    # Optional strand restriction appended to each UPDATE.
    # NOTE(review): the sense/uniqs/multi guard lines were elided.
    restrict = " and sense = ? "
    self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
    self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
    if self.dataType == "RNA" and splices:
        # Splices are flagged on both their left (L:) and right (R:) halves.
        self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
        self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
def setFlags(self, flag, uniqs=True, multi=True, splices=True):
    """ set the flag fields in the entire dataset.
    """
    # NOTE(review): the if uniqs / if multi guard lines were elided.
    self.setFlagsInDB("uniqs", flag)
    self.setFlagsInDB("multi", flag)
    if self.dataType == "RNA" and splices:
        self.setFlagsInDB("splices", flag)
#TODO: Secondary to setFlags()
def setFlagsInDB(self, table, flag):
    """ set the flag field for every entry in a table.
    """
    # NOTE(review): table/flag are interpolated directly into the SQL
    # string -- safe only while callers pass internal constants; never
    # expose to untrusted input (SQL injection risk).
    self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
def resetFlags(self, uniqs=True, multi=True, splices=True):
    """Clear the flag fields across the dataset; useful when rerunning an
    analysis from scratch.
    """
    clearValue = ""
    self.setFlags(clearValue, uniqs, multi, splices)
def reweighMultireads(self, readList):
    """Bulk-update multiread weights keyed by (chrom, start, readID)."""
    updateSql = "UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? "
    self.dbcon.executemany(updateSql, readList)
# ----------------------------------------------------------------------
# NOTE(review): bodies of the remaining sqlite maintenance helpers
# (pragmas, cache sizing, raw execute/commit, index build, in-memory
# sync/copy) were elided from this fragment; headers preserved verbatim.
# ----------------------------------------------------------------------
def setSynchronousPragma(self, value="ON"):

def setDBcache(self, cache, default=False):

def execute(self, statement, returnResults=False):

def executeCommit(self, statement):

def buildIndex(self, cache=100000):

def memSync(self, chrom="", index=False):

def copyDBEntriesToMemory(self, dbName, whereClause=""):

def copySpliceDBEntriesToMemory(self, whereClause=""):
# NOTE(review): body elided; call sites compare the result against "+"/"-"
# (see readDoesNotMeetCriteria / getChromProfile), so this presumably maps
# pysam's is_reverse flag to a strand string -- verify.
def getReadSense(reverse):
def isSpliceEntry(cigarTupleList):
    # Scans the CIGAR tuples for a splice indicator -- presumably op 3
    # (N, skipped region) -- TODO confirm; loop body and return were
    # elided from this fragment.
    for operation,length in cigarTupleList:
def getSpliceRightStart(start, cigarTupleList):
    # Walks the CIGAR accumulating an offset to the splice gap; the right
    # exon then starts just past the gap. NOTE(review): the offset
    # initialization/accumulation, guards and return were elided.
    for operation,length in cigarTupleList:
        stopL = int(start + offset)
        startR = int(stopL + length)
def getSpliceBounds(start, readsize, cigarTupleList):
    #TODO: check operations here - we want to make sure we are adding the correct ones
    # NOTE(review): offset handling and the operation guards were elided
    # from this fragment.
    for operation,length in cigarTupleList:
        stopL = int(start + offset)
        startR = int(stopL + length)
        stopR = startR + offset

    # Returned in (left start, right start, left stop, right stop) order.
    return start, startR, stopL, stopR
def getPairedReadNumberSuffix(read):
    # Returns "" for unpaired reads; the paired branch (a "/"-prefixed mate
    # number, split back off in getReadIDs) was elided from this fragment.
    if not isPairedRead(read):
def isPairedRead(read):
    """Return True for a properly paired read flagged as mate 1 or mate 2."""
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
def getMismatches(mismatchTag, querySequence="", sense="+"):
    """Convert a BAM MD tag into comma-joined eland-style entries of the
    form "<read base><1-based position><genomic base>".

    NOTE(review): several lines (output/position initialization, the
    deletion skip, the try/except around sequence lookup, and the
    reverse-strand guard) were elided from this fragment.
    """
    deletionMarker = "^"
    # Parse run lengths and the mismatched/deleted sequences out of the MD tag.
    lengths = re.findall("\d+", mismatchTag)
    mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        position = position + int(lengths[mismatchEntry])
        # Deletions are marked "^ACGT..." (handling of that case elided).
        if string.find(mismatch, deletionMarker) == 0:
            genomicNucleotide = querySequence[position]
            genomicNucleotide = "N"
        # Reverse-strand reads report the reverse complement (guard elided).
        mismatch = getReverseComplement(mismatch)
        genomicNucleotide = getReverseComplement(genomicNucleotide)
        erange1BasedElandCompatiblePosition = int(position + 1)
        output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))

    return string.join(output, ",")