11 import sqlite3 as sqlite
12 from time import strftime
13 from array import array
14 from collections import defaultdict
# Module version constants and scratch-directory configuration.
# NOTE(review): `environ` and `tempfile` are used here but their imports are in
# elided lines above — presumably `from os import environ` and `import tempfile`.
16 commoncodeVersion = 5.5
17 currentRDSversion = 1.1
# Honor the CISTEMATIC_TEMP environment variable as the temp-file location.
19 if environ.get("CISTEMATIC_TEMP"):
20 cisTemp = environ.get("CISTEMATIC_TEMP")
24 tempfile.tempdir = cisTemp
# Return the complement of a single nucleotide base (body elided in this
# excerpt — presumably A<->T, C<->G mapping; TODO confirm against full file).
27 def getReverseComplement(base):
# Tally occurrences of each item in listToCheck using a defaultdict counter.
# The return statement is elided; presumably returns the (item, count) pairs.
38 def countDuplicatesInList(listToCheck):
39 tally = defaultdict(int)
40 for item in listToCheck:
46 def writeLog(logFile, messenger, message):
47 """ create a log file to write a message from a messenger or append to an existing file.
# Open for read first (probe for existence, per the elided try/except),
# create with "w" if missing, otherwise append with "a".
50 logfile = open(logFile)
52 logfile = open(logFile, "w")
54 logfile = open(logFile, "a")
# Each entry: "YYYY-MM-DD HH:MM:SS: [messenger] message".
56 logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
# File-reading wrapper: loads regionfilename and delegates all merging logic
# to getMergedRegionsFromList with the same parameters.
60 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
61                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
62                      doMerge=True, keepPeak=False, returnTop=0):
64 """ returns a list of merged overlapping regions;
65 can optionally filter regions that have a scoreField fewer than minHits.
66 Can also optionally return the label of each region, as well as the
67 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
68 Can return the top regions based on score if higher than minHits.
70 infile = open(regionfilename)
71 lines = infile.readlines()
72 regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
73                                        fullChrom, chromField, scoreField, pad, compact,
74                                        doMerge, keepPeak, returnTop)
81 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
82                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
83                      doMerge=True, keepPeak=False, returnTop=0):
84 """ returns a list of merged overlapping regions;
85 can optionally filter regions that have a scoreField fewer than minHits.
86 Can also optionally return the label of each region, as well as the
87 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
88 Can return the top regions based on score if higher than minHits.
# First pass (only when returnTop is a positive count smaller than the input):
# collect all scores so minHits can be raised to keep just the top N regions.
93 if 0 < returnTop < len(regionList):
95 for regionEntry in regionList:
# Header lines start with "#"; they may declare optional pvalue/readShift
# columns that offset the peak fields at the end of each data row.
96 if regionEntry[0] == "#":
97 if "pvalue" in regionEntry:
100 if "readShift" in regionEntry:
105 fields = regionEntry.strip().split("\t")
106 hits = float(fields[scoreField].strip())
# Negative index selects the Nth-highest score as the new threshold.
110 returnTop = -1 * returnTop
111 minScore = scores[returnTop]
112 if minScore > minHits:
116 chromField = int(chromField)
118 #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
119 # exits at the first merge. This is not a problem when the input is sorted by start position, but in
120 # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
121 # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
122 # be merged with A but that will exit the loop and the merge with C will be missed.
# Main pass: parse each entry, then either merge it into an existing region
# on the same chromosome or append it as a new region.
123 for regionEntry in regionList:
124 if regionEntry[0] == "#":
125 if "pvalue" in regionEntry:
128 if "readShift" in regionEntry:
133 fields = regionEntry.strip().split("\t")
136 hits = float(fields[scoreField].strip())
137 except (IndexError, ValueError):
# compact format: a single "chrom:start-stop" field instead of 3 columns.
144 (chrom, pos) = fields[chromField].split(":")
145 (front, back) = pos.split("-")
149 label = string.join(fields[:chromField],"\t")
150 chrom = fields[chromField]
# pad widens each region symmetrically before overlap testing.
151 start = int(fields[chromField + 1]) - pad
152 stop = int(fields[chromField + 2]) + pad
156 start = int(fields[2]) - pad
157 stop = int(fields[3]) + pad
162 length = abs(stop - start)
# Peak position/height are the last two fields, before any pvalue/shift cols.
164 peakPos = int(fields[-2 - hasPvalue - hasShift])
165 peakHeight = float(fields[-1 - hasPvalue - hasShift])
167 if chrom not in regions:
# Merge step: scan prior regions on this chromosome; tuple shape depends on
# the keepLabel/keepPeak flags (see TODO above for the early-exit limitation).
172 if doMerge and len(regions[chrom]) > 0:
173 for index in range(len(regions[chrom])):
174 if keepLabel and keepPeak:
175 (rlabel, rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
177 (rlabel, rstart, rstop, rlen) = regions[chrom][index]
179 (rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
181 (rstart, rstop, rlen) = regions[chrom][index]
183 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
190 rlen = abs(rstop - rstart)
# keep the taller of the two peaks when merging.
192 if peakHeight > rpeakHeight:
193 rpeakHeight = peakHeight
196 if keepLabel and keepPeak:
197 regions[chrom][index] = (label, rstart, rstop, rlen, rpeakPos, rpeakHeight)
199 regions[chrom][index] = (label, rstart, rstop, rlen)
201 regions[chrom][index] = (rstart, rstop, rlen, rpeakPos, rpeakHeight)
203 regions[chrom][index] = (rstart, rstop, rlen)
# No merge happened: append as a new region in the flag-appropriate shape.
210 if keepLabel and keepPeak:
211 regions[chrom].append((label, start, stop, length, peakPos, peakHeight))
213 regions[chrom].append((label, start, stop, length))
215 regions[chrom].append((start, stop, length, peakPos, peakHeight))
217 regions[chrom].append((start, stop, length))
221 if verbose and (count % 100000 == 0):
# Final pass: sort each chromosome's regions by start position. Labeled
# tuples sort on element 1 (start); unlabeled tuples sort naturally.
225 for chrom in regions:
226 regionCount += len(regions[chrom])
228 regions[chrom].sort(cmp=lambda x,y:cmp(x[1], y[1]))
230 regions[chrom].sort()
233 print "merged %d times" % mergeCount
234 print "returning %d regions" % regionCount
# True if [start, stop] and [rstart, rstop] overlap at any coordinate.
# Both intervals are normalized (swapped) first so start <= stop.
239 def regionsOverlap(start, stop, rstart, rstop):
241 (start, stop) = (stop, start)
244 (rstart, rstop) = (rstop, rstart)
# Overlap iff either endpoint of one interval lies inside the other.
246 return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
# True if the gap between the two (normalized) intervals is at most maxDist,
# measured between the facing endpoints.
249 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
251 (start, stop) = (stop, start)
254 (rstart, rstop) = (rstop, rstart)
256 return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
259 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
260              shift=0, returnShift=False, maxshift=75):
261 """ find the peak in a list of reads (hitlist) in a region
262 of a given length and absolute start point. returns a
263 list of peaks, the number of hits, a triangular-smoothed
264 version of hitlist, and the number of reads that are
265 forward (plus) sense.
266 If doWeight is True, weight the reads accordingly.
267 If leftPlus is True, return the number of plus reads left of
268 the peak, taken to be the first TopPos position.
# Per-base coverage and its smoothed counterpart, as float arrays of `length`.
271 seqArray = array("f", [0.] * length)
272 smoothArray = array("f", [0.] * length)
# When requested (elided condition), pick the strand shift that best aligns
# plus/minus reads before accumulating coverage.
277 shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
279 # once we have the best shift, compute seqArray
# read[0] is the absolute read start; convert to region-relative position.
281 currentpos = read[0] - start
# skip reads entirely outside the region.
287 if (currentpos < 1 - readlen) or (currentpos >= length):
298 regionArray.append(read)
# clip the portion of the read hanging off the left edge.
300 while currentpos < 0:
304 while hitIndex < readlen and currentpos < length:
305 seqArray[currentpos] += weight
312 # implementing a triangular smooth
313 for pos in range(2,length -2):
314 smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
# Find every position attaining the maximum smoothed coverage.
318 for currentpos in xrange(length):
319 if topNucleotide < smoothArray[currentpos]:
320 topNucleotide = smoothArray[currentpos]
321 topPos = [currentpos]
322 elif topNucleotide == smoothArray[currentpos]:
323 topPos.append(currentpos)
# leftPlus accounting: count plus-sense reads at or left of the first peak.
328 for read in regionArray:
334 currentPos = read[0] - start
335 if currentPos <= maxPos and read[1] == "+":
336 numLeftPlus += weight
# Return-tuple shape depends on leftPlus/returnShift flags (elided branches).
339 return (topPos, numHits, smoothArray, numPlus, numLeftPlus, shift)
341 return (topPos, numHits, smoothArray, numPlus, numLeftPlus)
344 return (topPos, numHits, smoothArray, numPlus, shift)
346 return (topPos, numHits, smoothArray, numPlus)
# Try every shift in [0, maxShift]; plus reads are shifted right and minus
# reads left, plus reads add weight and minus reads subtract it, and the
# shift minimizing the summed absolute imbalance wins.
349 def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
# sentinel "infinity" for the running minimum score.
351 lowestScore = 20000000000
352 for testShift in xrange(maxShift + 1):
353 shiftArray = array("f", [0.] * length)
355 currentpos = read[0] - start
357 currentpos += testShift
359 currentpos -= testShift
# ignore positions that fall outside the region after shifting.
361 if (currentpos < 1) or (currentpos >= length):
370 shiftArray[currentpos] += weight
372 shiftArray[currentpos] -= weight
# score = total residual strand imbalance; lower is better.
375 for score in shiftArray:
376 currentScore += abs(score)
379 if currentScore < lowestScore:
380 bestShift = testShift
381 lowestScore = currentScore
# NOTE(review): mutable default arguments ({} and []) — safe only if callers
# never mutate them; left untouched in this doc-only pass.
386 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
387                            restrictList=[], regionComplement=False, maxStop=250000000):
388 """ return a dictionary of cistematic gene features. Requires
389 cistematic, obviously. Can filter-out pseudogenes. Will use
390 additional regions dict to supplement gene models, if available.
391 Can restrict output to a list of GIDs.
392 If regionComplement is set to true, returns the regions *outside* of the
393 calculated boundaries, which is useful for retrieving intronic and
394 intergenic regions. maxStop is simply used to define the uppermost
395 boundary of the complement region.
397 featuresDict = genomeObject.getallGeneFeatures()
399 if len(restrictList) > 0:
# Fold user-supplied regions into the feature dict as "custom" features,
# inheriting the sense of an existing entry when one is present.
402 if len(additionalRegionsDict) > 0:
404 for chrom in additionalRegionsDict:
405 for (label, start, stop, length) in additionalRegionsDict[chrom]:
406 if label not in sortList:
407 sortList.append(label)
409 if label not in featuresDict:
410 featuresDict[label] = []
413 sense = featuresDict[label][0][-1]
415 featuresDict[label].append(("custom", chrom, start, stop, sense))
# re-sort features of each supplemented gene by start coordinate (index 2).
418 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
420 featuresByChromDict = {}
421 for gid in featuresDict:
422 if restrictGID and gid not in restrictList:
425 featureList = featuresDict[gid]
# Deduplicate / coalesce overlapping features for this gene into
# newFeatureList, dropping features fully contained in another.
428 for (ftype, chrom, start, stop, sense) in featureList:
429 if ftype == "PSEUDO":
432 if (start, stop, ftype) not in newFeatureList:
435 for (fstart, fstop, ftype2) in newFeatureList:
436 if start >= fstart and stop <= fstop:
439 if start < fstart and stop > fstop:
440 containedList.append((fstart, fstop))
442 if len(containedList) > 0:
445 for (fstart, fstop, ftype2) in newFeatureList:
446 if (fstart, fstop) not in containedList:
447 newFList.append((fstart, fstop, ftype2))
448 if start >= fstart and stop <= fstop:
451 newFeatureList = newFList
453 newFeatureList.append((start, stop, ftype))
455 if ignorePseudo and isPseudo:
458 if chrom not in featuresByChromDict:
459 featuresByChromDict[chrom] = []
461 for (start, stop, ftype) in newFeatureList:
462 featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
464 for chrom in featuresByChromDict:
465 featuresByChromDict[chrom].sort()
# Optionally build the complement: the gaps between sorted features, padded
# out to maxStop, labeled nonExon0..N with sense "F".
468 complementByChromDict = {}
470 for chrom in featuresByChromDict:
471 complementByChromDict[chrom] = []
472 listLength = len(featuresByChromDict[chrom])
475 for index in range(listLength):
476 currentStop = featuresByChromDict[chrom][index][0]
478 if currentStart < currentStop:
479 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
481 currentStart = featuresByChromDict[chrom][index][1]
483 currentStop = maxStop
484 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
486 return (featuresByChromDict, complementByChromDict)
488 return featuresByChromDict
# NOTE(review): mutable default argument ({}) — safe only if callers never
# mutate it. Integer division (Python 2 `/`) is used throughout for midpoints.
491 def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
492                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
493                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
494 """ return a dictionary of gene loci. Can be used to retrieve additional
495 sequence upstream or downstream of gene, up to the next gene. Requires
496 cistematic, obviously.
497 Can filter-out pseudogenes and use additional regions outside of existing
498 gene models. Use upstreamSpanTSS to overlap half of the upstream region
500 If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
501 lengthCDS < 0bp, return only the last X bp from CDS.
503 locusByChromDict = {}
# Reject mutually-inconsistent parameter combinations up front.
504 if upstream == 0 and downstream == 0 and not useCDS:
505 print "getLocusByChromDict: asked for no sequence - returning empty dict"
506 return locusByChromDict
507 elif upstream > 0 and downstream > 0 and not useCDS:
508 print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
509 return locusByChromDict
510 elif lengthCDS != 0 and not useCDS:
511 print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
512 return locusByChromDict
513 elif upstreamSpanTSS and lengthCDS != 0:
514 print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
515 return locusByChromDict
516 elif lengthCDS > 0 and downstream > 0:
517 print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
518 return locusByChromDict
519 elif lengthCDS < 0 and upstream > 0:
520 print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
521 return locusByChromDict
523 genome = genomeObject.genome
524 featuresDict = genomeObject.getallGeneFeatures()
# Fold user-supplied regions into the feature dict as "custom" features
# (same supplementation scheme as getFeaturesByChromDict).
525 if len(additionalRegionsDict) > 0:
527 for chrom in additionalRegionsDict:
528 for (label, start, stop, length) in additionalRegionsDict[chrom]:
529 if label not in sortList:
530 sortList.append(label)
532 if label not in featuresDict:
533 featuresDict[label] = []
536 sense = featuresDict[label][0][-1]
538 featuresDict[label].append(("custom", chrom, start, stop, sense))
541 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
543 for gid in featuresDict:
544 featureList = featuresDict[gid]
546 for (ftype, chrom, start, stop, sense) in featureList:
547 newFeatureList.append((start, stop))
549 if ignorePseudo and ftype == "PSEUDO":
552 newFeatureList.sort()
# Gene extent: first feature start to last feature stop after sorting.
554 sense = featureList[0][-1]
555 gstart = newFeatureList[0][0]
556 gstop = newFeatureList[-1][1]
557 glen = abs(gstart - gstop)
# ---- forward-sense branch (elided guard): extend upstream/downstream,
# trimming at neighboring genes when adjustToNeighbor is set.
559 if not useCDS and upstream > 0:
561 if gstop > (gstart + upstream / 2):
562 gstop = gstart + upstream / 2
565 elif not useCDS and downstream > 0:
# upstreamSpanTSS halves the upstream span so it straddles the TSS.
570 distance = upstream / 2
575 nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
576 if nextGene < distance * 2:
577 distance = nextGene / 2
585 distance = downstream
587 nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
588 if nextGene < downstream * 2:
589 distance = nextGene / 2
# partial-CDS trimming: keep first lengthCDS bp (positive) ...
598 gstop = newFeatureList[0][0] + lengthCDS
# ... or last |lengthCDS| bp (negative), when shorter than the gene.
601 if abs(lengthCDS) < glen:
602 gstart = newFeatureList[-1][1] + lengthCDS
# ---- reverse-sense branch: mirror image of the block above.
604 if not useCDS and upstream > 0:
606 if gstart < (gstop - upstream / 2):
607 gstart = gstop - upstream / 2
610 elif not useCDS and downstream > 0:
615 distance = upstream /2
620 nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
621 if nextGene < distance * 2:
622 distance = nextGene / 2
630 distance = downstream
632 nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
633 if nextGene < downstream * 2:
634 distance = nextGene / 2
643 gstart = newFeatureList[-1][-1] - lengthCDS
646 if abs(lengthCDS) < glen:
647 gstop = newFeatureList[0][0] - lengthCDS
649 glen = abs(gstop - gstart)
650 if chrom not in locusByChromDict:
651 locusByChromDict[chrom] = []
# tuple carries sense only when keepSense (elided guard).
654 locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
656 locusByChromDict[chrom].append((gstart, gstop, gid, glen))
658 for chrom in locusByChromDict:
659 locusByChromDict[chrom].sort()
661 return locusByChromDict
# NOTE(review): mutable default argument ([]) for regionList — safe only if
# callers never mutate it; left untouched in this doc-only pass.
664 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
665                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
667 """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
668 a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
669 or a different weight / tag.
# defaultRegionFormat selects which tuple indices hold start/stop/ID/length
# (field-index assignments are elided here).
675 if defaultRegionFormat:
688 print "entering computeRegionBins"
# Pre-allocate a zeroed bin vector per region, either from the explicit
# regionList or from every region in regionsByChromDict.
689 if len(regionList) > 0:
690 for readID in regionList:
691 regionsBins[readID] = [0.] * bins
693 for chrom in regionsByChromDict:
694 for regionTuple in regionsByChromDict[chrom]:
695 regionID = regionTuple[regionIDField]
696 regionsBins[regionID] = [0.] * bins
698 for chrom in hitDict:
699 if chrom not in regionsByChromDict:
# Record each region's length for the caller.
702 for regionTuple in regionsByChromDict[chrom]:
703 regionID = regionTuple[regionIDField]
704 regionsLen[regionID] = regionTuple[lengthField]
# Assign each read to the bin it falls into within its containing region.
708 for (tagStart, sense, weight) in hitDict[chrom]:
710 if index % 100000 == 0:
711 print "read %d " % index,
713 stopPoint = tagStart + readlen
717 for regionTuple in regionsByChromDict[chrom][startRegion:]:
718 start = regionTuple[startField]
719 stop = regionTuple[stopField]
720 regionID = regionTuple[regionIDField]
721 rlen = regionTuple[lengthField]
723 rsense = regionTuple[senseField]
# regions are sorted by start, so once start passes the read we can stop.
731 if start > stopPoint:
735 if start <= tagStart <= stop:
737 regionBinLength = rlen / bins
739 regionBinLength = binLength
# forward-sense bin index, measured from the region's start.
741 startdist = tagStart - start
743 # we are relying on python's integer division quirk
744 binID = startdist / regionBinLength
745 if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
747 elif fixedFirstBin > 0:
754 regionsBins[regionID][binID] += normalizedTag * weight
756 print "%s %s" % (regionID, str(binID))
# reverse-sense: bin index measured from the region's far end.
758 rdist = rlen - startdist
759 binID = rdist / regionBinLength
760 if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
762 elif fixedFirstBin > 0:
769 regionsBins[regionID][binID] += normalizedTag * weight
771 print "%s %s" % (regionID, str(binID))
775 return (regionsBins, regionsLen)
778 # TODO: The readDataset class is going to be replaced by Erange.ReadDataset but this will
779 # require going through all the code to make the changes needed. Major project for another
780 # day, but it really needs to be done
782 """ Class for storing reads from experiments. Assumes that custom scripts
783 will translate incoming data into a format that can be inserted into the
784 class using the insert* methods. Default class subtype ('DNA') includes
785 tables for unique and multireads, whereas 'RNA' subtype also includes a
789 def __init__(self, datafile, initialize=False, datasetType='', verbose=False,
790              cache=False, reportCount=True):
791 """ creates an rds datafile if initialize is set to true, otherwise
792 will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
797 self.rdsVersion = "1.1"
798 self.memBacked = False
801 self.cachedDBFile = ""
# Optionally work on a local temp copy of the sqlite file (cache flag,
# guard elided), then open the connection with dict-like row access.
807 self.cacheDB(datafile)
808 dbfile = self.cachedDBFile
812 self.dbcon = sqlite.connect(dbfile)
813 self.dbcon.row_factory = sqlite.Row
814 self.dbcon.execute("PRAGMA temp_store = MEMORY")
# New dataset: default to DNA and create the schema; existing dataset:
# read dataType / rdsVersion back from the metadata table.
816 if datasetType == "":
817 self.dataType = "DNA"
819 self.dataType = datasetType
821 self.initializeTables(self.dbcon)
823 metadata = self.getMetadata("dataType")
824 self.dataType = metadata["dataType"]
827 metadata = self.getMetadata("rdsVersion")
828 self.rdsVersion = metadata["rdsVersion"]
# Older files lack rdsVersion; try to stamp it, falling back to "pre-1.0"
# when the file is read-only (except branch elided).
831 self.insertMetadata([("rdsVersion", currentRDSversion)])
833 print "could not add rdsVersion - read-only ?"
834 self.rdsVersion = "pre-1.0"
# Verbose reporting of dataset contents.
838 print "INITIALIZED dataset %s" % datafile
840 print "dataset %s" % datafile
842 metadata = self.getMetadata()
844 pnameList = metadata.keys()
846 for pname in pnameList:
847 print "\t" + pname + "\t" + metadata[pname]
850 ucount = self.getUniqsCount()
851 mcount = self.getMultiCount()
852 if self.dataType == "DNA" and not initialize:
854 print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
856 print "\n%s unique reads and %s multireads" % (ucount, mcount)
857 elif self.dataType == 'RNA' and not initialize:
858 scount = self.getSplicesCount()
860 print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
862 print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
864 print "default cache size is %d pages" % self.getDefaultCacheSize()
872 """ return the number of usable reads in the dataset.
875 total = self.getUniqsCount()
880 total += self.getMultiCount()
884 if self.dataType == "RNA":
886 total += self.getSplicesCount()
899 """ cleanup copy in local cache, if present.
901 if self.cachedDBFile != "":
905 def cacheDB(self, filename):
906 """ copy geneinfoDB to a local cache.
# NOTE(review): tempfile.mktemp() is race-prone (deprecated in favor of
# mkstemp) — left as-is in this doc-only pass.
908 self.cachedDBFile = tempfile.mktemp() + ".db"
909 shutil.copyfile(filename, self.cachedDBFile)
912 def saveCacheDB(self, filename):
# Inverse of cacheDB: copy the cached working file back out to filename.
913 """ copy geneinfoDB to a local cache.
915 shutil.copyfile(self.cachedDBFile, filename)
919 """ delete geneinfoDB from local cache.
922 if self.cachedDBFile != "":
924 os.remove(self.cachedDBFile)
926 print "could not delete %s" % self.cachedDBFile
931 def attachDB(self, filename, asname):
932 """ attach another database file to the readDataset.
# NOTE(review): filename/asname are interpolated directly into SQL — callers
# must not pass untrusted values (ATTACH cannot be parameterized for names).
934 stmt = "attach '%s' as %s" % (filename, asname)
938 def detachDB(self, asname):
939 """ detach a database file to the readDataset.
# Counterpart of attachDB; asname is interpolated into the statement.
941 stmt = "detach %s" % (asname)
945 def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
946 """ import into current RDS the table (with columns destcolumns,
947 with default all columns) from the database file asname,
948 using the column specification of ascolumns (default all).
950 stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
# Optionally restrict the import to rows carrying a specific flag value.
952 stmt += " where flag = '%s' " % flagged
954 self.execute(stmt, forceCommit=True)
957 def getTables(self, asname=""):
958 """ get a list of table names in a particular database file.
# Use the in-memory connection when memBacked (guard elided), else the file.
963 sql = self.memcon.cursor()
965 sql = self.dbcon.cursor()
# asname (with trailing dot, prepared in elided lines) scopes the query to
# an attached database's sqlite_master.
970 stmt = "select name from %ssqlite_master where type='table'" % asname
972 results = sql.fetchall()
975 resultList.append(row["name"])
981 """ check whether the RDS file has at least one index.
983 stmt = "select count(*) from sqlite_master where type='index'"
984 count = int(self.execute(stmt, returnResults=True)[0][0])
991 def initializeTables(self, acon, cache=100000):
992 """ creates table schema in database connection acon, which is
993 typically a database file or an in-memory database.
995 acon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
# metadata is a simple name/value store; dataType is recorded immediately.
996 acon.execute("create table metadata (name varchar, value varchar)")
997 acon.execute("insert into metadata values('dataType','%s')" % self.dataType)
# uniqs/multi share one schema; RNA datasets also get a splices table with
# left/right coordinate pairs for the two exon sides.
998 acon.execute("create table uniqs (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
999 acon.execute("create table multi (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
1000 if self.dataType == "RNA":
1001 acon.execute("create table splices (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, startL int, stopL int, startR int, stopR int, sense varchar, weight real, flag varchar, mismatch varchar)")
1006 def getFileCursor(self):
1007 """ returns a cursor to file database for low-level (SQL)
1010 return self.dbcon.cursor()
1013 def getMemCursor(self):
1014 """ returns a cursor to memory database for low-level (SQL)
1017 return self.memcon.cursor()
1020 def getMetadata(self, valueName=""):
1021 """ returns a dictionary of metadata.
# Optional filter to a single metadata name.
1027 whereClause = " where name = '%s' " % valueName
1030 sql = self.memcon.cursor()
1032 sql = self.dbcon.cursor()
1034 sql.execute("select name, value from metadata" + whereClause)
1035 results = sql.fetchall()
1039 pvalue = row["value"]
1040 if pname not in resultsDict:
1041 resultsDict[pname] = pvalue
# Duplicate names are kept by suffixing "name:index" until a free key is
# found (loop structure elided).
1046 newName = pname + ":" + str(index)
1047 if newName not in resultsDict:
1048 resultsDict[newName] = pvalue
1056 def getReadSize(self):
1057 """ returns readsize if defined in metadata.
1059 metadata = self.getMetadata()
1060 if "readsize" not in metadata:
1061 print "no readsize parameter defined - returning 0"
# Stored values may look like "36 import ..."; keep only the leading number.
1064 mysize = metadata["readsize"]
1065 if "import" in mysize:
1066 mysize = mysize.split()[0]
1071 def getDefaultCacheSize(self):
1072 """ returns the default cache size.
1074 return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
1077 def getChromosomes(self, table="uniqs", fullChrom=True):
1078 """ returns a list of distinct chromosomes in table.
1080 statement = "select distinct chrom from %s" % table
1082 sql = self.memcon.cursor()
1084 sql = self.dbcon.cursor()
1086 sql.execute(statement)
# fullChrom=True keeps names as stored; otherwise the leading "chr" prefix
# is stripped (and empty remainders like bare "chr" are skipped).
1090 if row["chrom"] not in results:
1091 results.append(row["chrom"])
1093 if len(row["chrom"][3:].strip()) < 1:
1096 if row["chrom"][3:] not in results:
1097 results.append(row["chrom"][3:])
1104 def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
1105                      doMulti=False, doSplices=False):
1106 """ returns the maximum coordinate for reads on a given chromosome.
1110 sql = self.memcon.cursor()
1112 sql = self.dbcon.cursor()
# Take the max start over each requested table, keeping the overall max.
1116 sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
1117 maxCoord = int(sql.fetchall()[0][0])
1119 print "couldn't retrieve coordMax for chromosome %s" % chrom
# splices use startR (right-side start) for the maximum.
1122 sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
1124 spliceMax = int(sql.fetchall()[0][0])
1125 if spliceMax > maxCoord:
1126 maxCoord = spliceMax
1131 sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
1133 multiMax = int(sql.fetchall()[0][0])
1134 if multiMax > maxCoord:
1140 print "%s maxCoord: %d" % (chrom, maxCoord)
1145 def getReadsDict(self, verbose=False, bothEnds=False, noSense=False, fullChrom=False, chrom="",
1146                  flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
1147                  withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
1148                  readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
1149                  flagLike=False, strand="", entryDict=False, combine5p=False):
1150 """ returns a dictionary of reads in a variety of formats
1151 and which can be restricted by chromosome or custom-flag.
1152 Returns unique reads by default, but can return multireads
1153 with doMulti set to True.
# ---- build the WHERE clause from the various filter arguments.
1158 if chrom != "" and chrom != self.memChrom:
1159 whereClause.append("chrom = '%s'" % chrom)
# flagLike does a substring match; otherwise flag must match exactly.
1163 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
1164 whereClause.append(flagLikeClause)
1166 whereClause.append("flag = '%s'" % flag)
1169 whereClause.append("start > %d" % start)
1172 whereClause.append("stop < %d" % stop)
1174 if len(readLike) > 0:
1175 readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
1176 whereClause.append(readIDClause)
1179 whereClause.append("mismatch != ''")
1181 if strand in ["+", "-"]:
1182 whereClause.append("sense = '%s'" % strand)
1184 if len(whereClause) > 0:
1185 whereStatement = string.join(whereClause, " and ")
1186 whereQuery = "where %s" % whereStatement
# ---- build the SELECT clause; findallOptimize collapses reads to
# (start, sense, sum(weight)) groups, otherwise columns are added per flag.
1192 selectClause = ["select start, sense, sum(weight)"]
1193 groupBy = ["GROUP BY start, sense"]
1195 selectClause = ["select ID, chrom, start, readID"]
1197 selectClause.append("stop")
1200 selectClause.append("sense")
1203 selectClause.append("weight")
1206 selectClause.append("flag")
1209 selectClause.append("mismatch")
1211 if limit > 0 and not combine5p:
1212 groupBy.append("LIMIT %d" % limit)
1214 selectQuery = string.join(selectClause, ",")
1215 groupQuery = string.join(groupBy)
# uniqs and multi are combined with UNION ALL when both are requested.
1217 stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
1219 stmt.append("UNION ALL")
1220 stmt.append(selectQuery)
1221 stmt.append("from multi")
1222 stmt.append(whereQuery)
1223 stmt.append(groupQuery)
1225 stmt = [selectQuery, "from multi", whereQuery]
# combine5p path: merge 5' read positions across chromosomes via a nested
# grouped sub-select (count-based having clauses).
1229 selectQuery = "select start, sense, weight, chrom"
1232 subSelect = [selectQuery, "from uniqs", whereQuery]
1234 subSelect.append("union all")
1235 subSelect.append(selectQuery)
1236 subSelect.append("from multi")
1237 subSelect.append(whereQuery)
1239 subSelect = [selectQuery, "from multi", whereQuery]
1241 sqlStmt = string.join(subSelect)
1243 selectQuery = "select start, sense, sum(weight)"
1245 stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
1246         selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
# findallOptimize fast path: raw tuples, no Row factory, ordered by start.
1250 self.memcon.row_factory = None
1251 sql = self.memcon.cursor()
1253 self.dbcon.row_factory = None
1254 sql = self.dbcon.cursor()
1256 stmt.append("order by start")
1259 sql = self.memcon.cursor()
1261 sql = self.dbcon.cursor()
1263 stmt.append("order by readID, start")
1266 sql = self.memcon.cursor()
1268 sql = self.dbcon.cursor()
1270 stmt.append("order by chrom, start")
1272 sqlQuery = string.join(stmt)
1273 sql.execute(sqlQuery)
1276 resultsDict[chrom] = [[int(row[0]), row[1], float(row[2])] for row in sql]
# restore dict-like row access after the fast path.
1278 self.memcon.row_factory = sqlite.Row
1280 self.dbcon.row_factory = sqlite.Row
# ---- general path: walk rows, keying the results dict by chromosome or
# by readID (readIDDict), building dict or list entries per entryDict.
1286 readID = row["readID"]
1288 chrom = row["chrom"]
1290 chrom = row["chrom"][3:]
1292 if not readIDDict and chrom != currentChrom:
1293 resultsDict[chrom] = []
1294 currentChrom = chrom
# readIDs may carry "::multiplicity" and "/pairID" suffixes.
1299 (theReadID, multiplicity) = readID.split("::")
1301 if "/" in theReadID and withPairID:
1302 (theReadID, pairID) = readID.split("/")
1304 if theReadID != currentReadID:
1305 resultsDict[theReadID] = []
1306 currentReadID = theReadID
# entryDict=True: dict-shaped rows with named fields.
1310 newrow = {"start": int(row["start"])}
1312 newrow["stop"] = int(row["stop"])
1315 newrow["sense"] = row["sense"]
1318 newrow["weight"] = float(row["weight"])
1321 newrow["flag"] = row["flag"]
1324 newrow["mismatch"] = row["mismatch"]
1327 newrow["readID"] = readID
1330 newrow["chrom"] = chrom
1333 newrow["pairID"] = pairID
# entryDict=False: positional list rows in the same field order.
1335 newrow = [int(row["start"])]
1337 newrow.append(int(row["stop"]))
1340 newrow.append(row["sense"])
1343 newrow.append(float(row["weight"]))
1346 newrow.append(row["flag"])
1349 newrow.append(row["mismatch"])
1352 newrow.append(readID)
1355 newrow.append(chrom)
1358 newrow.append(pairID)
1360 resultsDict[dictKey].append(newrow)
1365 def getSplicesDict(self, verbose=False, noSense=False, fullChrom=False, chrom="",
1366                    flag="", withWeight=False, withFlag=False, withMismatch=False,
1367                    withID=False, withChrom=False, withPairID=False, readIDDict=False,
1368                    splitRead=False, hasMismatch=False, flagLike=False, start=-1,
1369                    stop=-1, strand="", entryDict=False):
1370 """ returns a dictionary of spliced reads in a variety of
1371 formats and which can be restricted by chromosome or custom-flag.
1372 Returns unique spliced reads for now.
# ---- build the WHERE clause (same filter scheme as getReadsDict, but
# coordinates use the splice-specific startL/stopR columns).
1377 if chrom != "" and chrom != self.memChrom:
1378 whereClause = ["chrom = '%s'" % chrom]
1382 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
1383 whereClause.append(flagLikeClause)
1385 whereClause.append("flag = '%s'" % flag)
1388 whereClause.append("mismatch != ''")
1391 whereClause.append("sense = '%s'" % strand)
1394 whereClause.append("startL > %d" % start)
1397 whereClause.append("stopR < %d" % stop)
1399 if len(whereClause) > 0:
1400 whereStatement = string.join(whereClause, " and ")
1401 whereQuery = "where %s" % whereStatement
# ---- SELECT clause: both exon sides always come back; optional columns
# are appended per flag.
1405 selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
1407 selectClause.append("sense")
1410 selectClause.append("weight")
1413 selectClause.append("flag")
1416 selectClause.append("mismatch")
1418 selectQuery = string.join(selectClause, " ,")
1420 sql = self.memcon.cursor()
1422 sql = self.dbcon.cursor()
# Pre-seed result keys with every matching chromosome (or the single one).
1424 if chrom == "" and not readIDDict:
1425 stmt = "select distinct chrom from splices %s" % whereQuery
1429 chrom = row["chrom"]
1431 chrom = row["chrom"][3:]
1433 resultsDict[chrom] = []
1434 elif chrom != "" and not readIDDict:
1435 resultsDict[chrom] = []
1437 stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
# ---- walk rows, keyed by chromosome or readID, same as getReadsDict.
1442 readID = row["readID"]
1444 chrom = row["chrom"]
1446 chrom = row["chrom"][3:]
1450 (theReadID, pairID) = readID.split("/")
1454 if theReadID != currentReadID:
1455 resultsDict[theReadID] = []
1456 currentReadID = theReadID
# entryDict=True: dict-shaped rows.
1462 newrow = {"startL": int(row["startL"])}
1463 newrow["stopL"] = int(row["stopL"])
1464 newrow["startR"] = int(row["startR"])
1465 newrow["stopR"] = int(row["stopR"])
1467 newrow["sense"] = row["sense"]
1470 newrow["weight"] = float(row["weight"])
1473 newrow["flag"] = row["flag"]
1476 newrow["mismatch"] = row["mismatch"]
1479 newrow["readID"] = readID
1482 newrow["chrom"] = chrom
1485 newrow["pairID"] = pairID
# splitRead: emit the left and right exon halves as two separate entries.
# NOTE(review): `del rightDict["start"]` looks inconsistent with the
# "startL" key used above — possible latent KeyError; confirm in full file.
1489 del leftDict["startR"]
1490 del leftDict["stopR"]
1492 del rightDict["start"]
1493 del rightDict["stopL"]
1494 resultsDict[dictKey].append(leftDict)
1495 resultsDict[dictKey].append(rightDict)
1497 resultsDict[dictKey].append(newrow)
# entryDict=False: positional list rows.
1499 newrow = [int(row["startL"])]
1500 newrow.append(int(row["stopL"]))
1501 newrow.append(int(row["startR"]))
1502 newrow.append(int(row["stopR"]))
1504 newrow.append(row["sense"])
1507 newrow.append(float(row["weight"]))
1510 newrow.append(row["flag"])
1513 newrow.append(row["mismatch"])
1516 newrow.append(readID)
1519 newrow.append(chrom)
1522 newrow.append(pairID)
# splitRead on list rows: left half = cols 0-1 + trailing cols; right
# half = cols 2 onward.
1525 resultsDict[dictKey].append(newrow[:2] + newrow[4:])
1526 resultsDict[dictKey].append(newrow[2:])
1528 resultsDict[dictKey].append(newrow)
def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
              splices=False, reportCombined=True, sense="both"):
    """ return read counts for a given region.

    Counts are drawn from the uniqs, multi and splices tables with an
    optional strand restriction; the result is either a combined total
    or a (ucount, mcount, scount) tuple.
    """
    # restrict is an extra SQL where-clause fragment limiting the count
    # to a single strand
    if sense in ["+", "-"]:
        restrict = " sense ='%s' " % sense

    # NOTE(review): each count is presumably gated by its uniqs/multi/splices
    # flag, with restrict defaulting to "" - confirm against full control flow
    ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
    mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
    scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
    # NOTE(review): `total` serves the reportCombined=True return path;
    # the tuple below is the itemized alternative - confirm branch placement
    total = ucount + mcount + scount
    return (ucount, mcount, scount)
def getTotalCounts(self, chrom="", rmin="", rmax=""):
    """Combined read count over a region.

    Convenience wrapper around getCounts(): every read class (uniqs,
    multi, splices) is enabled, both strands are counted, and a single
    combined total is requested.
    """
    everything = dict(uniqs=True, multi=True, splices=True,
                      reportCombined=True, sense="both")
    return self.getCounts(chrom, rmin, rmax, **everything)
def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
    """ returns the number of rows in the given table (uniqs/multi/splices).

    An optional region (chrom plus rmin..rmax on startField) and an extra
    raw where-clause fragment (restrict) narrow the count.  distinct=True
    counts positions once; otherwise read weights are summed.
    """
    # skip the chrom filter when the in-memory copy is already restricted
    # to this chromosome (see memSync / self.memChrom)
    if chrom !="" and chrom != self.memChrom:
        whereClause = ["chrom='%s'" % chrom]

    # NOTE(review): the three appends are presumably each guarded by a
    # non-empty check on rmin/rmax/restrict - confirm
    whereClause.append("%s >= %s" % (startField, str(rmin)))
    whereClause.append("%s <= %s" % (startField, str(rmax)))
    whereClause.append(restrict)
    if len(whereClause) > 0:
        whereStatement = string.join(whereClause, " and ")
        whereQuery = "where %s" % whereStatement

    # NOTE(review): cursor choice is presumably branched on self.memBacked
    sql = self.memcon.cursor()
    sql = self.dbcon.cursor()

    # distinct position count vs total weight - presumably an if/else on distinct
    sql.execute("select count(distinct chrom+start+sense) from %s %s" % (table, whereQuery))
    sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
    result = sql.fetchone()

    count = int(result[0])
    # NOTE(review): count is presumably returned after None-handling - confirm
def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """Count rows in the splices table, optionally limited to a region.

    Spliced reads are positioned by their left half, so the generic
    table counter is pointed at the startL column.
    """
    countArgs = ("splices", chrom, rmin, rmax, restrict, distinct)
    return self.getTableEntryCount(*countArgs, startField="startL")
def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """ returns the number of distinct readIDs in the uniqs table.

    Delegates to the generic table counter; uniqs use the default
    'start' position column, so no startField override is needed.
    """
    return self.getTableEntryCount("uniqs", chrom=chrom, rmin=rmin, rmax=rmax,
                                   restrict=restrict, distinct=distinct)
def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """ returns the total weight of readIDs in the multi table.

    Same regional / distinct options as the other per-table counters.
    """
    countArgs = (chrom, rmin, rmax, restrict, distinct)
    return self.getTableEntryCount("multi", *countArgs)
def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
    """ Return the readIDs stored in the dataset.

    Any combination of the uniqs, multi and splices tables can be queried;
    the per-table selects are unioned and grouped by readID so each ID
    appears once.  With paired=True the '/1'-style pair suffix is stripped
    from each ID.  A positive limit caps the number of IDs returned.
    """
    stmt = []
    limitPart = ""
    if limit > 0:
        limitPart = "LIMIT %d" % limit

    if uniqs:
        stmt.append("select readID from uniqs")

    if multi:
        stmt.append("select readID from multi")

    if splices:
        stmt.append("select readID from splices")

    selectPart = " union ".join(stmt)
    # bug fix: the format arguments were never applied to the template -
    # '"..." (a, b)' attempts to call the string instead of %-interpolating
    sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
    if self.memBacked:
        sql = self.memcon.cursor()
    else:
        sql = self.dbcon.cursor()

    sql.execute(sqlQuery)
    result = sql.fetchall()

    if paired:
        # bug fix: rows are 1-tuples, so split the readID itself and keep the
        # whole stem (the old x.split(...)[0][0] would fail on the tuple and,
        # even on a string, kept only the first character)
        return [x[0].split("/")[0] for x in result]
    else:
        return [x[0] for x in result]
def getMismatches(self, mischrom = None, verbose=False, useSplices=True):
    """ returns the uniq and spliced mismatches in a dictionary.

    Maps chromosome -> list of [readStart, genomicPos, changedTo,
    changedFrom] entries decoded from the comma-separated mismatch
    annotations carried by uniq reads and (for RNA data) spliced reads.
    """
    # complement lookup used to flip mismatch calls reported on the - strand
    # NOTE(review): dict literal continues over further source lines (T/G/C/...)
    revcomp = {"A": "T",
    readlen = self.getReadSize()
    # a single chromosome when mischrom is given, otherwise all of them
    # NOTE(review): these two assignments are presumably on if/else branches
    hitChromList = [mischrom]
    hitChromList = self.getChromosomes()

    for achrom in hitChromList:
        # NOTE(review): progress line presumably gated on verbose
        print "getting mismatches from chromosome %s" % (achrom)
        snpDict[achrom] = []
        hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, findallOptimize=False, hasMismatch=True)
        if useSplices and self.dataType == "RNA":
            spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
            spliceIDList = spliceDict.keys()
            for k in spliceIDList:
                # a spliced read: left half [startpos..lefthalf], right half
                # from rightstart, plus strand and its mismatch string
                (startpos, lefthalf, rightstart, endspos, sense, mismatches) = spliceDict[k][0]
                spMismatchList = mismatches.split(",")
                for mismatch in spMismatchList:
                    # annotation like "A13G": ref base, 1-based read offset, read base
                    change_len = len(mismatch)
                    # + strand: take the annotation as-is
                    # NOTE(review): the +/- decodings below are presumably
                    # selected by an if sense == "+":/else - confirm
                    change_from = mismatch[0]
                    change_base = mismatch[change_len-1]
                    change_pos = int(mismatch[1:change_len-1])
                    # - strand: complement both bases and mirror the read offset
                    change_from = revcomp[mismatch[0]]
                    change_base = revcomp[mismatch[change_len-1]]
                    change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                    # project the read offset onto the genome across the junction
                    firsthalf = int(lefthalf)-int(startpos)+1
                    if int(change_pos) <= int(firsthalf):
                        # mismatch falls inside the left exonic half
                        change_at = startpos + change_pos - 1
                    # otherwise it falls in the right half, past the junction
                    secondhalf = change_pos - firsthalf
                    change_at = rightstart + secondhalf
                    snpDict[achrom].append([startpos, change_at, change_base, change_from])

        if achrom not in hitDict:
            # NOTE(review): presumably a continue statement follows here
        for (start, sense, mismatches) in hitDict[achrom]:
            mismatchList = mismatches.split(",")
            for mismatch in mismatchList:
                change_len = len(mismatch)
                # same +/- strand decoding as the splice branch above
                change_from = mismatch[0]
                change_base = mismatch[change_len-1]
                change_pos = int(mismatch[1:change_len-1])
                change_from = revcomp[mismatch[0]]
                change_base = revcomp[mismatch[change_len-1]]
                change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                # uniq reads are contiguous: genomic position is a direct offset
                change_at = start + change_pos - 1
                snpDict[achrom].append([start, change_at, change_base, change_from])

    # NOTE(review): snpDict is presumably initialized before the loop and
    # returned after it - confirm
def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
                    useSplices=False, normalizationFactor = 1.0, trackStrand=False,
                    keepStrand="both", shiftValue=0):
    """return a profile of the chromosome as an array of per-base read coverage....
    keepStrand = 'both', 'plusOnly', or 'minusOnly'.
    Will also shift position of unique and multireads (but not splices) if shift is a natural number
    """
    metadata = self.getMetadata()
    readlen = int(metadata["readsize"])
    dataType = metadata["dataType"]
    # each read contributes 1/normalizationFactor per covered base
    scale = 1. / normalizationFactor
    # shift reads downstream on + and upstream on - (fragment centering)
    # NOTE(review): the shift dict itself is initialized on a preceding line
    shift["+"] = int(shiftValue)
    shift["-"] = -1 * int(shiftValue)

    # size the coverage array: whole chromosome vs explicit cstart..cstop window
    # NOTE(review): these two assignments are presumably on if/else branches
    lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
    lastNT = cstop - cstart + readlen + shift["+"]

    # per-base float coverage accumulator
    chromModel = array("f", [0.] * lastNT)
    hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
    for (hstart, sense, weight) in hitDict[chromosome]:
        hstart = hstart - cstart + shift[sense]
        for currentpos in range(hstart,hstart+readlen):
            # signed coverage when strand-tracked: + strand adds, - subtracts
            if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
                chromModel[currentpos] += scale * weight
            elif sense == '-' and keepStrand != "plusOnly":
                chromModel[currentpos] -= scale * weight

    if useSplices and dataType == "RNA":
        # windowed vs whole-chromosome splice fetch
        # NOTE(review): presumably on if/else branches keyed on cstart/cstop
        spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
        spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)

        if chromosome in spliceDict:
            for (Lstart, Lstop, Rstart, Rstop, rsense, readName) in spliceDict[chromosome]:
                if (Rstop - cstart) < lastNT:
                    # left exonic half of the splice
                    for index in range(abs(Lstop - Lstart)):
                        currentpos = Lstart - cstart + index
                        # we only track unique splices
                        if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                            chromModel[currentpos] += scale
                        elif rsense == "-" and keepStrand != "plusOnly":
                            chromModel[currentpos] -= scale

                    # right exonic half of the splice
                    for index in range(abs(Rstop - Rstart)):
                        currentpos = Rstart - cstart + index
                        # we only track unique splices
                        if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                            chromModel[currentpos] += scale
                        elif rsense == "-" and keepStrand != "plusOnly":
                            chromModel[currentpos] -= scale

    # NOTE(review): chromModel is presumably returned after cleanup - confirm
def insertMetadata(self, valuesList):
    """ inserts a list of (pname, pvalue) pairs into the metadata table.
    """
    self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
    # NOTE(review): a self.dbcon.commit() presumably follows, as with the
    # other bulk-insert helpers - confirm
def updateMetadata(self, pname, newValue, originalValue=""):
    """ update a metadata field given the original value and the new value.

    When originalValue is supplied, the row is only updated if its current
    value still matches (optimistic check).
    """
    # NOTE(review): values are spliced into the SQL with %-formatting; fine
    # for trusted internal metadata, but not injection-safe for arbitrary input
    stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
    if originalValue != "":
        stmt += " and value='%s' " % str(originalValue)

    self.dbcon.execute(stmt)
    # NOTE(review): a commit presumably follows - confirm
def insertUniqs(self, valuesList):
    """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
    into the uniqs table.
    """
    # ID is the autoincrementing primary key, hence the leading NULL
    self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
def insertMulti(self, valuesList):
    """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
    into the multi table.
    """
    # ID is the autoincrementing primary key, hence the leading NULL
    self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
def insertSplices(self, valuesList):
    """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
    into the splices table.
    """
    # splices carry two coordinate pairs: the left and right exonic halves
    self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
    """ update reads on file database in a list region of regions for a chromosome to have a new flag.
    regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
    sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
    """
    # extra placeholder appended when a specific strand is requested
    # NOTE(review): presumably assigned only when sense != "both" - confirm
    restrict = " and sense = ? "
    # NOTE(review): the uniqs/multi updates are presumably each guarded by
    # their boolean parameters - confirm
    self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
    self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
    if self.dataType == "RNA" and splices:
        # a splice can overlap a region with either half; record which half
        # matched by appending ' L:'/' R:' plus the flag to the existing flag
        self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
        self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
def setFlags(self, flag, uniqs=True, multi=True, splices=True):
    """ set the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
    """
    # NOTE(review): the uniqs/multi updates are presumably each guarded by
    # their boolean parameters - confirm
    self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
    self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
    # splices only exist for RNA datasets
    if self.dataType == 'RNA' and splices:
        self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
def resetFlags(self, uniqs=True, multi=True, splices=True):
    """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
    """
    # NOTE(review): the uniqs/multi updates are presumably each guarded by
    # their boolean parameters - confirm (mirrors setFlags with an empty flag)
    self.dbcon.execute("UPDATE uniqs SET flag = ''")
    self.dbcon.execute("UPDATE multi SET flag = ''")
    # splices only exist for RNA datasets
    if self.dataType == "RNA" and splices:
        self.dbcon.execute("UPDATE splices SET flag = ''")
def reweighMultireads(self, readList):
    """Bulk-update multiread weights.

    readList holds (weight, chrom, start, readID) tuples matching the
    placeholder order of the UPDATE statement below.
    """
    updateSql = "UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? "
    self.dbcon.executemany(updateSql, readList)
def setSynchronousPragma(self, value="ON"):
    # toggle sqlite's SYNCHRONOUS pragma; OFF speeds up bulk loads/indexing
    # at the cost of crash durability (see buildIndex/dropIndex callers)
    # NOTE(review): the warning is presumably printed from an except branch
    # when the pragma cannot be applied - confirm
    self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
    print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
def setDBcache(self, cache, default=False):
    """ Set the sqlite page-cache size for this connection.

    cache is the number of pages to keep in memory.  With default=True the
    setting is also persisted as the database's default cache size.
    """
    self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
    # only touch the persistent default when explicitly requested; as written
    # before, the `default` parameter was ignored and both pragmas always ran
    if default:
        self.dbcon.execute('PRAGMA DEFAULT_CACHE_SIZE = %d' % cache)
def execute(self, statement, returnResults=False, forceCommit=False):
    """ Run an arbitrary SQL statement against the dataset.

    With returnResults=True the fetched rows are handed back; with
    forceCommit=True the change is committed immediately.
    """
    # NOTE(review): cursor selection is presumably branched on self.memBacked
    sql = self.memcon.cursor()
    sql = self.dbcon.cursor()

    sql.execute(statement)
    # NOTE(review): fetchall/commit are presumably gated by returnResults and
    # forceCommit respectively, against whichever connection is in use - confirm
    result = sql.fetchall()
    self.memcon.commit()
def buildIndex(self, cache=100000):
    """ Builds the file indeces for the main tables.
    Cache is the number of 1.5 kb pages to keep in memory.
    100000 pages translates into 150MB of RAM, which is our default.
    """
    # only enlarge the page cache if the request beats the current default
    if cache > self.getDefaultCacheSize():
        self.setDBcache(cache)

    # index creation is much faster with synchronous writes off; restored below
    self.setSynchronousPragma("OFF")
    self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
    print "built uPosIndex"
    self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
    print "built uChromIndex"
    self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
    print "built mPosIndex"
    self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
    print "built mChromIndex"

    # splices only exist for RNA datasets; indexed on both halves
    if self.dataType == "RNA":
        self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
        print "built sPosIndex"
        self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
        print "built sPosIndex2"
        self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
        print "built sChromIndex"

    # NOTE(review): a commit presumably precedes re-enabling sync - confirm
    self.setSynchronousPragma("ON")
def dropIndex(self):
    """ drops the file indices for the main tables.
    """
    # NOTE(review): the DROPs are presumably wrapped in try/except, with the
    # message below printed when an index cannot be dropped - confirm
    self.setSynchronousPragma("OFF")
    self.dbcon.execute("DROP INDEX uPosIndex")
    self.dbcon.execute("DROP INDEX uChromIndex")
    self.dbcon.execute("DROP INDEX mPosIndex")
    self.dbcon.execute("DROP INDEX mChromIndex")

    # splice indexes only exist for RNA datasets
    if self.dataType == "RNA":
        self.dbcon.execute("DROP INDEX sPosIndex")
        self.dbcon.execute("DROP INDEX sPosIndex2")
        self.dbcon.execute("DROP INDEX sChromIndex")

    print "problem dropping index"
    self.setSynchronousPragma("ON")
def memSync(self, chrom="", index=False):
    """ makes a copy of the dataset into memory for faster access.
    Can be restricted to a "full" chromosome. Can also build the
    in-memory position indexes when index=True.
    """
    self.memcon = sqlite.connect(":memory:")
    self.initializeTables(self.memcon)
    cursor = self.dbcon.cursor()
    # when restricted to one chromosome, remember it so later queries can
    # skip redundant chrom filters (see getTableEntryCount / self.memChrom)
    # NOTE(review): these three lines are presumably conditional on chrom != ""
    print "memSync %s" % chrom
    whereclause = " where chrom = '%s' " % chrom
    self.memChrom = chrom

    self.memcon.execute("PRAGMA temp_store = MEMORY")
    self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
    # copy metadata to memory
    self.memcon.execute("delete from metadata")
    results = cursor.execute("select name, value from metadata")
    # NOTE(review): each append below is the body of a 'for row in results:'
    # loop with results2 re-initialized before each table copy - confirm
    results2.append((row["name"], row["value"]))
    self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
    # copy uniqs to memory
    results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from uniqs" + whereclause)
    results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
    self.memcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
    # copy multi to memory
    results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from multi" + whereclause)
    results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
    self.memcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
    # copy splices to memory
    if self.dataType == "RNA":
        results = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices" + whereclause)
        results2.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
        # NOTE(review): BUG - the column list orders "weight, sense" but the
        # value tuples above are built (..., sense, weight, ...), so splices
        # copied into memory land with sense and weight swapped; the column
        # list should read "sense, weight" as in insertSplices.
        self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, weight, sense, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", results2)

    # index creation: plain start indexes suffice for a single-chromosome
    # copy; the chrom+start variants serve the full-dataset case
    # NOTE(review): the two groups are presumably on alternate branches
    # (gated on index and/or chrom) - confirm
    self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
    self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
    if self.dataType == "RNA":
        self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
        self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")

    self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
    self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
    if self.dataType == "RNA":
        self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
        self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")

    # from here on reads are served from the in-memory copy
    self.memBacked = True
    self.memcon.row_factory = sqlite.Row
    self.memcon.commit()