# erange / ReadDataset.py
1 import string
2 import tempfile
3 import shutil
4 import os
5 import re
6 import sys
7 import pysam
8 from array import array
9 from commoncode import getReverseComplement
10
11 currentRDSVersion = "3.0"
12
13
14 class ReadDatasetError(Exception):
15     pass
16
17
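# ReadCounter is used as a pysam fetch() callback: every aligned read in the
# fetched region is passed to __call__(), which tallies it as a unique read,
# a multiread (added with weight 1/multiplicity), or a spliced read based on
# its CIGAR operations.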
18 class ReadCounter():
19     uniqReadCount = 0
20     multiReadCount = 0.0
21     spliceReadCount = 0
22
    def __init__(self, multiReadIDCount=None, sense="both"):
        # default to None rather than a shared mutable dict
        self.multiReadIDCount = multiReadIDCount if multiReadIDCount is not None else {}
        self.sense = sense
26
27
28     def __call__(self, read):
29         multiplicity = self.getReadMultiplicity(read)
30         if multiplicity > 1:
31             self.multiReadCount += float(1.0/multiplicity)
32         elif isSpliceEntry(read.cigar):
33             self.spliceReadCount += 1
34         else:
35             self.uniqReadCount += 1
36
37
38     def getReadMultiplicity(self, read):
39         pairReadSuffix = getPairedReadNumberSuffix(read)
40         readID = "%s%s" % (read.qname, pairReadSuffix)
41         try:
42             multiplicity = self.multiReadIDCount[readID]
43         except KeyError:
44             multiplicity = 1
45
46         return multiplicity
47
48
49     def isMulti(self, read):
50         pairReadSuffix = getPairedReadNumberSuffix(read)
51         readID = "%s%s" % (read.qname, pairReadSuffix)
52         return readID in self.multiReadIDCount
53
54
55
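# MaxCoordFinder is another fetch() callback; it records the largest start
# coordinate seen for unique, multi and spliced reads on the fetched reference.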
56 class MaxCoordFinder(ReadCounter):
57     maxUniqueStart = 0
58     maxMultiStart = 0
59     maxSpliceStart = 0
60
    def __call__(self, read):
        if self.isMulti(read):
            self.maxMultiStart = max(self.maxMultiStart, read.pos)
        elif isSpliceEntry(read.cigar):
            self.maxSpliceStart = max(self.maxSpliceStart, getSpliceRightStart(read.pos, read.cigar))
        else:
            self.maxUniqueStart = max(self.maxUniqueStart, read.pos)
68
69
70
71 class BamReadDataset():
72     """ Class for storing reads from experiments. Assumes that custom scripts
73     will translate incoming data into a format that can be inserted into the
74     class using the insert* methods. Default class subtype ('DNA') includes
75     tables for unique and multireads, whereas 'RNA' subtype also includes a
76     splices table.
77     """
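    # A minimal usage sketch (hypothetical file name "reads.bam"; assumes an
    # indexed BAM whose reference names carry a "chr" prefix):
    #
    #   rds = BamReadDataset("reads.bam", initialize=True, datasetType="RNA", verbose=True)
    #   total = rds.getCounts(chrom="chr1", rmin=0, rmax=100000, multi=True, splices=True)
    #   reads = rds.getReadsDict(chrom="chr1", start=0, stop=100000, withWeight=True)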
78
79
80     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
81                  cache=False, reportCount=True):
82         """ creates an rds datafile if initialize is set to true, otherwise
83         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
84         """
85
86         if initialize and datasetType not in ["DNA", "RNA"]:
87             raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
88
89         self.dataType = ""
90         self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
91         self.bamfile = pysam.Samfile(datafile, "rb")
92         self.readsize = None
        self.cachedTotalWeight = None  # cached by totalReadWeight(); named so it does not shadow the method
94
95         self.metadata = {"dataType": datasetType}
96
97         if initialize:
98             self.dataType = datasetType
99         else:
100             self.dataType = self.metadata["dataType"]
101
102         try:
103             self.rdsVersion = self.metadata["rdsVersion"]
104         except KeyError:
105             self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
106
107         self.fullReadCounts = self.getFullReadCount()
108         if verbose:
109             self.printRDSInfo(datafile, reportCount, initialize)
110
111
112     def __len__(self):
        """ return the number of reads in the bam file.
            Note: unlike the original RDS implementation, which returned the total
            read weight, this returns the raw read count.
        """
116         count = 0
117         references = self.bamfile.references
118         for reference in references:
119             count += self.bamfile.count(reference)
120
121         return count
122
123
124     def getMultiReadIDCounts(self, samFileName, fileMode):
125         try:
126             samfile = pysam.Samfile(samFileName, fileMode)
127         except ValueError:
128             print "samfile index not found"
129             sys.exit(1)
130
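        # Build a dict keyed by read name (plus a "/1" or "/2" suffix for paired
        # ends) -> number of alignments, e.g. {"HWI-EAS229:1:2:3:4/1": 3.0}
        # (illustrative name); reads that align exactly once are removed below.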
131         readIDCounts = {}
132         for read in samfile.fetch(until_eof=True):
133             pairReadSuffix = getPairedReadNumberSuffix(read)
134             readName = "%s%s" % (read.qname, pairReadSuffix)
135             try:
136                 readIDCounts[readName] += 1.0
137             except KeyError:
138                 readIDCounts[readName] = 1.0
139
140         for readID in readIDCounts.keys():
141             if int(readIDCounts[readID]) == 1:
142                 del readIDCounts[readID]
143
144         return readIDCounts
145
146
    def totalReadWeight(self):
        """ return the total weight of usable reads in the bam file.
        """
        if self.cachedTotalWeight is None:
            total = self.fullReadCounts["uniq"]
            total += self.fullReadCounts["multi"]

            if self.dataType == "RNA":
                total += self.fullReadCounts["splice"]

            self.cachedTotalWeight = int(total)

        return self.cachedTotalWeight
160
161
162     def __del__(self):
163         """ cleanup copy in local cache, if present.
164         """
165         pass
166
167
168     def printRDSInfo(self, datafile, reportCount, initialize):
169         if initialize:
170             print "INITIALIZED dataset %s" % datafile
171         else:
172             print "dataset %s" % datafile
173
174         print "metadata:"
175         pnameList = self.metadata.keys()
176         pnameList.sort()
177         for pname in pnameList:
178             print "\t" + pname + "\t" + str(self.metadata[pname])
179
180         if reportCount and not initialize:
181             self.printReadCounts()
182
183         print "default cache size is %d pages" % self.getDefaultCacheSize()
184
185
186     def printReadCounts(self):
187         ucount = self.fullReadCounts["uniq"]
188         mcount = self.fullReadCounts["multi"]
189         if self.dataType == "DNA":
190             print "\n%d unique reads and %d multireads" % (ucount, mcount)
191         elif self.dataType == "RNA":
192             scount = self.fullReadCounts["splice"]
193             print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
194
195
196     def cacheDB(self, filename):
197         pass
198
199
200     def saveCacheDB(self, filename):
201         pass
202
203
204     def uncacheDB(self):
205         pass
206
207
208     def attachDB(self, filename, dbName):
209         pass
210
211
212     def detachDB(self, dbName):
213         pass
214
215
216     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
217         pass
218
219
220     def getTables(self, dbName=""):
221         pass
222
223
224     def getSqlCursor(self):
225         pass
226
227
228     def getMemCursor(self):
229         pass
230
231
232     def getFileCursor(self):
233         pass
234
235
236     def initializeTables(self, dbConnection, cache=100000):
237         pass
238
239
240     def getMetadata(self, valueName=""):        
241         return self.metadata
242
243
244     def getReadSize(self):
        """ This follows the same model as the original, which assumes that all
            reads have the same read size and that this is reflected by the size
            of the first read.
        """
249         if self.readsize is None:
250             bamFileIterator = self.bamfile.fetch(until_eof=True)
251             for read in bamFileIterator:
252                 self.calculateReadsizeFromCigar(read.cigar)
253                 break
254
255         return self.readsize
256
257
258     def calculateReadsizeFromCigar(self, cigar):
259         take = (0, 1) # CIGAR operation (M/match, I/insertion)
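        # e.g. (illustrative): a spliced 50M200N25M alignment has cigar
        # [(0, 50), (3, 200), (0, 25)] and yields readsize 75, since only the
        # M (0) and I (1) operations count toward the read length.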
260         self.readsize = sum([length for op,length in cigar if op in take])
261
262
263     def getDefaultCacheSize(self):
264         """ returns 0 as cache is going to be deprecated
265         """
266         return 0
267
268
269     def getChromosomes(self, fullChrom=True):
270         """ returns a sorted list of distinct chromosomes in bam file reference list.
271         """
272         if fullChrom:
273             results = list(self.getFullReferenceNames())
274         else:
275             results = list(self.getShortReferenceNames())
276
277         results.sort()
278
279         return results
280
281
282     def getFullReferenceNames(self):
283         """ returns a set of bam file reference names.
284         """
285         return set(self.bamfile.references)
286
287
288     def getShortReferenceNames(self):
289         """ returns a set of bam file reference names after removing first 3 characters.
290         """
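        # e.g. "chr1" -> "1"; names that strip to an empty string (such as "MT")
        # are skipped.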
291         results = set()
292         references = self.bamfile.references
293         for chromosome in references:
294             shortName = chromosome[3:]
295             if len(shortName.strip()) > 0:
296                 results.add(shortName)
297
298         return results
299
300
301     def getMaxCoordinate(self, chrom, doUniqs=True,
302                          doMulti=False, doSplices=False):
303         """ returns the maximum coordinate for reads on a given chromosome.
304         """
305         maxCoord = 0
306         coordFinder = MaxCoordFinder()
307         self.bamfile.fetch(reference=chrom, callback=coordFinder)
308
309         if doUniqs:
310             maxCoord = coordFinder.maxUniqueStart
311
312         if doSplices:
313             maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
314
315         if doMulti:
316             maxCoord = max(coordFinder.maxMultiStart, maxCoord)
317
318         return maxCoord
319
320
321     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
322                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
323                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
324                      readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
325                      flagLike=False, strand='', combine5p=False):
326         """ First pass of rewrite
327             1) Leave as much in place as possible
328             2) For now treat multis just like uniqs
329         """
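        # Returned structure (illustrative): a dict keyed by chromosome name (or
        # by read name when readIDDict=True), each value a list of row dicts, e.g.
        #   {"chr1": [{"start": 10353, "sense": "+", "weight": 1.0}, ...]}
        # with optional "stop", "mismatch", "readID", "chrom" and "pairID" keys
        # depending on the with* flags.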
330         """
331         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
332         if findallOptimize:
333             selectQuery = "select start, sense, sum(weight)"
334         else:
335             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
336
337         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
338         if doUniqs:
339             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
340             if doMulti:
341                 stmt.append("UNION ALL")
342                 stmt.append(selectQuery)
343                 stmt.append("from multi")
344                 stmt.append(whereQuery)
345                 stmt.append(groupQuery)
346         else:
347             stmt = [selectQuery, "from multi", whereQuery]
348
349         if combine5p:
350             if findallOptimize:
351                 selectQuery = "select start, sense, weight, chrom"
352
353             if doUniqs:
354                 subSelect = [selectQuery, "from uniqs", whereQuery]
355                 if doMulti:
356                     subSelect.append("union all")
357                     subSelect.append(selectQuery)
358                     subSelect.append("from multi")
359                     subSelect.append(whereQuery)
360             else:
361                 subSelect = [selectQuery, "from multi", whereQuery]
362
363             sqlStmt = string.join(subSelect)
364             if findallOptimize:
365                 selectQuery = "select start, sense, sum(weight)"
366
367             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
368                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
369
370         if findallOptimize:
371             if self.memBacked:
372                 self.memcon.row_factory = None
373             else:
374                 self.dbcon.row_factory = None
375
376             stmt.append("order by start")
377         elif readIDDict:
378             stmt.append("order by readID, start")
379         else:
380             stmt.append("order by chrom, start")
381
382         sql = self.getSqlCursor()
383         sqlQuery = string.join(stmt)
384         sql.execute(sqlQuery)
385         """
386
387         # Edits are starting here - trying to ignore the entire sql construct
388         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
389         resultsDict = {}
390         if findallOptimize and chrom != "":
391             readIDDict = False
392             bothEnds = False
393             noSense = False
394             withWeight = True
395             withFlag = False
396             withMismatch = False
397             withID = False
398             withChrom = False
399             withPairID = False
400             fullChrom = True
401             #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
402
403         pairID = 0
404         for read in bamFileIterator:
405             pairReadSuffix = getPairedReadNumberSuffix(read)
406             readID = "%s%s" % (read.qname, pairReadSuffix)
407             if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
408                 continue
409
410             sense = getReadSense(read.is_reverse)
411             if fullChrom:
412                 chrom = self.bamfile.getrname(read.rname)
413             else:
414                 chrom = self.bamfile.getrname(read.rname)[3:]
415
416             if readIDDict:
417                 dictKey = read.qname
418             else:
419                 dictKey = chrom
420
421             newrow = {"start": int(read.pos)}
422             if bothEnds:
423                 if self.readsize is None:
424                     self.calculateReadsizeFromCigar(read.cigar)
425
426                 newrow["stop"] = int(read.pos + self.readsize)
427
428             if not noSense:
429                 newrow["sense"] = sense
430
431             if withWeight:
432                 try:
433                     newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
434                 except KeyError:
435                     newrow["weight"] = 1.0
436
437             if withFlag:
438                 #newrow["flag"] = row["flag"]
439                 pass
440
441             if withMismatch:
442                 try:
443                     mismatchTag = read.opt("MD")
444                     mismatches = getMismatches(mismatchTag, read.seq, sense)
445                 except KeyError:
446                     mismatches = ""
447
448                 newrow["mismatch"] = mismatches
449
450             if withID:
451                 newrow["readID"] = readID
452
453             if withChrom:
454                 newrow["chrom"] = chrom
455
456             if withPairID:
457                 newrow["pairID"] = pairID
458
459             try:
460                 resultsDict[dictKey].append(newrow)
461             except KeyError:
462                 resultsDict[dictKey] = [newrow]
463
464         return resultsDict
465
466
467     def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
468         readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
469
470         if flag != "":
471             #TODO: need to pick a tag code for the erange flag - using ZF for now
472             # although it looks like the flag is really just a label on a region.
473             # since BAM can retrieve based on location if we store the regions then
474             # we can just retrieve it directly from the BAM and we do not need to
475             # flag the individual reads.  The location will be sufficient.
476             try:
477                 if flag != read.opt("ZF"):
478                     readDoesNotMeetCriteria = True
479             except KeyError:
480                 readDoesNotMeetCriteria = True
481
482
483         if hasMismatch:
484             try:
485                 mismatchTag = read.opt("MD")
486             except KeyError:
487                 readDoesNotMeetCriteria = True
488
489         if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
490             readDoesNotMeetCriteria = True
491
492         return readDoesNotMeetCriteria
493
494
495     def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
496         rejectRead = False
        if readID in self.multiReadIDCounts:
498             if self.multiReadIDCounts[readID] > threshold or not doMulti:
499                 rejectRead = True
500
501         return rejectRead
502
503
504     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
505                        flag="", withWeight=False, withFlag=False, withMismatch=False,
506                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
507                        splitRead=False, hasMismatch=False, flagLike=False, start=None,
508                        stop=None, strand=""):
509         """ First pass of rewrite
510             1) Leave as much in place as possible
511             2) For now treat multis just like regular splices
512         """
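        # Returned structure (illustrative):
        #   {"chr1": [{"startL": 1000, "stopL": 1050, "startR": 1250, "stopR": 1275,
        #              "sense": "+"}, ...]}
        # With splitRead=True each splice instead contributes two entries, one for
        # the left block and one for the right block.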
513         """
514         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
515         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
516         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
517         sql = self.getSqlCursor()
518
519         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
520         sql.execute(stmt)
521         """
522
523         # Edits are starting here - trying to ignore the entire sql construct
524         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
525         resultsDict = {}
526         pairID = 0
527         for read in bamFileIterator:
528             if not isSpliceEntry(read.cigar):
529                 continue
530
531             sense = getReadSense(read.is_reverse)
532             pairReadSuffix = getPairedReadNumberSuffix(read)
533             readID = "%s%s" % (read.qname, pairReadSuffix)
534             if fullChrom:
535                 chrom = self.bamfile.getrname(read.rname)
536             else:
537                 chrom = self.bamfile.getrname(read.rname)[3:]
538
539             if readIDDict:
540                 dictKey = read.qname
541             else:
542                 dictKey = chrom
543
544             if self.readsize is None:
545                 self.calculateReadsizeFromCigar(read.cigar)
546
547             startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
548             newrow = {"startL": int(startL)}
549             newrow["stopL"] = int(stopL)
550             newrow["startR"] = int(startR)
551             newrow["stopR"] = int(stopR)
552             if not noSense:
553                 newrow["sense"] = sense
554
555             if withWeight:
556                 newrow["weight"] = 1.0
557
558             if withFlag:
559                 #newrow["flag"] = row["flag"]
560                 pass
561
562             if withMismatch:
563                 try:
564                     mismatchTag = read.opt("MD")
565                     mismatches = getMismatches(mismatchTag, read.seq, sense)
566                 except KeyError:
567                     mismatches = ""
568
569                 newrow["mismatch"] = mismatches
570
571             if withID:
572                 newrow["readID"] = readID
573
574             if withChrom:
575                 newrow["chrom"] = chrom
576
577             if withPairID:
578                 newrow["pairID"] = pairID
579
580             if splitRead:
581                 leftDict = newrow.copy()
582                 del leftDict["startR"]
583                 del leftDict["stopR"]
584                 rightDict = newrow
585                 del rightDict["startL"]
586                 del rightDict["stopL"]
587                 try:
588                     resultsDict[dictKey].append(leftDict)
589                 except KeyError:
590                     resultsDict[dictKey] = [leftDict]
591
592                 resultsDict[dictKey].append(rightDict)
593             else:
594                 try:
595                     resultsDict[dictKey].append(newrow)
596                 except KeyError:
597                     resultsDict[dictKey] = [newrow]
598
599         return resultsDict
600
601
602     def getFullReadCount(self):
603         fullReadCount = {"uniq": 0,
604                          "multi": 0,
605                          "splice": 0
606         }
607
608         for chromosome in self.getChromosomes():
609             uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
610             fullReadCount["uniq"] += uniqCount
611             fullReadCount["multi"] += multiCount
612             fullReadCount["splice"] += spliceCount
613
614         return fullReadCount
615
616
617     def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
618                   splices=False, reportCombined=True, sense="both"):
619         """ return read counts for a given region.
620         """
621         ucount = 0
622         mcount = 0
623         scount = 0
624         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
625
626         if uniqs:
627             ucount = readCounter.uniqReadCount
628
629         if multi:
630             mcount = readCounter.multiReadCount
631
632         if splices:
633             scount = readCounter.spliceReadCount
634
635         if reportCombined:
636             total = ucount + mcount + scount
637             return total
638         else:
639             return (ucount, mcount, scount)
640
641
642     def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
643         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
644
645         return readCounter.spliceReadCount
646
647
648     def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
649         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
650
651         return readCounter.uniqReadCount
652
653
654     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
655         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
656
657         return readCounter.multiReadCount
658
659
660     def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
661         readCounter = ReadCounter(self.multiReadIDCounts, sense)
662         self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
663
664         return readCounter
665
666
667     #TODO: Primary
668     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
669         """ get readID's.
670         """
671         stmt = []
672         if uniqs:
673             stmt.append("select readID from uniqs")
674
675         if multi:
676             stmt.append("select readID from multi")
677
678         if splices:
679             stmt.append("select readID from splices")
680
681         if len(stmt) > 0:
682             selectPart = string.join(stmt, " union ")
683         else:
684             selectPart = ""
685
686         limitPart = ""
687         if limit > 0:
688             limitPart = "LIMIT %d" % limit
689
690         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
691         sql = self.getSqlCursor()
692         sql.execute(sqlQuery)
693         result = sql.fetchall()
694
695         if paired:
696             return [x[0].split("/")[0] for x in result]
697         else:
698             return [x[0] for x in result]
699
700
701     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
702         """ returns the uniq and spliced mismatches in a dictionary.
703         """
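        # Each snpDict entry is a per-chromosome list of records of the form
        # [readStart, changePosition, readBase, referenceBase] (approximate labels),
        # derived from the eland-style mismatch strings built by the module-level
        # getMismatches() helper.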
704         readlen = self.getReadSize()
705         if mischrom:
706             hitChromList = [mischrom]
707         else:
708             hitChromList = self.getChromosomes()
709             hitChromList.sort()
710
711         snpDict = {}
712         for chromosome in hitChromList:
713             if verbose:
714                 print "getting mismatches from chromosome %s" % (chromosome)
715
716             snpDict[chromosome] = []
717             if useSplices and self.dataType == "RNA":
718                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
719                 spliceIDList = spliceDict.keys()
720                 for spliceID in spliceIDList:
721                     spliceEntry = spliceDict[spliceID][0]
722                     startpos = spliceEntry["startL"]
723                     lefthalf = spliceEntry["stopL"]
724                     rightstart = spliceEntry["startR"]
725                     sense = spliceEntry["sense"]
726                     mismatches = spliceEntry["mismatch"]
727                     spMismatchList = mismatches.split(",")
728                     for mismatch in spMismatchList:
729                         if "N" in mismatch:
730                             continue
731
732                         change_len = len(mismatch)
733                         if sense == "+":
734                             change_from = mismatch[0]
735                             change_base = mismatch[change_len-1]
736                             change_pos = int(mismatch[1:change_len-1])
737                         elif sense == "-":
738                             change_from = getReverseComplement(mismatch[0])
739                             change_base = getReverseComplement(mismatch[change_len-1])
740                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
741
742                         firsthalf = int(lefthalf)-int(startpos)+1
743                         secondhalf = 0
744                         if int(change_pos) <= int(firsthalf):
745                             change_at = startpos + change_pos - 1
746                         else:
747                             secondhalf = change_pos - firsthalf
748                             change_at = rightstart + secondhalf
749
750                         snpDict[chromosome].append([startpos, change_at, change_base, change_from])
751
752             hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
753             if chromosome not in hitDict.keys():
754                 continue
755
756             for readEntry in hitDict[chromosome]:
757                 start = readEntry["start"]
758                 sense = readEntry["sense"]
759                 mismatches = readEntry["mismatch"]
760                 mismatchList = mismatches.split(",")
761                 for mismatch in mismatchList:
762                     if "N" in mismatch:
763                         continue
764
765                     change_len = len(mismatch)
766                     if sense == "+":
767                         change_from = mismatch[0]
768                         change_base = mismatch[change_len-1]
769                         change_pos = int(mismatch[1:change_len-1])
770                     elif sense == "-":
771                         change_from = getReverseComplement(mismatch[0])
772                         change_base = getReverseComplement(mismatch[change_len-1])
773                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
774
775                     change_at = start + change_pos - 1
776                     snpDict[chromosome].append([start, change_at, change_base, change_from])
777
778         return snpDict
779
780
781     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
782                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
783                         keepStrand="both", shiftValue=0):
784         """return a profile of the chromosome as an array of per-base read coverage....
785             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
786             Will also shift position of unique and multireads (but not splices) if shift is a natural number
787         """
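        # Example (hypothetical call): getChromProfile("chr1", cstart=0, cstop=5000)
        # returns an array("f") of per-base coverage scaled by 1/normalizationFactor;
        # when trackStrand is True, minus-strand coverage is subtracted instead of added.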
788         metadata = self.getMetadata()
        try:
            readlen = int(metadata["readsize"])
        except KeyError:
            # the BAM-backed metadata has no "readsize" entry, so fall back to
            # measuring the first read rather than using 0 (which would give zero coverage)
            readlen = self.getReadSize()
793
794         dataType = metadata["dataType"]
795         scale = 1. / normalizationFactor
796         shift = {}
797         shift["+"] = int(shiftValue)
798         shift["-"] = -1 * int(shiftValue)
799
        if cstop > 0:
            lastNT = cstop - cstart + readlen + shift["+"]
        else:
            lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
804
805         chromModel = array("f",[0.] * lastNT)
806         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
807         if cstart < 0:
808             cstart = 0
809
        for readEntry in hitDict.get(chromosome, []):
            hstart = readEntry["start"]
            sense = readEntry["sense"]
            weight = readEntry["weight"]
            hstart = hstart - cstart + shift[sense]
            for currentpos in range(hstart, hstart + readlen):
                try:
                    if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
                        chromModel[currentpos] += scale * weight
                    elif sense == "-" and keepStrand != "plusOnly":
                        chromModel[currentpos] -= scale * weight
                except IndexError:
                    continue
823
824         del hitDict
825         if useSplices and dataType == "RNA":
826             if cstop > 0:
827                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
828             else:
829                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
830
831             if chromosome in spliceDict:
832                 for spliceEntry in spliceDict[chromosome]:
833                     Lstart = spliceEntry["startL"]
834                     Lstop = spliceEntry["stopL"]
835                     Rstart = spliceEntry["startR"]
836                     Rstop = spliceEntry["stopR"]
837                     rsense = spliceEntry["sense"]
838                     if (Rstop - cstart) < lastNT:
839                         for index in range(abs(Lstop - Lstart)):
840                             currentpos = Lstart - cstart + index
841                             # we only track unique splices
842                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
843                                 chromModel[currentpos] += scale
844                             elif rsense == "-" and keepStrand != "plusOnly":
845                                 chromModel[currentpos] -= scale
846
847                         for index in range(abs(Rstop - Rstart)):
848                             currentpos = Rstart - cstart + index
849                             # we only track unique splices
850                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
851                                 chromModel[currentpos] += scale
852                             elif rsense == "-" and keepStrand != "plusOnly":
853                                 chromModel[currentpos] -= scale
854
855             del spliceDict
856
857         return chromModel
858
859
860     def insertMetadata(self, valuesList):
861         for (pname, pvalue) in valuesList:
862             self.metadata[pname] = pvalue
863
864
865     def updateMetadata(self, pname, newValue, originalValue=""):
866         if originalValue != "":
867             if self.metadata[pname] == originalValue:
868                 self.metadata[pname] = newValue
869         else:
870             self.metadata[pname] = newValue
871
872
873     #TODO: Primary
874     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
875         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
876             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
877             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
878         """
879         restrict = ""
880         if sense != "both":
881             restrict = " and sense = ? "
882
883         if uniqs:
884             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
885
886         if multi:
887             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
888
889         if self.dataType == "RNA" and splices:
890             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
891             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
892
893         self.dbcon.commit()
894
895
896     #TODO: Primary
897     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
898         """ set the flag fields in the entire dataset.
899         """
900         if uniqs:
901             self.setFlagsInDB("uniqs", flag)
902
903         if multi:
904             self.setFlagsInDB("multi", flag)
905
906         if self.dataType == "RNA" and splices:
907             self.setFlagsInDB("splices", flag)
908
909         self.dbcon.commit()
910
911
912     #TODO: Secondary to setFlags()
913     def setFlagsInDB(self, table, flag):
914         """ set the flag field for every entry in a table.
915         """
916         self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
917
918
919     #TODO: Primary
920     def resetFlags(self, uniqs=True, multi=True, splices=True):
921         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
922         """
923         self.setFlags("", uniqs, multi, splices)
924
925
926     #TODO: Primary
927     def reweighMultireads(self, readList):
928         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
929
930
931     def setSynchronousPragma(self, value="ON"):
932         pass
933
934
935     def setDBcache(self, cache, default=False):
936         pass
937
938
939     def execute(self, statement, returnResults=False):
940         pass
941
942
943     def executeCommit(self, statement):
944         pass
945
946
947     def buildIndex(self, cache=100000):
948         pass
949
950
951     def dropIndex(self):
952         pass
953
954
955     def memSync(self, chrom="", index=False):
956         pass
957
958
959     def copyDBEntriesToMemory(self, dbName, whereClause=""):
960         pass
961
962
963     def copySpliceDBEntriesToMemory(self, whereClause=""):
964         pass
965
966
967 def getReadSense(reverse):
968     if reverse:
969         sense = "-"
970     else:
971         sense = "+"
972
973     return sense
974
975
976 def isSpliceEntry(cigarTupleList):
977     isSplice = False
978     for operation,length in cigarTupleList:
979         if operation == 3:
980             isSplice = True
981             break
982
983     return isSplice
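# e.g. isSpliceEntry([(0, 50), (3, 200), (0, 25)]) -> True, because CIGAR
# operation 3 (N, skipped region) marks a spliced alignment;
# isSpliceEntry([(0, 75)]) -> False.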
984
985
986 def getSpliceRightStart(start, cigarTupleList):
987     offset = 0
988
989     for operation,length in cigarTupleList:
990         if operation == 3:
991             stopL = int(start + offset)
992             startR = int(stopL + length)
993
994             return startR
995         else:
996             offset += length
997
998
999 def getSpliceBounds(start, readsize, cigarTupleList):
1000     offset = 0
1001     #TODO: check operations here - we want to make sure we are adding the correct ones
1002     for operation,length in cigarTupleList:
1003         if operation == 3:
1004             stopL = int(start + offset)
1005             startR = int(stopL + length)
1006             offset = 0
1007         else:
1008             offset += length
1009
1010     stopR = startR + offset
1011
1012     return start, startR, stopL, stopR
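# e.g. (illustrative): getSpliceBounds(1000, 75, [(0, 50), (3, 200), (0, 25)])
# returns (1000, 1250, 1050, 1275): the left block covers [1000, 1050), the
# 200 nt gap is skipped, and the right block covers [1250, 1275).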
1013
1014
1015 def getPairedReadNumberSuffix(read):
1016     readSuffix = ""
1017     if not isPairedRead(read):
1018         return ""
1019
1020     if read.is_read1:
1021         readSuffix = "/1"
1022     elif read.is_read2:
1023         readSuffix = "/2"
1024
1025     return readSuffix
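# e.g. read 1 of a proper pair -> "/1", read 2 -> "/2"; unpaired or
# non-proper-pair reads get an empty suffix.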
1026
1027
1028 def isPairedRead(read):
1029     return read.is_proper_pair and (read.is_read1 or read.is_read2)
1030
1031
1032 def getMismatches(mismatchTag, querySequence="", sense="+"):
1033     output = []
1034     deletionMarker = "^"
1035     position = 0
1036
1037     lengths = re.findall("\d+", mismatchTag)
1038     mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
1039
1040     for mismatchEntry in range(len(mismatchSequences)):
1041         mismatch = mismatchSequences[mismatchEntry]
1042         position = position + int(lengths[mismatchEntry])
1043         if string.find(mismatch, deletionMarker) == 0:
1044             continue
1045
1046         try:
1047             if querySequence:
1048                 genomicNucleotide = querySequence[position]
1049             else:
1050                 genomicNucleotide = "N"
1051
1052             if sense == "-":
1053                 mismatch = getReverseComplement(mismatch)
1054                 genomicNucleotide  = getReverseComplement(genomicNucleotide)
1055
1056             erange1BasedElandCompatiblePosition = int(position + 1)
1057             output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
1058             position += 1
1059         except IndexError:
1060             return ""
1061
1062     return string.join(output, ",")
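# e.g. (illustrative): an MD tag of "10A25" on a plus-strand read whose 11th base
# is "G" produces "A11G" - reference base, 1-based eland-style position, then the
# base observed in the read; deletion entries ("^...") are skipped.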