first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / ReadDataset.py
1 import string
2 import tempfile
3 import shutil
4 import os
5 import re
6 import sys
7 import pysam
8 from array import array
9 from commoncode import getReverseComplement, isSpliceEntry
10
11 currentRDSVersion = "3.0"
12
13 class ReadDatasetError(Exception):
14     pass
15
16
17 class ReadCounter():
18     uniqReadCount = 0
19     multiReadCount = 0.0
20     spliceReadCount = 0
21
22     def __init__(self, multiReadIDCount={}, sense="both"):
23         self.multiReadIDCount = multiReadIDCount
24         self.sense = sense
25
26
27     def __call__(self, read):
28         multiplicity = self.getReadMultiplicity(read)
29         if multiplicity > 1:
30             self.multiReadCount += float(1.0/multiplicity)
31         elif isSpliceEntry(read.cigar):
32             self.spliceReadCount += 1
33         else:
34             self.uniqReadCount += 1
35
36
37     def getReadMultiplicity(self, read):
38         pairReadSuffix = getPairedReadNumberSuffix(read)
39         readID = "%s%s" % (read.qname, pairReadSuffix)
40         try:
41             multiplicity = self.multiReadIDCount[readID]
42         except KeyError:
43             multiplicity = 1
44
45         return multiplicity
46
47
48     def isMulti(self, read):
49         pairReadSuffix = getPairedReadNumberSuffix(read)
50         readID = "%s%s" % (read.qname, pairReadSuffix)
51         return readID in self.multiReadIDCount
52
53
54
55 class MaxCoordFinder(ReadCounter):
56     maxUniqueStart = 0
57     maxMultiStart = 0
58     maxSpliceStart = 0
59
60     def __call__(self, read):
61         if self.isMulti(read):
62             self.maxMultiStart = max(self.maxMultiStart, read.pos)
63         elif isSpliceEntry(read.cigar):
64             self.maxSpliceStart = max(self.maxMultiStart, getSpliceRightStart(read.pos, read.cigar))
65         else:
66             self.maxUniqueStart = max(self.maxMultiStart, read.pos)
67
68
69
70 class ReadDataset():
71     """ Class for storing reads from experiments. Assumes that custom scripts
72     will translate incoming data into a format that can be inserted into the
73     class using the insert* methods. Default class subtype ('DNA') includes
74     tables for unique and multireads, whereas 'RNA' subtype also includes a
75     splices table.
76     """
77
78
79     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
80                  cache=False, reportCount=True):
81         """ creates an rds datafile if initialize is set to true, otherwise
82         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
83         """
84
85         if initialize and datasetType not in ["DNA", "RNA"]:
86             raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
87
88         self.dataType = ""
89         self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
90         self.bamfile = pysam.Samfile(datafile, "rb")
91         self.readsize = None
92         self.totalReadWeight = None
93
94         self.metadata = {"dataType": datasetType}
95
96         if initialize:
97             self.dataType = datasetType
98         else:
99             self.dataType = self.metadata["dataType"]
100
101         try:
102             self.rdsVersion = self.metadata["rdsVersion"]
103         except KeyError:
104             self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
105
106         self.fullReadCounts = self.getFullReadCount()
107         if verbose:
108             self.printRDSInfo(datafile, reportCount, initialize)
109
110
111     def __len__(self):
112         """ return the number of reads in the bam file
113             This is not the same as the original as the original returned total weight.
114         """
115         count = 0
116         references = self.bamfile.references
117         for reference in references:
118             count += self.bamfile.count(reference)
119
120         return count
121
122
123     def getMultiReadIDCounts(self, samFileName, fileMode):
124         try:
125             samfile = pysam.Samfile(samFileName, fileMode)
126         except ValueError:
127             print "samfile index not found"
128             sys.exit(1)
129
130         readIDCounts = {}
131         for read in samfile.fetch(until_eof=True):
132             pairReadSuffix = getPairedReadNumberSuffix(read)
133             readName = "%s%s" % (read.qname, pairReadSuffix)
134             try:
135                 readIDCounts[readName] += 1.0
136             except KeyError:
137                 readIDCounts[readName] = 1.0
138
139         for readID in readIDCounts.keys():
140             if int(readIDCounts[readID]) == 1:
141                 del readIDCounts[readID]
142
143         return readIDCounts
144
145
146     def totalReadWeight(self):
147         """ return the total weight of usable reads in the bam file.
148         """
149         if self.totalReadWeight is None:
150             total = self.fullReadCounts["uniq"]
151             total += self.fullReadCounts["multi"]
152
153             if self.dataType == "RNA":
154                 total += self.fullReadCounts["splice"]
155
156             self.totalReadWeight = int(total)
157
158         return self.totalReadWeight
159
160
161     def __del__(self):
162         """ cleanup copy in local cache, if present.
163         """
164         pass
165
166
167     def printRDSInfo(self, datafile, reportCount, initialize):
168         if initialize:
169             print "INITIALIZED dataset %s" % datafile
170         else:
171             print "dataset %s" % datafile
172
173         print "metadata:"
174         pnameList = self.metadata.keys()
175         pnameList.sort()
176         for pname in pnameList:
177             print "\t" + pname + "\t" + str(self.metadata[pname])
178
179         if reportCount and not initialize:
180             self.printReadCounts()
181
182         print "default cache size is %d pages" % self.getDefaultCacheSize()
183
184
185     def printReadCounts(self):
186         ucount = self.fullReadCounts["uniq"]
187         mcount = self.fullReadCounts["multi"]
188         if self.dataType == "DNA":
189             print "\n%d unique reads and %d multireads" % (ucount, mcount)
190         elif self.dataType == "RNA":
191             scount = self.fullReadCounts["splice"]
192             print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
193
194
195     def cacheDB(self, filename):
196         pass
197
198
199     def saveCacheDB(self, filename):
200         pass
201
202
203     def uncacheDB(self):
204         pass
205
206
207     def attachDB(self, filename, dbName):
208         pass
209
210
211     def detachDB(self, dbName):
212         pass
213
214
215     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
216         pass
217
218
219     def getTables(self, dbName=""):
220         pass
221
222
223     def getSqlCursor(self):
224         pass
225
226
227     def getMemCursor(self):
228         pass
229
230
231     def getFileCursor(self):
232         pass
233
234
235     def initializeTables(self, dbConnection, cache=100000):
236         pass
237
238
239     def getMetadata(self, valueName=""):        
240         return self.metadata
241
242
243     def getReadSize(self):
244         """ This is following the same model as the original where it assumes that all
245             read have the same readsize and that this is reflected by the size of the
246             first read.
247         """
248         if self.readsize is None:
249             bamFileIterator = self.bamfile.fetch(until_eof=True)
250             for read in bamFileIterator:
251                 self.calculateReadsizeFromCigar(read.cigar)
252                 break
253
254         return self.readsize
255
256
257     def calculateReadsizeFromCigar(self, cigar):
258         take = (0, 1) # CIGAR operation (M/match, I/insertion)
259         self.readsize = sum([length for op,length in cigar if op in take])
260
261
262     def getDefaultCacheSize(self):
263         """ returns 0 as cache is going to be deprecated
264         """
265         return 0
266
267
268     def getChromosomes(self, fullChrom=True):
269         """ returns a sorted list of distinct chromosomes in bam file reference list.
270         """
271         if fullChrom:
272             results = list(self.getFullReferenceNames())
273         else:
274             results = list(self.getShortReferenceNames())
275
276         results.sort()
277
278         return results
279
280
281     def getFullReferenceNames(self):
282         """ returns a set of bam file reference names.
283         """
284         return set(self.bamfile.references)
285
286
287     def getShortReferenceNames(self):
288         """ returns a set of bam file reference names after removing first 3 characters.
289         """
290         results = set()
291         references = self.bamfile.references
292         for chromosome in references:
293             shortName = chromosome[3:]
294             if len(shortName.strip()) > 0:
295                 results.add(shortName)
296
297         return results
298
299
300     def getMaxCoordinate(self, chrom, doUniqs=True,
301                          doMulti=False, doSplices=False):
302         """ returns the maximum coordinate for reads on a given chromosome.
303         """
304         maxCoord = 0
305         coordFinder = MaxCoordFinder()
306         self.bamfile.fetch(reference=chrom, callback=coordFinder)
307
308         if doUniqs:
309             maxCoord = coordFinder.maxUniqueStart
310
311         if doSplices:
312             maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
313
314         if doMulti:
315             maxCoord = max(coordFinder.maxMultiStart, maxCoord)
316
317         return maxCoord
318
319
320     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
321                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
322                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
323                      readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
324                      flagLike=False, strand='', combine5p=False):
325         """ First pass of rewrite
326             1) Leave as much in place as possible
327             2) For now treat multis just like uniqs
328         """
329         """
330         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
331         if findallOptimize:
332             selectQuery = "select start, sense, sum(weight)"
333         else:
334             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
335
336         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
337         if doUniqs:
338             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
339             if doMulti:
340                 stmt.append("UNION ALL")
341                 stmt.append(selectQuery)
342                 stmt.append("from multi")
343                 stmt.append(whereQuery)
344                 stmt.append(groupQuery)
345         else:
346             stmt = [selectQuery, "from multi", whereQuery]
347
348         if combine5p:
349             if findallOptimize:
350                 selectQuery = "select start, sense, weight, chrom"
351
352             if doUniqs:
353                 subSelect = [selectQuery, "from uniqs", whereQuery]
354                 if doMulti:
355                     subSelect.append("union all")
356                     subSelect.append(selectQuery)
357                     subSelect.append("from multi")
358                     subSelect.append(whereQuery)
359             else:
360                 subSelect = [selectQuery, "from multi", whereQuery]
361
362             sqlStmt = string.join(subSelect)
363             if findallOptimize:
364                 selectQuery = "select start, sense, sum(weight)"
365
366             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
367                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
368
369         if findallOptimize:
370             if self.memBacked:
371                 self.memcon.row_factory = None
372             else:
373                 self.dbcon.row_factory = None
374
375             stmt.append("order by start")
376         elif readIDDict:
377             stmt.append("order by readID, start")
378         else:
379             stmt.append("order by chrom, start")
380
381         sql = self.getSqlCursor()
382         sqlQuery = string.join(stmt)
383         sql.execute(sqlQuery)
384         """
385
386         # Edits are starting here - trying to ignore the entire sql construct
387         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
388         resultsDict = {}
389         if findallOptimize and chrom != "":
390             readIDDict = False
391             bothEnds = False
392             noSense = False
393             withWeight = True
394             withFlag = False
395             withMismatch = False
396             withID = False
397             withChrom = False
398             withPairID = False
399             fullChrom = True
400             #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
401
402         pairID = 0
403         for read in bamFileIterator:
404             pairReadSuffix = getPairedReadNumberSuffix(read)
405             readID = "%s%s" % (read.qname, pairReadSuffix)
406             if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
407                 continue
408
409             sense = getReadSense(read.is_reverse)
410             if fullChrom:
411                 chrom = self.bamfile.getrname(read.rname)
412             else:
413                 chrom = self.bamfile.getrname(read.rname)[3:]
414
415             if readIDDict:
416                 dictKey = read.qname
417             else:
418                 dictKey = chrom
419
420             newrow = {"start": int(read.pos)}
421             if bothEnds:
422                 if self.readsize is None:
423                     self.calculateReadsizeFromCigar(read.cigar)
424
425                 newrow["stop"] = int(read.pos + self.readsize)
426
427             if not noSense:
428                 newrow["sense"] = sense
429
430             if withWeight:
431                 try:
432                     newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
433                 except KeyError:
434                     newrow["weight"] = 1.0
435
436             if withFlag:
437                 #newrow["flag"] = row["flag"]
438                 pass
439
440             if withMismatch:
441                 try:
442                     mismatchTag = read.opt("MD")
443                     mismatches = getMismatches(mismatchTag, read.seq, sense)
444                 except KeyError:
445                     mismatches = ""
446
447                 newrow["mismatch"] = mismatches
448
449             if withID:
450                 newrow["readID"] = readID
451
452             if withChrom:
453                 newrow["chrom"] = chrom
454
455             if withPairID:
456                 newrow["pairID"] = pairReadSuffix[1:]
457
458             try:
459                 resultsDict[dictKey].append(newrow)
460             except KeyError:
461                 resultsDict[dictKey] = [newrow]
462
463         return resultsDict
464
465
466     def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
467         readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
468
469         if flag != "":
470             #TODO: need to pick a tag code for the erange flag - using ZF for now
471             # although it looks like the flag is really just a label on a region.
472             # since BAM can retrieve based on location if we store the regions then
473             # we can just retrieve it directly from the BAM and we do not need to
474             # flag the individual reads.  The location will be sufficient.
475             try:
476                 if flag != read.opt("ZF"):
477                     readDoesNotMeetCriteria = True
478             except KeyError:
479                 readDoesNotMeetCriteria = True
480
481
482         if hasMismatch:
483             try:
484                 mismatchTag = read.opt("MD")
485             except KeyError:
486                 readDoesNotMeetCriteria = True
487
488         if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
489             readDoesNotMeetCriteria = True
490
491         return readDoesNotMeetCriteria
492
493
494     def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
495         rejectRead = False
496         if readID in self.multiReadIDCounts.keys():
497             if self.multiReadIDCounts[readID] > threshold or not doMulti:
498                 rejectRead = True
499
500         return rejectRead
501
502
503     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
504                        flag="", withWeight=False, withFlag=False, withMismatch=False,
505                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
506                        splitRead=False, hasMismatch=False, flagLike=False, start=None,
507                        stop=None, strand=""):
508         """ First pass of rewrite
509             1) Leave as much in place as possible
510             2) For now treat multis just like regular splices
511         """
512         """
513         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
514         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
515         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
516         sql = self.getSqlCursor()
517
518         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
519         sql.execute(stmt)
520         """
521
522         # Edits are starting here - trying to ignore the entire sql construct
523         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
524         resultsDict = {}
525         pairID = 0
526         for read in bamFileIterator:
527             if not isSpliceEntry(read.cigar):
528                 continue
529
530             sense = getReadSense(read.is_reverse)
531             pairReadSuffix = getPairedReadNumberSuffix(read)
532             readID = "%s%s" % (read.qname, pairReadSuffix)
533             if fullChrom:
534                 chrom = self.bamfile.getrname(read.rname)
535             else:
536                 chrom = self.bamfile.getrname(read.rname)[3:]
537
538             if readIDDict:
539                 dictKey = read.qname
540             else:
541                 dictKey = chrom
542
543             if self.readsize is None:
544                 self.calculateReadsizeFromCigar(read.cigar)
545
546             startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
547             newrow = {"startL": int(startL)}
548             newrow["stopL"] = int(stopL)
549             newrow["startR"] = int(startR)
550             newrow["stopR"] = int(stopR)
551             if not noSense:
552                 newrow["sense"] = sense
553
554             if withWeight:
555                 newrow["weight"] = 1.0
556
557             if withFlag:
558                 #newrow["flag"] = row["flag"]
559                 pass
560
561             if withMismatch:
562                 try:
563                     mismatchTag = read.opt("MD")
564                     mismatches = getMismatches(mismatchTag, read.seq, sense)
565                 except KeyError:
566                     mismatches = ""
567
568                 newrow["mismatch"] = mismatches
569
570             if withID:
571                 newrow["readID"] = readID
572
573             if withChrom:
574                 newrow["chrom"] = chrom
575
576             if withPairID:
577                 newrow["pairID"] = pairID
578
579             if splitRead:
580                 leftDict = newrow.copy()
581                 del leftDict["startR"]
582                 del leftDict["stopR"]
583                 rightDict = newrow
584                 del rightDict["startL"]
585                 del rightDict["stopL"]
586                 try:
587                     resultsDict[dictKey].append(leftDict)
588                 except KeyError:
589                     resultsDict[dictKey] = [leftDict]
590
591                 resultsDict[dictKey].append(rightDict)
592             else:
593                 try:
594                     resultsDict[dictKey].append(newrow)
595                 except KeyError:
596                     resultsDict[dictKey] = [newrow]
597
598         return resultsDict
599
600
601     def getFullReadCount(self):
602         fullReadCount = {"uniq": 0,
603                          "multi": 0,
604                          "splice": 0
605         }
606
607         for chromosome in self.getChromosomes():
608             uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
609             fullReadCount["uniq"] += uniqCount
610             fullReadCount["multi"] += multiCount
611             fullReadCount["splice"] += spliceCount
612
613         return fullReadCount
614
615
616     def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
617                   splices=False, reportCombined=True, sense="both"):
618         """ return read counts for a given region.
619         """
620         ucount = 0
621         mcount = 0
622         scount = 0
623         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
624
625         if uniqs:
626             ucount = readCounter.uniqReadCount
627
628         if multi:
629             mcount = readCounter.multiReadCount
630
631         if splices:
632             scount = readCounter.spliceReadCount
633
634         if reportCombined:
635             total = ucount + mcount + scount
636             return total
637         else:
638             return (ucount, mcount, scount)
639
640
641     def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
642         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
643
644         return readCounter.spliceReadCount
645
646
647     def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
648         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
649
650         return readCounter.uniqReadCount
651
652
653     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
654         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
655
656         return readCounter.multiReadCount
657
658
659     def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
660         readCounter = ReadCounter(self.multiReadIDCounts, sense)
661         self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
662
663         return readCounter
664
665
666     #TODO: Primary
667     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
668         """ get readID's.
669         """
670         stmt = []
671         if uniqs:
672             stmt.append("select readID from uniqs")
673
674         if multi:
675             stmt.append("select readID from multi")
676
677         if splices:
678             stmt.append("select readID from splices")
679
680         if len(stmt) > 0:
681             selectPart = string.join(stmt, " union ")
682         else:
683             selectPart = ""
684
685         limitPart = ""
686         if limit > 0:
687             limitPart = "LIMIT %d" % limit
688
689         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
690         sql = self.getSqlCursor()
691         sql.execute(sqlQuery)
692         result = sql.fetchall()
693
694         if paired:
695             return [x[0].split("/")[0] for x in result]
696         else:
697             return [x[0] for x in result]
698
699
700     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
701         """ returns the uniq and spliced mismatches in a dictionary.
702         """
703         readlen = self.getReadSize()
704         if mischrom:
705             hitChromList = [mischrom]
706         else:
707             hitChromList = self.getChromosomes()
708             hitChromList.sort()
709
710         snpDict = {}
711         for chromosome in hitChromList:
712             if verbose:
713                 print "getting mismatches from chromosome %s" % (chromosome)
714
715             snpDict[chromosome] = []
716             if useSplices and self.dataType == "RNA":
717                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
718                 spliceIDList = spliceDict.keys()
719                 for spliceID in spliceIDList:
720                     spliceEntry = spliceDict[spliceID][0]
721                     startpos = spliceEntry["startL"]
722                     lefthalf = spliceEntry["stopL"]
723                     rightstart = spliceEntry["startR"]
724                     sense = spliceEntry["sense"]
725                     mismatches = spliceEntry["mismatch"]
726                     spMismatchList = mismatches.split(",")
727                     for mismatch in spMismatchList:
728                         if "N" in mismatch:
729                             continue
730
731                         change_len = len(mismatch)
732                         if sense == "+":
733                             change_from = mismatch[0]
734                             change_base = mismatch[change_len-1]
735                             change_pos = int(mismatch[1:change_len-1])
736                         elif sense == "-":
737                             change_from = getReverseComplement(mismatch[0])
738                             change_base = getReverseComplement(mismatch[change_len-1])
739                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
740
741                         firsthalf = int(lefthalf)-int(startpos)+1
742                         secondhalf = 0
743                         if int(change_pos) <= int(firsthalf):
744                             change_at = startpos + change_pos - 1
745                         else:
746                             secondhalf = change_pos - firsthalf
747                             change_at = rightstart + secondhalf
748
749                         snpDict[chromosome].append([startpos, change_at, change_base, change_from])
750
751             hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
752             if chromosome not in hitDict.keys():
753                 continue
754
755             for readEntry in hitDict[chromosome]:
756                 start = readEntry["start"]
757                 sense = readEntry["sense"]
758                 mismatches = readEntry["mismatch"]
759                 mismatchList = mismatches.split(",")
760                 for mismatch in mismatchList:
761                     if "N" in mismatch:
762                         continue
763
764                     change_len = len(mismatch)
765                     if sense == "+":
766                         change_from = mismatch[0]
767                         change_base = mismatch[change_len-1]
768                         change_pos = int(mismatch[1:change_len-1])
769                     elif sense == "-":
770                         change_from = getReverseComplement(mismatch[0])
771                         change_base = getReverseComplement(mismatch[change_len-1])
772                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
773
774                     change_at = start + change_pos - 1
775                     snpDict[chromosome].append([start, change_at, change_base, change_from])
776
777         return snpDict
778
779
780     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
781                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
782                         keepStrand="both", shiftValue=0):
783         """return a profile of the chromosome as an array of per-base read coverage....
784             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
785             Will also shift position of unique and multireads (but not splices) if shift is a natural number
786         """
787         metadata = self.getMetadata()
788         try:
789             readlen = int(metadata["readsize"])
790         except KeyError:
791             readlen = 0
792
793         dataType = metadata["dataType"]
794         scale = 1. / normalizationFactor
795         shift = {}
796         shift["+"] = int(shiftValue)
797         shift["-"] = -1 * int(shiftValue)
798
799         if cstop > 0:
800             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
801         else:
802             lastNT = cstop - cstart + readlen + shift["+"]
803
804         chromModel = array("f",[0.] * lastNT)
805         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
806         if cstart < 0:
807             cstart = 0
808
809         for readEntry in hitDict[chromosome]:
810             hstart = readEntry["start"]
811             sense =  readEntry ["sense"]
812             weight = readEntry["weight"]
813             hstart = hstart - cstart + shift[sense]
814             for currentpos in range(hstart,hstart+readlen):
815                 try:
816                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
817                         chromModel[currentpos] += scale * weight
818                     elif sense == "-" and keepStrand != "plusOnly":
819                         chromModel[currentpos] -= scale * weight
820                 except:
821                     continue
822
823         del hitDict
824         if useSplices and dataType == "RNA":
825             if cstop > 0:
826                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
827             else:
828                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
829
830             if chromosome in spliceDict:
831                 for spliceEntry in spliceDict[chromosome]:
832                     Lstart = spliceEntry["startL"]
833                     Lstop = spliceEntry["stopL"]
834                     Rstart = spliceEntry["startR"]
835                     Rstop = spliceEntry["stopR"]
836                     rsense = spliceEntry["sense"]
837                     if (Rstop - cstart) < lastNT:
838                         for index in range(abs(Lstop - Lstart)):
839                             currentpos = Lstart - cstart + index
840                             # we only track unique splices
841                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
842                                 chromModel[currentpos] += scale
843                             elif rsense == "-" and keepStrand != "plusOnly":
844                                 chromModel[currentpos] -= scale
845
846                         for index in range(abs(Rstop - Rstart)):
847                             currentpos = Rstart - cstart + index
848                             # we only track unique splices
849                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
850                                 chromModel[currentpos] += scale
851                             elif rsense == "-" and keepStrand != "plusOnly":
852                                 chromModel[currentpos] -= scale
853
854             del spliceDict
855
856         return chromModel
857
858
859     def insertMetadata(self, valuesList):
860         for (pname, pvalue) in valuesList:
861             self.metadata[pname] = pvalue
862
863
864     def updateMetadata(self, pname, newValue, originalValue=""):
865         if originalValue != "":
866             if self.metadata[pname] == originalValue:
867                 self.metadata[pname] = newValue
868         else:
869             self.metadata[pname] = newValue
870
871
872     #TODO: Primary
873     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
874         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
875             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
876             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
877         """
878         restrict = ""
879         if sense != "both":
880             restrict = " and sense = ? "
881
882         if uniqs:
883             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
884
885         if multi:
886             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
887
888         if self.dataType == "RNA" and splices:
889             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
890             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
891
892         self.dbcon.commit()
893
894
895     #TODO: Primary
896     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
897         """ set the flag fields in the entire dataset.
898         """
899         if uniqs:
900             self.setFlagsInDB("uniqs", flag)
901
902         if multi:
903             self.setFlagsInDB("multi", flag)
904
905         if self.dataType == "RNA" and splices:
906             self.setFlagsInDB("splices", flag)
907
908         self.dbcon.commit()
909
910
911     #TODO: Secondary to setFlags()
912     def setFlagsInDB(self, table, flag):
913         """ set the flag field for every entry in a table.
914         """
915         self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
916
917
918     #TODO: Primary
919     def resetFlags(self, uniqs=True, multi=True, splices=True):
920         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
921         """
922         self.setFlags("", uniqs, multi, splices)
923
924
925     #TODO: Primary
926     def reweighMultireads(self, readList):
927         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
928
929
930     def setSynchronousPragma(self, value="ON"):
931         pass
932
933
934     def setDBcache(self, cache, default=False):
935         pass
936
937
938     def execute(self, statement, returnResults=False):
939         pass
940
941
942     def executeCommit(self, statement):
943         pass
944
945
946     def buildIndex(self, cache=100000):
947         pass
948
949
950     def dropIndex(self):
951         pass
952
953
954     def memSync(self, chrom="", index=False):
955         pass
956
957
958     def copyDBEntriesToMemory(self, dbName, whereClause=""):
959         pass
960
961
962     def copySpliceDBEntriesToMemory(self, whereClause=""):
963         pass
964
965
966 def getReadSense(reverse):
967     if reverse:
968         sense = "-"
969     else:
970         sense = "+"
971
972     return sense
973
974
975 def getSpliceRightStart(start, cigarTupleList):
976     offset = 0
977
978     for operation,length in cigarTupleList:
979         if operation == 3:
980             stopL = int(start + offset)
981             startR = int(stopL + length)
982
983             return startR
984         else:
985             offset += length
986
987
988 def getSpliceBounds(start, readsize, cigarTupleList):
989     offset = 0
990     #TODO: check operations here - we want to make sure we are adding the correct ones
991     for operation,length in cigarTupleList:
992         if operation == 3:
993             stopL = int(start + offset)
994             startR = int(stopL + length)
995             offset = 0
996         else:
997             offset += length
998
999     stopR = startR + offset
1000
1001     return start, startR, stopL, stopR
1002
1003
1004 def getPairedReadNumberSuffix(read):
1005     readSuffix = ""
1006     if not isPairedRead(read):
1007         return ""
1008
1009     if read.is_read1:
1010         readSuffix = "/1"
1011     elif read.is_read2:
1012         readSuffix = "/2"
1013
1014     return readSuffix
1015
1016
1017 def isPairedRead(read):
1018     return read.is_proper_pair and (read.is_read1 or read.is_read2)
1019
1020
1021 def getMismatches(mismatchTag, querySequence="", sense="+"):
1022     output = []
1023     deletionMarker = "^"
1024     position = 0
1025
1026     lengths = re.findall("\d+", mismatchTag)
1027     mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
1028
1029     for mismatchEntry in range(len(mismatchSequences)):
1030         mismatch = mismatchSequences[mismatchEntry]
1031         position = position + int(lengths[mismatchEntry])
1032         if string.find(mismatch, deletionMarker) == 0:
1033             continue
1034
1035         try:
1036             if querySequence:
1037                 genomicNucleotide = querySequence[position]
1038             else:
1039                 genomicNucleotide = "N"
1040
1041             if sense == "-":
1042                 mismatch = getReverseComplement(mismatch)
1043                 genomicNucleotide  = getReverseComplement(genomicNucleotide)
1044
1045             erange1BasedElandCompatiblePosition = int(position + 1)
1046             output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
1047             position += 1
1048         except IndexError:
1049             return ""
1050
1051     return string.join(output, ",")