From: Sean Upchurch Date: Mon, 2 Jul 2012 17:17:00 +0000 (-0700) Subject: development release: conversion of ReadDataset to use BAM files X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=commitdiff_plain;h=cfc5602b26323ad2365295145e3f6c622d912eb4 development release: conversion of ReadDataset to use BAM files --- diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py index 730c20b..0b4946b 100644 --- a/MakeBamFromRds.py +++ b/MakeBamFromRds.py @@ -176,7 +176,7 @@ def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, d pass else: for read in spliceDict[chrom]: - if fastaSequenceDict.has_key(chrom): + if chrom in fastaSequenceDict: read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"]) noncanonicalSplices += noncanonical @@ -205,7 +205,7 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength): alignedRead.rname = outfile.references.index(chrom) - if outputDict.has_key("startL"): + if "startL" in outputDict: startL = outputDict["startL"] - 1 stopL = outputDict["stopL"] - 1 startR = outputDict["startR"] - 1 @@ -217,6 +217,13 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength): alignedRead.pos = outputDict["start"] - 1 alignedRead.cigar = [(0, readlength)] + #TODO: add sequence info + #cistematic.core.retrieveSequence(genome, chrom, start, stop, sense="F", db="") + #ooooo. let's do this with PyGr + + #TODO: add quality - max for each base, but default is to not include just like current + # rdsToFastq() sets everything to max + if paired: if pairID == "1": alignedRead.is_read1 = True @@ -227,7 +234,7 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength): else: pass - if outputDict.has_key("mismatch"): + if "mismatch" in outputDict: mismatchTag = getMismatches(outputDict["mismatch"]) if mismatchTag: tagList.append(("MD", str(mismatchTag))) @@ -300,7 +307,7 @@ def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""): spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "") spliceJunction = spliceJunction.upper() print spliceJunction - if spliceSense.has_key(spliceJunction): + if spliceJunction in spliceSense: sense = spliceSense[spliceJunction] else: noncanonical += 1 diff --git a/MakeRdsFromBam.py b/MakeRdsFromBam.py index 13c9013..b16bccd 100644 --- a/MakeRdsFromBam.py +++ b/MakeRdsFromBam.py @@ -418,13 +418,13 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False): def getSpliceBounds(start, readsize, cigarTupleList): - stopR = int(start + readsize) offset = 0 for operation,length in cigarTupleList: if operation == 3: stopL = int(start + offset) startR = int(stopL + length) + stopR = int(startR + readsize - offset) return start, startR, stopL, stopR else: diff --git a/ReadDataset.py b/ReadDataset.py index 4b51471..9a795c8 100644 --- a/ReadDataset.py +++ b/ReadDataset.py @@ -1,19 +1,74 @@ -import sqlite3 as sqlite import string import tempfile import shutil import os +import re +import sys +import pysam from array import array -from commoncode import getReverseComplement, getConfigParser, getConfigOption +from commoncode import getReverseComplement -currentRDSVersion = "2.1" +currentRDSVersion = "3.0" class ReadDatasetError(Exception): pass -class ReadDataset(): +class ReadCounter(): + uniqReadCount = 0 + multiReadCount = 0.0 + spliceReadCount = 0 + + def __init__(self, multiReadIDCount={}, sense="both"): + self.multiReadIDCount = multiReadIDCount + self.sense = sense + + + def __call__(self, read): + multiplicity = 
self.getReadMultiplicity(read) + if multiplicity > 1: + self.multiReadCount += float(1.0/multiplicity) + elif isSpliceEntry(read.cigar): + self.spliceReadCount += 1 + else: + self.uniqReadCount += 1 + + + def getReadMultiplicity(self, read): + pairReadSuffix = getPairedReadNumberSuffix(read) + readID = "%s%s" % (read.qname, pairReadSuffix) + try: + multiplicity = self.multiReadIDCount[readID] + except KeyError: + multiplicity = 1 + + return multiplicity + + + def isMulti(self, read): + pairReadSuffix = getPairedReadNumberSuffix(read) + readID = "%s%s" % (read.qname, pairReadSuffix) + return readID in self.multiReadIDCount + + + +class MaxCoordFinder(ReadCounter): + maxUniqueStart = 0 + maxMultiStart = 0 + maxSpliceStart = 0 + + def __call__(self, read): + if self.isMulti(read): + self.maxMultiStart = max(self.maxMultiStart, read.pos) + elif isSpliceEntry(read.cigar): + self.maxSpliceStart = max(self.maxMultiStart, getSpliceRightStart(read.pos, read.cigar)) + else: + self.maxUniqueStart = max(self.maxMultiStart, read.pos) + + + +class BamReadDataset(): """ Class for storing reads from experiments. Assumes that custom scripts will translate incoming data into a format that can be inserted into the class using the insert* methods. Default class subtype ('DNA') includes @@ -21,75 +76,93 @@ class ReadDataset(): splices table. """ + def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, cache=False, reportCount=True): """ creates an rds datafile if initialize is set to true, otherwise will append to existing tables. datasetType can be either 'DNA' or 'RNA'. """ - self.dbcon = "" - self.memcon = "" - self.dataType = "" - self.rdsVersion = currentRDSVersion - self.memBacked = False - self.memChrom = "" - self.memCursor = "" - self.cachedDBFile = "" if initialize and datasetType not in ["DNA", "RNA"]: raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'") - if cache: - if verbose: - print "caching ...." + self.dataType = "" + self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb") + self.bamfile = pysam.Samfile(datafile, "rb") + self.readsize = None + self.totalReadWeight = None - self.cacheDB(datafile) - dbFile = self.cachedDBFile - else: - dbFile = datafile + self.metadata = {"dataType": datasetType} - self.dbcon = sqlite.connect(dbFile) - self.dbcon.row_factory = sqlite.Row - self.dbcon.execute("PRAGMA temp_store = MEMORY") if initialize: self.dataType = datasetType - self.initializeTables(self.dbcon) else: - metadata = self.getMetadata("dataType") - self.dataType = metadata["dataType"] + self.dataType = self.metadata["dataType"] try: - metadata = self.getMetadata("rdsVersion") - self.rdsVersion = metadata["rdsVersion"] - except: - try: - self.insertMetadata([("rdsVersion", float(currentRDSVersion))]) - except IOError: - print "could not add rdsVersion - read-only ?" - self.rdsVersion = "pre-1.0" + self.rdsVersion = self.metadata["rdsVersion"] + except KeyError: + self.insertMetadata([("rdsVersion", float(currentRDSVersion))]) + self.fullReadCounts = self.getFullReadCount() if verbose: self.printRDSInfo(datafile, reportCount, initialize) def __len__(self): - """ return the number of usable reads in the dataset. + """ return the number of reads in the bam file + This is not the same as the original as the original returned total weight. 
""" - total = self.getUniqsCount() - total += self.getMultiCount() + count = 0 + references = self.bamfile.references + for reference in references: + count += self.bamfile.count(reference) + + return count + - if self.dataType == "RNA": - total += self.getSplicesCount() + def getMultiReadIDCounts(self, samFileName, fileMode): + try: + samfile = pysam.Samfile(samFileName, fileMode) + except ValueError: + print "samfile index not found" + sys.exit(1) + + readIDCounts = {} + for read in samfile.fetch(until_eof=True): + pairReadSuffix = getPairedReadNumberSuffix(read) + readName = "%s%s" % (read.qname, pairReadSuffix) + try: + readIDCounts[readName] += 1.0 + except KeyError: + readIDCounts[readName] = 1.0 - total = int(total) + for readID in readIDCounts.keys(): + if int(readIDCounts[readID]) == 1: + del readIDCounts[readID] - return total + return readIDCounts + + + def totalReadWeight(self): + """ return the total weight of usable reads in the bam file. + """ + if self.totalReadWeight is None: + total = self.fullReadCounts["uniq"] + total += self.fullReadCounts["multi"] + + if self.dataType == "RNA": + total += self.fullReadCounts["splice"] + + self.totalReadWeight = int(total) + + return self.totalReadWeight def __del__(self): """ cleanup copy in local cache, if present. """ - if self.cachedDBFile != "": - self.uncacheDB() + pass def printRDSInfo(self, datafile, reportCount, initialize): @@ -98,237 +171,129 @@ class ReadDataset(): else: print "dataset %s" % datafile - metadata = self.getMetadata() print "metadata:" - pnameList = metadata.keys() + pnameList = self.metadata.keys() pnameList.sort() for pname in pnameList: - print "\t" + pname + "\t" + metadata[pname] + print "\t" + pname + "\t" + str(self.metadata[pname]) if reportCount and not initialize: self.printReadCounts() print "default cache size is %d pages" % self.getDefaultCacheSize() - if self.hasIndex(): - print "found index" - else: - print "not indexed" def printReadCounts(self): - ucount = self.getUniqsCount() - mcount = self.getMultiCount() + ucount = self.fullReadCounts["uniq"] + mcount = self.fullReadCounts["multi"] if self.dataType == "DNA": print "\n%d unique reads and %d multireads" % (ucount, mcount) elif self.dataType == "RNA": - scount = self.getSplicesCount() + scount = self.fullReadCounts["splice"] print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount) def cacheDB(self, filename): - """ copy geneinfoDB to a local cache. - """ - configParser = getConfigParser() - cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp") - tempfile.tempdir = cisTemp - self.cachedDBFile = "%s.db" % tempfile.mktemp() - shutil.copyfile(filename, self.cachedDBFile) + pass def saveCacheDB(self, filename): - """ copy geneinfoDB to a local cache. - """ - shutil.copyfile(self.cachedDBFile, filename) + pass def uncacheDB(self): - """ delete geneinfoDB from local cache. - """ - global cachedDBFile - if self.cachedDBFile != "": - try: - os.remove(self.cachedDBFile) - except: - print "could not delete %s" % self.cachedDBFile - - self.cachedDB = "" + pass def attachDB(self, filename, dbName): - """ attach another database file to the readDataset. - """ - stmt = "attach '%s' as %s" % (filename, dbName) - self.execute(stmt) + pass def detachDB(self, dbName): - """ detach a database file to the readDataset. 
- """ - stmt = "detach %s" % (dbName) - self.execute(stmt) + pass def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""): - """ import into current RDS the table (with columns destcolumns, - with default all columns) from the database file asname, - using the column specification of ascolumns (default all). - """ - stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table) - if flagged != "": - stmt += " where flag = '%s' " % flagged - - self.executeCommit(stmt) + pass def getTables(self, dbName=""): - """ get a list of table names in a particular database file. - """ - resultList = [] - sql = self.getSqlCursor() - - if dbName != "": - dbName = "%s." % dbName - - stmt = "select name from %ssqlite_master where type='table'" % dbName - sql.execute(stmt) - results = sql.fetchall() - - for row in results: - resultList.append(row["name"]) - - return resultList + pass def getSqlCursor(self): - if self.memBacked: - sql = self.getMemCursor() - else: - sql = self.getFileCursor() - - return sql + pass def getMemCursor(self): - """ returns a cursor to memory database for low-level (SQL) - access to the data. - """ - return self.memcon.cursor() + pass def getFileCursor(self): - """ returns a cursor to file database for low-level (SQL) - access to the data. - """ - return self.dbcon.cursor() + pass - def hasIndex(self): - """ return True if the RDS file has at least one index. - """ - stmt = "select count(*) from sqlite_master where type='index'" - count = int(self.execute(stmt, returnResults=True)[0][0]) + def initializeTables(self, dbConnection, cache=100000): + pass - return count > 0 + def getMetadata(self, valueName=""): + return self.metadata - def initializeTables(self, dbConnection, cache=100000): - """ creates table schema in a database connection, which is - typically a database file or an in-memory database. - """ - dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache) - dbConnection.execute("create table metadata (name varchar, value varchar)") - dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType) - positionSchema = "start int, stop int" - tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema - dbConnection.execute("create table uniqs %s" % tableSchema) - dbConnection.execute("create table multi %s" % tableSchema) - if self.dataType == "RNA": - positionSchema = "startL int, stopL int, startR int, stopR int" - tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema - dbConnection.execute("create table splices %s" % tableSchema) - - positionSchema = "startL int, stopL int, startR int, stopR int" - tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema - dbConnection.execute("create table multisplices %s" % tableSchema) - - dbConnection.commit() - - - def getMetadata(self, valueName=""): - """ returns a dictionary of metadata. - """ - whereClause = "" - resultsDict = {} - if valueName != "": - whereClause = " where name='%s'" % valueName + def getReadSize(self): + """ This is following the same model as the original where it assumes that all + read have the same readsize and that this is reflected by the size of the + first read. 
+ """ + if self.readsize is None: + bamFileIterator = self.bamfile.fetch(until_eof=True) + for read in bamFileIterator: + self.calculateReadsizeFromCigar(read.cigar) + break - sql = self.getSqlCursor() + return self.readsize - sql.execute("select name, value from metadata %s" % whereClause) - results = sql.fetchall() - for row in results: - parameterName = row["name"] - parameterValue = row["value"] - if parameterName not in resultsDict: - resultsDict[parameterName] = parameterValue - else: - trying = True - index = 2 - while trying: - newName = string.join([parameterName, str(index)], ":") - if newName not in resultsDict: - resultsDict[newName] = parameterValue - trying = False + def calculateReadsizeFromCigar(self, cigar): + take = (0, 1) # CIGAR operation (M/match, I/insertion) + self.readsize = sum([length for op,length in cigar if op in take]) - index += 1 - return resultsDict + def getDefaultCacheSize(self): + """ returns 0 as cache is going to be deprecated + """ + return 0 - def getReadSize(self): - """ returns readsize if defined in metadata. + def getChromosomes(self, fullChrom=True): + """ returns a sorted list of distinct chromosomes in bam file reference list. """ - metadata = self.getMetadata() - if "readsize" not in metadata: - raise ReadDatasetError("no readsize parameter defined") + if fullChrom: + results = list(self.getFullReferenceNames()) else: - readSize = metadata["readsize"] - if "import" in readSize: - readSize = readSize.split()[0] + results = list(self.getShortReferenceNames()) - readSize = int(readSize) - if readSize < 0: - raise ReadDatasetError("readsize is negative") + results.sort() - return readSize + return results - def getDefaultCacheSize(self): - """ returns the default cache size. + def getFullReferenceNames(self): + """ returns a set of bam file reference names. """ - return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0]) + return set(self.bamfile.references) - def getChromosomes(self, table="uniqs", fullChrom=True): - """ returns a sorted list of distinct chromosomes in table. + def getShortReferenceNames(self): + """ returns a set of bam file reference names after removing first 3 characters. """ - statement = "select distinct chrom from %s" % table - sql = self.getSqlCursor() - - sql.execute(statement) - results = [] - for row in sql: - if fullChrom: - if row["chrom"] not in results: - results.append(row["chrom"]) - else: - shortName = row["chrom"][3:] - if len(shortName.strip()) > 0 and shortName not in results: - results.append(shortName) - - results.sort() + results = set() + references = self.bamfile.references + for chromosome in references: + shortName = chromosome[3:] + if len(shortName.strip()) > 0: + results.add(shortName) return results @@ -338,30 +303,17 @@ class ReadDataset(): """ returns the maximum coordinate for reads on a given chromosome. 
""" maxCoord = 0 + coordFinder = MaxCoordFinder() + self.bamfile.fetch(reference=chrom, callback=coordFinder) if doUniqs: - maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs") + maxCoord = coordFinder.maxUniqueStart if doSplices: - spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR") - maxCoord = max(spliceMax, maxCoord) + maxCoord = max(coordFinder.maxSpliceStart, maxCoord) if doMulti: - multiMax = self.getMaxStartCoordinateInTable(chrom, "multi") - maxCoord = max(multiMax, maxCoord) - - return maxCoord - - - def getMaxStartCoordinateInTable(self, chrom, table, startField="start"): - maxCoord = 0 - sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom) - sql = self.getSqlCursor() - try: - sql.execute(sqlStatement) - maxCoord = int(sql.fetchall()[0][0]) - except: - print "couldn't retrieve coordMax for chromosome %s" % chrom + maxCoord = max(coordFinder.maxMultiStart, maxCoord) return maxCoord @@ -369,16 +321,13 @@ class ReadDataset(): def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="", flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False, withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False, - readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False, + readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False, flagLike=False, strand='', combine5p=False): - """ returns a dictionary of reads in a variety of formats - and which can be restricted by chromosome or custom-flag. - Returns unique reads by default, but can return multireads - with doMulti set to True. - + """ First pass of rewrite + 1) Leave as much in place as possible + 2) For now treat multis just like uniqs + """ """ - #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict - whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike) if findallOptimize: selectQuery = "select start, sense, sum(weight)" @@ -433,160 +382,134 @@ class ReadDataset(): sql = self.getSqlCursor() sqlQuery = string.join(stmt) sql.execute(sqlQuery) + """ + # Edits are starting here - trying to ignore the entire sql construct + bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop) resultsDict = {} - if findallOptimize: - resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql] - if self.memBacked: - self.memcon.row_factory = sqlite.Row - else: - self.dbcon.row_factory = sqlite.Row - else: - currentChrom = "" - currentReadID = "" - pairID = 0 - for row in sql: - readID = row["readID"] - if fullChrom: - chrom = row["chrom"] - else: - chrom = row["chrom"][3:] - - if not readIDDict and chrom != currentChrom: - resultsDict[chrom] = [] - currentChrom = chrom - dictKey = chrom - elif readIDDict: - theReadID = readID - if "::" in readID: - theReadID = readID.split("::")[0] - - if "/" in theReadID and withPairID: - (theReadID, pairID) = readID.split("/") - - if theReadID != currentReadID: - resultsDict[theReadID] = [] - currentReadID = theReadID - dictKey = theReadID - - newrow = {"start": int(row["start"])} - if bothEnds: - newrow["stop"] = int(row["stop"]) - - if not noSense: - newrow["sense"] = row["sense"] - - if withWeight: - newrow["weight"] = float(row["weight"]) - - if withFlag: - newrow["flag"] = row["flag"] - - if withMismatch: - newrow["mismatch"] = row["mismatch"] - - if withID: - newrow["readID"] = 
readID - - if withChrom: - newrow["chrom"] = chrom - - if withPairID: - newrow["pairID"] = pairID - - resultsDict[dictKey].append(newrow) - - return resultsDict - - - def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False): - if splice: - startText = "startL" - stopText = "stopR" - else: - startText = "start" - stopText = "stop" + if findallOptimize and chrom != "": + readIDDict = False + bothEnds = False + noSense = False + withWeight = True + withFlag = False + withMismatch = False + withID = False + withChrom = False + withPairID = False + fullChrom = True + #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)] + + pairID = 0 + for read in bamFileIterator: + pairReadSuffix = getPairedReadNumberSuffix(read) + readID = "%s%s" % (read.qname, pairReadSuffix) + if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti): + continue - whereClause = [] - if chrom != "" and chrom != self.memChrom: - whereClause.append("chrom = '%s'" % chrom) + sense = getReadSense(read.is_reverse) + if fullChrom: + chrom = self.bamfile.getrname(read.rname) + else: + chrom = self.bamfile.getrname(read.rname)[3:] - if flag != "": - if flagLike: - flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") - whereClause.append(flagLikeClause) + if readIDDict: + dictKey = read.qname else: - whereClause.append("flag = '%s'" % flag) + dictKey = chrom - if start > -1: - whereClause.append("%s > %d" % (startText, start)) + newrow = {"start": int(read.pos)} + if bothEnds: + if self.readsize is None: + self.calculateReadsizeFromCigar(read.cigar) - if stop > -1: - whereClause.append("%s < %d" % (stopText, stop)) + newrow["stop"] = int(read.pos + self.readsize) - if len(readLike) > 0: - readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") - whereClause.append(readIDClause) + if not noSense: + newrow["sense"] = sense - if hasMismatch: - whereClause.append("mismatch != ''") + if withWeight: + try: + newrow["weight"] = 1.0/self.multiReadIDCounts[readID] + except KeyError: + newrow["weight"] = 1.0 - if strand in ["+", "-"]: - whereClause.append("sense = '%s'" % strand) + if withFlag: + #newrow["flag"] = row["flag"] + pass - if len(whereClause) > 0: - whereStatement = string.join(whereClause, " and ") - whereQuery = "where %s" % whereStatement - else: - whereQuery = "" + if withMismatch: + try: + mismatchTag = read.opt("MD") + mismatches = getMismatches(mismatchTag, read.seq, sense) + except KeyError: + mismatches = "" - return whereQuery + newrow["mismatch"] = mismatches + if withID: + newrow["readID"] = readID - def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False): + if withChrom: + newrow["chrom"] = chrom - selectClause = [baseSelect] - if bothEnds: - selectClause.append("stop") + if withPairID: + newrow["pairID"] = pairID + + try: + resultsDict[dictKey].append(newrow) + except KeyError: + resultsDict[dictKey] = [newrow] - if not noSense: - selectClause.append("sense") + return resultsDict - if withWeight: - selectClause.append("weight") - if withFlag: - selectClause.append("flag") + def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""): + readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti) - if withMismatch: - selectClause.append("mismatch") + if flag != "": + #TODO: need to pick a tag 
code for the erange flag - using ZF for now + # although it looks like the flag is really just a label on a region. + # since BAM can retrieve based on location if we store the regions then + # we can just retrieve it directly from the BAM and we do not need to + # flag the individual reads. The location will be sufficient. + try: + if flag != read.opt("ZF"): + readDoesNotMeetCriteria = True + except KeyError: + readDoesNotMeetCriteria = True - selectQuery = string.join(selectClause, ",") - return selectQuery + if hasMismatch: + try: + mismatchTag = read.opt("MD") + except KeyError: + readDoesNotMeetCriteria = True + if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand: + readDoesNotMeetCriteria = True - def getReadGroupQuery(self, findallOptimize, limit, combine5p): - groupBy = [] - if findallOptimize: - groupBy = ["GROUP BY start, sense"] + return readDoesNotMeetCriteria - if limit > 0 and not combine5p: - groupBy.append("LIMIT %d" % limit) - groupQuery = string.join(groupBy) + def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10): + rejectRead = False + if readID in self.multiReadIDCounts.keys(): + if self.multiReadIDCounts[readID] > threshold or not doMulti: + rejectRead = True - return groupQuery + return rejectRead def getSplicesDict(self, noSense=False, fullChrom=False, chrom="", flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False, withChrom=False, withPairID=False, readIDDict=False, - splitRead=False, hasMismatch=False, flagLike=False, start=-1, - stop=-1, strand=""): - """ returns a dictionary of spliced reads in a variety of - formats and which can be restricted by chromosome or custom-flag. - Returns unique spliced reads for now. + splitRead=False, hasMismatch=False, flagLike=False, start=None, + stop=None, strand=""): + """ First pass of rewrite + 1) Leave as much in place as possible + 2) For now treat multis just like regular splices + """ """ whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True) selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID" @@ -595,47 +518,55 @@ class ReadDataset(): stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery) sql.execute(stmt) - currentReadID = "" - currentChrom = "" + """ + + # Edits are starting here - trying to ignore the entire sql construct + bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop) resultsDict = {} - for row in sql: - pairID = 0 - readID = row["readID"] + pairID = 0 + for read in bamFileIterator: + if not isSpliceEntry(read.cigar): + continue + + sense = getReadSense(read.is_reverse) + pairReadSuffix = getPairedReadNumberSuffix(read) + readID = "%s%s" % (read.qname, pairReadSuffix) if fullChrom: - chrom = row["chrom"] + chrom = self.bamfile.getrname(read.rname) else: - chrom = row["chrom"][3:] + chrom = self.bamfile.getrname(read.rname)[3:] - if not readIDDict and chrom != currentChrom: - resultsDict[chrom] = [] - currentChrom = chrom + if readIDDict: + dictKey = read.qname + else: dictKey = chrom - elif readIDDict: - if "/" in readID: - (theReadID, pairID) = readID.split("/") - else: - theReadID = readID - - if theReadID != currentReadID: - resultsDict[theReadID] = [] - currentReadID = theReadID - dictKey = theReadID - - newrow = {"startL": int(row["startL"])} - newrow["stopL"] = int(row["stopL"]) - newrow["startR"] = int(row["startR"]) - newrow["stopR"] = int(row["stopR"]) + + if self.readsize is None: + self.calculateReadsizeFromCigar(read.cigar) + + 
startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar) + newrow = {"startL": int(startL)} + newrow["stopL"] = int(stopL) + newrow["startR"] = int(startR) + newrow["stopR"] = int(stopR) if not noSense: - newrow["sense"] = row["sense"] + newrow["sense"] = sense if withWeight: - newrow["weight"] = float(row["weight"]) + newrow["weight"] = 1.0 if withFlag: - newrow["flag"] = row["flag"] + #newrow["flag"] = row["flag"] + pass if withMismatch: - newrow["mismatch"] = row["mismatch"] + try: + mismatchTag = read.opt("MD") + mismatches = getMismatches(mismatchTag, read.seq, sense) + except KeyError: + mismatches = "" + + newrow["mismatch"] = mismatches if withID: newrow["readID"] = readID @@ -653,42 +584,53 @@ class ReadDataset(): rightDict = newrow del rightDict["startL"] del rightDict["stopL"] - resultsDict[dictKey].append(leftDict) + try: + resultsDict[dictKey].append(leftDict) + except KeyError: + resultsDict[dictKey] = [leftDict] + resultsDict[dictKey].append(rightDict) else: - resultsDict[dictKey].append(newrow) + try: + resultsDict[dictKey].append(newrow) + except KeyError: + resultsDict[dictKey] = [newrow] return resultsDict - def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False, + def getFullReadCount(self): + fullReadCount = {"uniq": 0, + "multi": 0, + "splice": 0 + } + + for chromosome in self.getChromosomes(): + uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True) + fullReadCount["uniq"] += uniqCount + fullReadCount["multi"] += multiCount + fullReadCount["splice"] += spliceCount + + return fullReadCount + + + def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False, splices=False, reportCombined=True, sense="both"): """ return read counts for a given region. """ ucount = 0 mcount = 0 scount = 0 - restrict = "" - if sense in ["+", "-"]: - restrict = " sense ='%s' " % sense + readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense) if uniqs: - try: - ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict)) - except: - ucount = 0 + ucount = readCounter.uniqReadCount if multi: - try: - mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict)) - except: - mcount = 0 + mcount = readCounter.multiReadCount if splices: - try: - scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict)) - except: - scount = 0 + scount = readCounter.spliceReadCount if reportCombined: total = ucount + mcount + scount @@ -697,72 +639,32 @@ class ReadDataset(): return (ucount, mcount, scount) - def getTotalCounts(self, chrom="", rmin="", rmax=""): - """ return read counts for a given region. - """ - return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both") - - - def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"): - """ returns the number of row in the specified table. 
- """ - whereClause = [] - count = 0 - - if chrom !="" and chrom != self.memChrom: - whereClause = ["chrom='%s'" % chrom] - - if rmin != "": - whereClause.append("%s >= %s" % (startField, str(rmin))) - - if rmax != "": - whereClause.append("%s <= %s" % (startField, str(rmax))) + def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False): + readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict) - if restrict != "": - whereClause.append(restrict) - - if len(whereClause) > 0: - whereStatement = string.join(whereClause, " and ") - whereQuery = "where %s" % whereStatement - else: - whereQuery = "" + return readCounter.spliceReadCount - sql = self.getSqlCursor() - if distinct: - sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery)) - else: - sql.execute("select sum(weight) from %s %s" % (table, whereQuery)) + def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False): + readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict) - result = sql.fetchone() + return readCounter.uniqReadCount - try: - count = int(result[0]) - except: - count = 0 - return count - - - def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): - """ returns the number of row in the splices table. - """ - # TODO: if the rds type is DNA should this just return zero? - return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL") + def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): + readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict) + return readCounter.multiReadCount - def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): - """ returns the number of distinct readIDs in the uniqs table. - """ - return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct) + def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"): + readCounter = ReadCounter(self.multiReadIDCounts, sense) + self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter) - def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): - """ returns the total weight of readIDs in the multi table. - """ - return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct) + return readCounter + #TODO: Primary def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1): """ get readID's. 
""" @@ -807,13 +709,13 @@ class ReadDataset(): hitChromList.sort() snpDict = {} - for achrom in hitChromList: + for chromosome in hitChromList: if verbose: - print "getting mismatches from chromosome %s" % (achrom) + print "getting mismatches from chromosome %s" % (chromosome) - snpDict[achrom] = [] + snpDict[chromosome] = [] if useSplices and self.dataType == "RNA": - spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True) + spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True) spliceIDList = spliceDict.keys() for spliceID in spliceIDList: spliceEntry = spliceDict[spliceID][0] @@ -845,13 +747,13 @@ class ReadDataset(): secondhalf = change_pos - firsthalf change_at = rightstart + secondhalf - snpDict[achrom].append([startpos, change_at, change_base, change_from]) + snpDict[chromosome].append([startpos, change_at, change_base, change_from]) - hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True) - if achrom not in hitDict.keys(): + hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True) + if chromosome not in hitDict.keys(): continue - for readEntry in hitDict[achrom]: + for readEntry in hitDict[chromosome]: start = readEntry["start"] sense = readEntry["sense"] mismatches = readEntry["mismatch"] @@ -871,7 +773,7 @@ class ReadDataset(): change_pos = readlen - int(mismatch[1:change_len-1]) + 1 change_at = start + change_pos - 1 - snpDict[achrom].append([start, change_at, change_base, change_from]) + snpDict[chromosome].append([start, change_at, change_base, change_from]) return snpDict @@ -925,7 +827,7 @@ class ReadDataset(): spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop) else: spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True) - + if chromosome in spliceDict: for spliceEntry in spliceDict[chromosome]: Lstart = spliceEntry["startL"] @@ -956,56 +858,19 @@ class ReadDataset(): def insertMetadata(self, valuesList): - """ inserts a list of (pname, pvalue) into the metadata - table. - """ - self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList) - self.dbcon.commit() + for (pname, pvalue) in valuesList: + self.metadata[pname] = pvalue def updateMetadata(self, pname, newValue, originalValue=""): - """ update a metadata field given the original value and the new value. - """ - stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname) if originalValue != "": - stmt += " and value='%s' " % str(originalValue) - - self.dbcon.execute(stmt) - self.dbcon.commit() - - - def insertUniqs(self, valuesList): - """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) - into the uniqs table. - """ - self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) - self.dbcon.commit() - - - def insertMulti(self, valuesList): - """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) - into the multi table. 
- """ - self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) - self.dbcon.commit() - - - def insertSplices(self, valuesList): - """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) - into the splices table. - """ - self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList) - self.dbcon.commit() - - - def insertMultisplices(self, valuesList): - """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) - into the multisplices table. - """ - self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList) - self.dbcon.commit() + if self.metadata[pname] == originalValue: + self.metadata[pname] = newValue + else: + self.metadata[pname] = newValue + #TODO: Primary def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"): """ update reads on file database in a list region of regions for a chromosome to have a new flag. regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with @@ -1028,186 +893,170 @@ class ReadDataset(): self.dbcon.commit() + #TODO: Primary def setFlags(self, flag, uniqs=True, multi=True, splices=True): """ set the flag fields in the entire dataset. """ if uniqs: - self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag) + self.setFlagsInDB("uniqs", flag) if multi: - self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag) + self.setFlagsInDB("multi", flag) if self.dataType == "RNA" and splices: - self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag) + self.setFlagsInDB("splices", flag) self.dbcon.commit() + #TODO: Secondary to setFlags() + def setFlagsInDB(self, table, flag): + """ set the flag field for every entry in a table. + """ + self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag)) + + + #TODO: Primary def resetFlags(self, uniqs=True, multi=True, splices=True): """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch. """ self.setFlags("", uniqs, multi, splices) + #TODO: Primary def reweighMultireads(self, readList): self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList) def setSynchronousPragma(self, value="ON"): - try: - self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value) - except: - print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value + pass def setDBcache(self, cache, default=False): - self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache) - if default: - self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache) + pass def execute(self, statement, returnResults=False): - sql = self.getSqlCursor() - - sql.execute(statement) - if returnResults: - result = sql.fetchall() - return result + pass def executeCommit(self, statement): - self.execute(statement) - - if self.memBacked: - self.memcon.commit() - else: - self.dbcon.commit() + pass def buildIndex(self, cache=100000): - """ Builds the file indeces for the main tables. - Cache is the number of 1.5 kb pages to keep in memory. - 100000 pages translates into 150MB of RAM, which is our default. 
- """ - if cache > self.getDefaultCacheSize(): - self.setDBcache(cache) - self.setSynchronousPragma("OFF") - self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") - print "built uPosIndex" - self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)") - print "built uChromIndex" - self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") - print "built mPosIndex" - self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)") - print "built mChromIndex" - - if self.dataType == "RNA": - self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)") - print "built sPosIndex" - self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)") - print "built sPosIndex2" - self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)") - print "built sChromIndex" - - self.dbcon.commit() - self.setSynchronousPragma("ON") + pass def dropIndex(self): - """ drops the file indices for the main tables. - """ - try: - self.setSynchronousPragma("OFF") - self.dbcon.execute("DROP INDEX uPosIndex") - self.dbcon.execute("DROP INDEX uChromIndex") - self.dbcon.execute("DROP INDEX mPosIndex") - self.dbcon.execute("DROP INDEX mChromIndex") + pass - if self.dataType == "RNA": - self.dbcon.execute("DROP INDEX sPosIndex") - try: - self.dbcon.execute("DROP INDEX sPosIndex2") - except: - pass - self.dbcon.execute("DROP INDEX sChromIndex") + def memSync(self, chrom="", index=False): + pass - self.dbcon.commit() - except: - print "problem dropping index" - self.setSynchronousPragma("ON") + def copyDBEntriesToMemory(self, dbName, whereClause=""): + pass - def memSync(self, chrom="", index=False): - """ makes a copy of the dataset into memory for faster access. - Can be restricted to a "full" chromosome. Can also build the - memory indices. 
- """ - self.memcon = "" - self.memcon = sqlite.connect(":memory:") - self.initializeTables(self.memcon) - cursor = self.dbcon.cursor() - whereclause = "" - if chrom != "": - print "memSync %s" % chrom - whereclause = " where chrom = '%s' " % chrom - self.memChrom = chrom + def copySpliceDBEntriesToMemory(self, whereClause=""): + pass + + +def getReadSense(reverse): + if reverse: + sense = "-" + else: + sense = "+" + + return sense + + +def isSpliceEntry(cigarTupleList): + isSplice = False + for operation,length in cigarTupleList: + if operation == 3: + isSplice = True + break + + return isSplice + + +def getSpliceRightStart(start, cigarTupleList): + offset = 0 + + for operation,length in cigarTupleList: + if operation == 3: + stopL = int(start + offset) + startR = int(stopL + length) + + return startR else: - self.memChrom = "" - - self.memcon.execute("PRAGMA temp_store = MEMORY") - self.memcon.execute("PRAGMA CACHE_SIZE = 1000000") - # copy metadata to memory - self.memcon.execute("delete from metadata") - results = cursor.execute("select name, value from metadata") - results2 = [] - for row in results: - results2.append((row["name"], row["value"])) - - self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2) - - self.copyDBEntriesToMemory("uniqs", whereclause) - self.copyDBEntriesToMemory("multi", whereclause) - if self.dataType == "RNA": - self.copySpliceDBEntriesToMemory(whereclause) - - if index: - if chrom != "": - self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)") - self.memcon.execute("CREATE INDEX mPosIndex on multi(start)") - if self.dataType == "RNA": - self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)") - self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)") - else: - self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") - self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") - if self.dataType == "RNA": - self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)") - self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)") + offset += length - self.memBacked = True - self.memcon.row_factory = sqlite.Row - self.memcon.commit() +def getSpliceBounds(start, readsize, cigarTupleList): + offset = 0 + #TODO: check operations here - we want to make sure we are adding the correct ones + for operation,length in cigarTupleList: + if operation == 3: + stopL = int(start + offset) + startR = int(stopL + length) + offset = 0 + else: + offset += length - def copyDBEntriesToMemory(self, dbName, whereClause=""): - cursor = self.dbcon.cursor() - sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause)) - destinationEntries = [] - for row in sourceEntries: - destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) + stopR = startR + offset - self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries) + return start, startR, stopL, stopR - def copySpliceDBEntriesToMemory(self, whereClause=""): - cursor = self.dbcon.cursor() - sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause) - destinationEntries = [] - for row in sourceEntries: - destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), 
int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], - row["weight"], row["flag"], row["mismatch"])) +def getPairedReadNumberSuffix(read): + readSuffix = "" + if not isPairedRead(read): + return "" + + if read.is_read1: + readSuffix = "/1" + elif read.is_read2: + readSuffix = "/2" + + return readSuffix + + +def isPairedRead(read): + return read.is_proper_pair and (read.is_read1 or read.is_read2) + + +def getMismatches(mismatchTag, querySequence="", sense="+"): + output = [] + deletionMarker = "^" + position = 0 + + lengths = re.findall("\d+", mismatchTag) + mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag) + + for mismatchEntry in range(len(mismatchSequences)): + mismatch = mismatchSequences[mismatchEntry] + position = position + int(lengths[mismatchEntry]) + if string.find(mismatch, deletionMarker) == 0: + continue + + try: + if querySequence: + genomicNucleotide = querySequence[position] + else: + genomicNucleotide = "N" + + if sense == "-": + mismatch = getReverseComplement(mismatch) + genomicNucleotide = getReverseComplement(genomicNucleotide) - self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries) + erange1BasedElandCompatiblePosition = int(position + 1) + output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide)) + position += 1 + except IndexError: + return "" + return string.join(output, ",") diff --git a/combineRPKMs.py b/combineRPKMs.py index 77f30d6..88b57ab 100755 --- a/combineRPKMs.py +++ b/combineRPKMs.py @@ -63,6 +63,8 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do outfile.write(header) finalfile = open(finalfileName) + #TODO: QForAli - the output lines are driven by finalfile. If there are genes in the first 2 that + # are not in the finalfile then they will be lost. for line in finalfile: fields = line.strip().split() gene = fields[0] @@ -101,4 +103,4 @@ def getRPKMDict(rpkmFileName, getGIDDict=False): if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file diff --git a/commoncode.py b/commoncode.py index 6ef4edf..cd44962 100755 --- a/commoncode.py +++ b/commoncode.py @@ -192,6 +192,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will # be merged with A but that will exit the loop and the merge with C will be missed. + #TODO: QForAli - Are we going to require sorted region files to have this even work? for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -340,6 +341,10 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, # implementing a triangular smooth smoothArray = array("f", [0.] * length) + #TODO: QForAli - really? We're going to insert MANUFACTURED data (0.0's) and use them in the calculation + # (probably) just to avoid IndexError exceptions. Then we are going to completely ignore adjusting the + # last 2 elements of the array for (probably) the same issue. Is this how erange is dealing with the + # edge cases? 
for pos in range(2,length -2): smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0 diff --git a/findMotifs.py b/findMotifs.py index a072d43..ee2957c 100755 --- a/findMotifs.py +++ b/findMotifs.py @@ -54,7 +54,7 @@ def getParser(usage): parser.add_option("--logo", action="store_true", dest="saveLogo") parser.add_option("--threshold", type="float", dest="threshold") parser.add_option("--prefix", dest="motifPrefix") - parser.add_option("--numMotifs", dest="numMotifs") + parser.add_option("--numMotifs", type="int", dest="numMotifs") parser.add_option("--maxWidth", type="int", dest="maxWidth") parser.add_option("--maskLower", action="store_true", dest="maskLower") @@ -64,7 +64,7 @@ def getParser(usage): doCisGreedy = getConfigBoolOption(configParser, section, "doCisGreedy", False) saveLogo = getConfigBoolOption(configParser, section, "saveLogo", False) threshold = getConfigFloatOption(configParser, section, "threshold", 75.) - numMotifs = getConfigOption(configParser, section, "numMotifs", "10") + numMotifs = getConfigOption(configParser, section, "numMotifs", 10) maxWidth = getConfigIntOption(configParser, section, "maxWidth", 28) maskLower = getConfigBoolOption(configParser, section, "maskLower", False) @@ -76,7 +76,7 @@ def getParser(usage): def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75., - numMotifs="10", maxWidth=28, maskLower=False, doCisSampler=False): + numMotifs=10, maxWidth=28, maskLower=False, doCisSampler=False): motifPrefix = expbase diff --git a/findall.py b/findall.py index 2e4123a..9dd85df 100755 --- a/findall.py +++ b/findall.py @@ -390,6 +390,7 @@ def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outpu shiftDict = {} chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl) for chromosome in chromosomeList: + #TODO: QForAli -Really? 
Use first chr shift value for all of them if regionFinder.shiftValue == "learn": learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p) @@ -958,4 +959,4 @@ def getBestShiftInDict(shiftDict): if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file diff --git a/geneLocusCounts.py b/geneLocusCounts.py index 218017e..ab44125 100755 --- a/geneLocusCounts.py +++ b/geneLocusCounts.py @@ -122,7 +122,7 @@ def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, print fullchrom hitRDS.memSync(fullchrom, index=True) for (start, stop, gid, length) in locusByChromDict[chrom]: - if not gidDict.has_key(gid): + if not gid in gidDict: gidDict[gid] = {"count": 0, "length": length} gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices) diff --git a/geneMrnaCounts.py b/geneMrnaCounts.py index cf5065a..7b4a2cc 100755 --- a/geneMrnaCounts.py +++ b/geneMrnaCounts.py @@ -124,7 +124,7 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice continue if countFeats: - seenFeaturesByChromDict[chrom] = [] + seenFeaturesByChromDict[chrom] = set([]) print "\nchr%s" % chrom fullchrom = "chr%s" % chrom @@ -137,18 +137,18 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice if featureSense == "R": checkSense = "-" - regionList.append((gid, fullchrom, start, stop, checkSense)) + regionData = (gid, fullchrom, start, stop, checkSense) count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense) else: - regionList.append((gid, fullchrom, start, stop)) + regionData = (gid, fullchrom, start, stop) count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices) - if count != 0: - print count gidCount[gid] += count + if markGID: + regionList.append(regionData) + if countFeats: - if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]: - seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense)) + seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense)) except: print "problem with %s - skipping" % gid diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py index 74e7a0c..31213c3 100755 --- a/geneMrnaCountsWeighted.py +++ b/geneMrnaCountsWeighted.py @@ -165,7 +165,7 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index): for read in hitDict[fullchrom]: tagStart = read["start"] tagReadID = read["readID"] - if read.has_key("sense"): + if "sense" in read: tagSense = read["sense"] ignoreSense = False @@ -253,7 +253,7 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict): try: tagValue = uniqueCountDict[gid] except KeyError: - tagValue = 1 + pass tagDenom = 0. 
for relatedGID in read2GidDict[readID]: diff --git a/getSNPGeneInfo.py b/getSNPGeneInfo.py index edcb2ce..0fbc679 100755 --- a/getSNPGeneInfo.py +++ b/getSNPGeneInfo.py @@ -116,7 +116,7 @@ def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=Tr symbol = "LOC%s" % geneID geneDescriptor = (symbol, geneID) - if geneDict.has_key(geneDescriptor): + if geneDescriptor in geneDict: geneDict[geneDescriptor]["position"].append(pos) else: geneDict[geneDescriptor] = {"position": [pos], @@ -167,7 +167,7 @@ def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense): "geneID": geneID} rpkm = "N\A" - if rpkmDict.has_key(geneID): + if geneID in rpkmDict: rpkm = str(rpkmDict[geneID]) snpGeneInfoDict["rpkm"] = rpkm diff --git a/getSNPs.py b/getSNPs.py index 4778644..64948d4 100755 --- a/getSNPs.py +++ b/getSNPs.py @@ -174,7 +174,7 @@ def getMatchDict(rds, chrom, withSplices=True): for read in readDict[chrom]: start = read["start"] stop = read["stop"] - if finalDict.has_key(start): + if start in finalDict: finalDict[start].append(stop) else: finalDict[start] = [stop] @@ -193,7 +193,7 @@ def getMatchDict(rds, chrom, withSplices=True): start = read["startR"] stop = read["stopR"] - if finalDict.has_key(start): + if start in finalDict: finalDict[start].append(stop) else: finalDict[start] = [stop] @@ -211,12 +211,12 @@ def getMismatchDict(rds, chrom, withSplices=True): back = "%s:%s" % (str(start), change) uniqBaseDict = {change: 1} totalBaseDict = {change: 1} - if mismatchDict.has_key(change_at): + if change_at in mismatchDict: pos = "%s:%s" % (str(start), change) totalCount = mismatchDict[change_at]["totalCount"] totalCount += 1 totalBaseDict = mismatchDict[change_at]["totalBaseDict"] - if totalBaseDict.has_key(change): + if change in totalBaseDict: totalBaseDict[change] += 1 uniqBaseDict = mismatchDict[change_at]["uniqBaseDict"] @@ -224,7 +224,7 @@ def getMismatchDict(rds, chrom, withSplices=True): back = mismatchDict[change_at]["back"] if pos not in back: uniqueReadCount += 1 - if uniqBaseDict.has_key(change): + if change in uniqBaseDict: uniqBaseDict[change] += 1 # dict contains total unique read counts back = "%s,%s" % (back, pos) diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py index c677d56..cf02cb3 100644 --- a/normalizeExpandedExonic.py +++ b/normalizeExpandedExonic.py @@ -45,7 +45,10 @@ def main(argv=None): except IndexError: pass - normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile, + RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False) + uniqcount = RDS.getUniqsCount() + + normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile, candidateLines, acceptedfilename, options.fieldID, options.maxLength, options.doCache, options.extendGenome, options.replaceModels) @@ -73,7 +76,7 @@ def makeParser(usage=""): return parser -def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename, +def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename, outfilename, candidateLines=[], acceptedfilename="", fieldID=0, maxLength=1000000000., doCache=False, extendGenome="", replaceModels=False): @@ -104,8 +107,7 @@ def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfil if extendGenome != "": hg.extendFeatures(extendGenome, replace=replaceModels) - RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache, reportCount=False) - uniqcount = RDS.getUniqsCount() + print "%d unique 
reads" % uniqcount splicecount = 0 diff --git a/normalizeFinalExonic.py b/normalizeFinalExonic.py index 6c1ae33..6b47a20 100755 --- a/normalizeFinalExonic.py +++ b/normalizeFinalExonic.py @@ -15,7 +15,7 @@ def main(argv=None): if not argv: argv = sys.argv - usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]" + usage = "usage: python normalizeFinalExonic rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]" parser = makeParser(usage) (options, args) = parser.parse_args(argv[1:]) @@ -29,7 +29,13 @@ def main(argv=None): multicountfile = args[2] outfilename = args[3] - normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename, + readCounts = {} + RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=options.doCache, reportCount=False) + readCounts["uniq"] = RDS.getUniqsCount() + readCounts["splice"] = RDS.getSplicesCount() + readCounts["multi"] = RDS.getMultiCount() + + normalizeFinalExonic(readCounts, expandedRPKMfile, multicountfile, outfilename, options.reportFraction, options.reportFold, options.minThreshold, options.doCache, options.writeGID) @@ -56,7 +62,7 @@ def makeParser(usage=""): return parser -def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename, +def normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=False, reportFold=False, minThreshold=0., doCache=False, writeGID=False): @@ -69,10 +75,9 @@ def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, elif reportFold: print "reporting fold contribution of multireads" - RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False) - uniqcount = RDS.getUniqsCount() - splicecount = RDS.getSplicesCount() - multicount = RDS.getMultiCount() + uniqcount = readCounts["uniq"] + splicecount = readCounts["splice"] + multicount = readCounts["multi"] countDict = {} multicountDict = {} lengthDict = {} diff --git a/rnaEditing.py b/rnaEditing.py index ce9f4ea..f6f714a 100644 --- a/rnaEditing.py +++ b/rnaEditing.py @@ -1,7 +1,7 @@ import sys import optparse -from erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter -from erange.commoncode import countDuplicatesInList, getConfigParser, getConfigOption +import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter +from commoncode import countDuplicatesInList, getConfigParser, getConfigOption def main(argv=None): diff --git a/runRNAPairedAnalysis.py b/runRNAPairedAnalysis.py new file mode 100644 index 0000000..ebc25a5 --- /dev/null +++ b/runRNAPairedAnalysis.py @@ -0,0 +1,135 @@ +import sys +import optparse +import ReadDataset +from commoncode import getConfigParser, getConfigBoolOption, writeLog +from checkrmask import checkrmask +from geneMrnaCounts import geneMrnaCounts +from geneMrnaCountsWeighted import geneMrnaCountsWeighted +from regionCounts import regionCounts +from rnafarPairs import rnaFarPairs +from normalizeFinalExonic import normalizeFinalExonic +from normalizeExpandedExonic import normalizeExpandedExonic +from findall import findall, RegionFinder + +VERSION = "1.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + print "runRNAPairedAnalysis: version %s" % VERSION + usage = "usage: python runRNAPairedAnalysis.py genome rdsprefix repeatmaskdb [modelfile] 
[--replacemodels]" + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genome = args[0] + rdsprefix = args[1] + repeatmaskdb = args[2] + try: + modelfile = args[3] + except IndexError: + modelfile = "" + + runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile=modelfile, replacemodels=options.replacemodels) + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--replacemodels", action="store_true", dest="replacemodels") + + configParser = getConfigParser() + section = "RunRNAPairedAnalysis" + replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False) + parser.set_defaults(replacemodels=replacemodels) + + return parser + + +def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False): + """ based on original script runRNAPairedAnalysis.sh + usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels] + where rdsprefix is the name of the rds file without the .rds extension + use "none" for the repeatmaskdb if you do not have one + """ + + rdsfile = "%s.rds" % rdsprefix + logfile = "rna.log" + + # log the parameters + message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb) + writeLog(logfile, "runRNAPairedAnalysis.py", message) + + # count the unique reads falling on the gene models ; the nomatch files are + # mappable reads that fell outside of the Cistematic gene models and not the + # unmappable of Eland (i.e, the "NM" reads) + uniquecountfilename = "%s.uniqs.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + + # calculate a first-pass RPKM to re-weigh the unique reads, + # using 'none' for the splice count + initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix + RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False) + (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False) + readCounts = {} + readCounts["uniq"] = ucount + readCounts["splice"] = mcount + readCounts["multi"] = scount + normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + + # recount the unique reads with weights calculated during the first pass + uniquerecountfilename = "%s.uniqs.recount" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) + + # count splice reads + splicecountfilename = "%s.splices.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + + # find new regions outside of gene models with reads piled up + newregionfilename = "%s.newregions.txt" % rdsprefix + regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM") + findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False) + + # filter out new regions that overlap repeats more than a certain fraction + outFileName = "%s.newregions.repstatus" % rdsprefix + goodFileName = "%s.newregions.good" % rdsprefix + checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile) + + # 
calculate the read densities + regionfilename = "%s.newregions.checked" % rdsprefix + regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile) + + # map all candidate regions that have paired ends overlapping with known genes + candidatefilename = "%s.candidates.txt" % rdsprefix + rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True) + + expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix + # calculate expanded exonic read density + acceptedfilename = "%s.accepted.rpkm" % rdsprefix + try: + candidatefile = open(candidatefilename) + candidateLines = candidatefile.readlines() + candidatefile.close() + except IOError: + candidateLines = [] + + normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, + acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + + # weigh multi-reads + multicountfilename = "%s.multi.count" % rdsprefix + acceptfile = "%s.accepted.rpkm" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1, + extendGenome=modelfile, replaceModels=replacemodels) + + # calculate final exonic read density + outfilename = "%s.final.rpkm" % rdsprefix + normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/runSNPAnalysis.py b/runSNPAnalysis.py new file mode 100644 index 0000000..e511d80 --- /dev/null +++ b/runSNPAnalysis.py @@ -0,0 +1,100 @@ +import sys +import optparse +from commoncode import writeLog +from makeSNPtrack import makeSNPtrack +from getNovelSNPs import getNovelSNPsFromFile +from getSNPGeneInfo import writeSNPGeneInfo +from chksnp import chkSNPFile +from chkSNPrmask import chkSNPrmask +from getSNPs import writeSNPsToFile + +VERSION = "1.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + print "runSNPAnalysis: version %s" % VERSION + usage = "usage: python runSNPAnalysis.py genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]" + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + genome = args[0] + rdsfile = args[1] + label = args[2] + rmaskdbfile = args[3] + dbsnpfile = args[4] + uniqStartMin = args[5] + totalRatio = args[6] + rpkmfile = args[7] + try: + cachepages = args[8] + except IndexError: + cachepages = None + + runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages) + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + + return parser + + +def runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages=None): + """ based on original script runSNPAnalysis.sh + usage: runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages] + where for each position S: + uniqStartMin = # independent reads supporting base change at S + totalRatio = total # reads supporting base change at S / total # reads that pass through S + """ + + # log the parameters + message = "with parameters: %s %s %s %s %s %s %s %s" % (genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile) + writeLog("rna.log", 
"runStandardAnalysis.py", message) + + # get all SNPs by extracting it from the RDS + fullsnpfilename = "%s.snps.txt" % label + try: + snpCache = int(cachepages) + except TypeError, ValueError: + snpCache = 0 + + doCache = cachepages is not None + writeSNPsToFile(rdsfile, uniqStartMin, totalRatio, fullsnpfilename, doCache, cachePages=snpCache, forceChr=True) + + # get SNPs in non-repeat regions only + snpfilename = "%s.nr_snps.txt" % label + chkSNPrmask(rmaskdbfile, fullsnpfilename, snpfilename, repeats=False, cachePages=cachepages) + + # Check to see if SNPs are found in dbSNP + # if dbSNP128.db is not built yet, build it by running buildsnpdb.py - build snp database using the dbSNP database file downloaded from UCSC + # usage: python2.5 buildsnpdb.py snpdbdir snpdbname + # the database flat file must be in the snpdbdir directory + # To build dbSNP database file, run the following command + # python2.5 buildsnpdb.py snp128.txt dbSNP128 + + # get dbSNP info for SNPs that are found in the dbSNP database + dbsnpFileName = "%s.nr_dbsnp.txt" % label + chkSNPFile(dbsnpfile, snpfilename, dbsnpFileName, cachePages=cachepages) + + # get gene info for the snps found in dbSNP + genefilename = "%s.nr_dbsnp_geneinfo.txt" % label + writeSNPGeneInfo(genome, dbsnpFileName, rpkmfile, genefilename, doCache=doCache) + + # get gene info for snps that are not found in dbSNP + outfilename = "%s.nr_final.txt" % label + getNovelSNPsFromFile(genome, genefilename, outfilename) + + # make bed file for displaying the snps on UCSC genome browser + bedfilename = "%s.nr_snps.bed" % label + makeSNPtrack(snpfilename, label, bedfilename) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/runStandardAnalysis.py b/runStandardAnalysis.py new file mode 100644 index 0000000..adfa375 --- /dev/null +++ b/runStandardAnalysis.py @@ -0,0 +1,131 @@ +import sys +import optparse +import ReadDataset +from commoncode import getConfigParser, getConfigBoolOption, writeLog +from checkrmask import checkrmask +from geneMrnaCounts import geneMrnaCounts +from geneMrnaCountsWeighted import geneMrnaCountsWeighted +from getallgenes import getallgenes +from normalizeFinalExonic import normalizeFinalExonic +from normalizeExpandedExonic import normalizeExpandedExonic +from findall import findall, RegionFinder + +VERSION = "1.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + print "runStandardAnalysis: version %s" % VERSION + usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]" + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + genome = args[0] + rdsprefix = args[1] + repeatmaskdb = args[2] + bpradius = args[3] + try: + modelfile = args[4] + except IndexError: + modelfile = "" + + runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels) + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--replacemodels", action="store_true", dest="replacemodels") + + configParser = getConfigParser() + section = "RunStandardAnalysis" + replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False) + parser.set_defaults(replacemodels=replacemodels) + + return parser + + +def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False): + """ based on original script runStandardAnalysis.sh + usage: 
runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels] + where rdsprefix is the name of the rds file without the .rds extension + use "none" for the repeatmaskdb if you do not have one + """ + + rdsfile = "%s.rds" % rdsprefix + logfile = "rna.log" + + # log the parameters + message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius) + writeLog(logfile, "runStandardAnalysis.py", message) + + # count the unique reads falling on the gene models ; the nomatch files are + # mappable reads that fell outside of the Cistematic gene models and not the + # unmappable of Eland (i.e, the "NM" reads) + uniquecountfilename = "%s.uniqs.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True) + + # calculate a first-pass RPKM to re-weigh the unique reads, + # using 'none' for the splice count + splicecountfilename = "none" + initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix + RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False) + (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False) + readCounts = {} + readCounts["uniq"] = ucount + readCounts["splice"] = mcount + readCounts["multi"] = scount + normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + + # recount the unique reads with weights calculated during the first pass + uniquerecountfilename = "%s.uniqs.recount" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels) + + # count splice reads + splicecountfilename = "%s.splices.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1) + + # Alternative 1: find new regions outside of gene models with reads piled up + newregionfilename = "%s.newregions.txt" % rdsprefix + regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM") + findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False) + + # Alternative 1: filter out new regions that overlap repeats more than a certain fraction + outFileName = "%s.newregions.repstatus" % rdsprefix + goodFileName = "%s.newregions.good" % rdsprefix + checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile) + + # map all candidate regions that are within a given radius of a gene in bp + candidatefilename = "%s.candidates.txt" % rdsprefix + getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + + expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix + # calculate expanded exonic read density + acceptedfilename = "%s.accepted.rpkm" % rdsprefix + try: + candidatefile = open(candidatefilename) + candidateLines = candidatefile.readlines() + candidatefile.close() + except IOError: + candidateLines = [] + + normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename, + doCache=True, extendGenome=modelfile, replaceModels=replacemodels) + + # 
weigh multi-reads + multicountfilename = "%s.multi.count" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile, + replaceModels=replacemodels) + + # calculate final exonic read density + outfilename = "%s.final.rpkm" % rdsprefix + normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/runStrandedAnalysis.py b/runStrandedAnalysis.py new file mode 100644 index 0000000..55c87ca --- /dev/null +++ b/runStrandedAnalysis.py @@ -0,0 +1,129 @@ +import sys +import optparse +import ReadDataset +from commoncode import writeLog +from checkrmask import checkrmask +from geneMrnaCounts import geneMrnaCounts +from geneMrnaCountsWeighted import geneMrnaCountsWeighted +from getallgenes import getallgenes +from normalizeFinalExonic import normalizeFinalExonic +from normalizeExpandedExonic import normalizeExpandedExonic +from findall import findall, RegionFinder + +VERSION = "1.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + print "runStrandedAnalysis: version %s" % VERSION + usage = "usage: python runStrandedAnalysis.py genome rdsprefix repeatmaskdb bpradius" + + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + genome = args[0] + rdsprefix = args[1] + repeatmaskdb = args[2] + bpradius = args[3] + + runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius) + + +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + + return parser + + +def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius): + """ based on original script runStrandedAnalysis.sh + usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius + where rdsprefix is the name of the rds file without the .rds extension + use "none" for the repeatmaskdb if you do not have one + """ + + rdsfile = "%s.rds" % rdsprefix + logfile = "rna.log" + + # log the parameters + message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius) + writeLog(logfile, "runStrandedAnalysis.py", message) + + # count the unique reads falling on the gene models ; the nomatch files are + # mappable reads that fell outside of the Cistematic gene models and not the + # unmappable of Eland (i.e, the "NM" reads) + uniquecountfilename = "%s.uniqs.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True) + + # calculate a first-pass RPKM to re-weigh the unique reads, + # using 'none' for the splice count + initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix + RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False) + (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False) + readCounts = {} + readCounts["uniq"] = ucount + readCounts["splice"] = mcount + readCounts["multi"] = scount + normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True) + + # recount the unique reads with weights calculated during the first pass + initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix + uniquerecountfilename = "%s.uniqs.recount" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False) + 
+ # count splice reads + splicecountfilename = "%s.splices.count" % rdsprefix + geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1) + + # find new regions outside of gene models with reads piled up + newregionfilename = "%s.newregions.txt" % rdsprefix + regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus") + findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False) + + regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus") + findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a") + + # filter out new regions that overlap repeats more than a certain fraction + newregionfilename = "%s.newregions.txt" % rdsprefix + outFileName = "%s.newregions.repstatus" % rdsprefix + goodFileName = "%s.newregions.good" % rdsprefix + checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile) + + #TODO: these calls look wrong + # Alternative 2: use a precomputed list of "new" regions (outside of gene models) + #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed + #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good + + # map all candidate regions that are within a given radius of a gene in bp + candidatefilename = "%s.candidates.txt" % rdsprefix + getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True) + + expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix + # calculate expanded exonic read density + acceptedfilename = "%s.accepted.rpkm" % rdsprefix + try: + candidatefile = open(candidatefilename) + candidateLines = candidatefile.readlines() + candidatefile.close() + except IOError: + candidateLines = [] + + normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename, + doCache=True) + + # weigh multi-reads + multicountfilename = "%s.multi.count" % rdsprefix + geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1) + + # calculate final exonic read density + outfilename = "%s.final.rpkm" % rdsprefix + normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/siteintersects.py b/siteintersects.py index e012bb4..58d9e30 100755 --- a/siteintersects.py +++ b/siteintersects.py @@ -126,6 +126,8 @@ def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSite continue siteNotCommon = True + #TODO: as handed to me this is O(n^m) - probably could be better... 
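+        # the loop below rescans every region in siteDict[chrom] for each candidate site (the O(n*m) cost flagged above); a region list sorted by start with a bisect lookup is one way to trim it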
+ #TODO: QForAli - if there are multiple matches will count them but only write the first match to the file for (rstart, rstop, rline) in siteDict[chrom]: if regionsOverlap(start, stop, rstart, rstop): commonSites += 1 @@ -162,4 +164,4 @@ def writeRejectSiteFile(siteRejectFilename, rejectSiteDict): if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file diff --git a/weighMultireads.py b/weighMultireads.py index 9fd041d..3fd6bd8 100755 --- a/weighMultireads.py +++ b/weighMultireads.py @@ -168,6 +168,7 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False): chrom = "chr%s" % multiReadList[index][1] reweighList.append((round(score,3), chrom, start, theID)) + #TODO: QForAli - Is this right? If match index is 0 are we doing nothing? if theMatch > 0: RDS.reweighMultireads(reweighList) fixedPair += 1 @@ -329,4 +330,4 @@ def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False): if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file
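
The new run* pipelines build a readCounts dictionary for normalizeExpandedExonic() and normalizeFinalExonic(), keyed "uniq", "splice" and "multi" (the same keys that normalizeFinalExonic.py's main() fills from getUniqsCount(), getSplicesCount() and getMultiCount()). Below is a minimal sketch of that wiring, assuming ReadDataset.getCounts(multi=True, splices=True, reportCombined=False) returns the unique, multi and splice counts in that order, as the ucount/mcount/scount names suggest but the patch itself does not show; under that assumption the assignments of mcount to "splice" and scount to "multi" in the run* scripts look swapped, and the mapping sketched here would be the intended one.

    import ReadDataset

    def buildReadCounts(rdsfile, doCache=True):
        # assumes getCounts() returns (uniqueCount, multiCount, spliceCount) in that order
        RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache, reportCount=False)
        (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)

        return {"uniq": ucount, "multi": mcount, "splice": scount}

On the siteintersects.py TODO about the O(n*m) overlap scan: one way to cut the per-site cost is to sort the regions in siteDict[chrom] by start and keep a running maximum of their stops, so testing whether any region overlaps a site becomes a single bisect. The helper names below (buildOverlapIndex, anyRegionOverlaps) are hypothetical and the sketch only answers the existence question; compareSites() also counts matches and records the first matching region line, which would still require a scan over the candidate regions found this way.

    import bisect

    def buildOverlapIndex(regions):
        # regions is a list of (rstart, rstop, rline) tuples, as stored in siteDict[chrom]
        ordered = sorted(regions)
        starts = [region[0] for region in ordered]
        prefixMaxStop = []
        runningMax = None
        for (rstart, rstop, rline) in ordered:
            if runningMax is None or rstop > runningMax:
                runningMax = rstop

            prefixMaxStop.append(runningMax)

        return starts, prefixMaxStop

    def anyRegionOverlaps(starts, prefixMaxStop, start, stop):
        # a region overlaps [start, stop] when rstart <= stop and rstop >= start
        index = bisect.bisect_right(starts, stop)
        return index > 0 and prefixMaxStop[index - 1] >= start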