import ReadDataset
from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+VERSION = "1.0"
def main(argv=None):
if not argv:
argv = sys.argv
- verstring = "makeBamFromRds: version 1.0"
- print verstring
+ print "makeBamFromRds: version %s" % VERSION
doPairs = False
for chrom in chromRemoveList:
chromList.remove(chrom)
- header = {"HD": {"VN": "1.0"}}
+ header = {"HD": {"VN": VERSION}}
if referenceSequenceList:
header["SQ"] = referenceSequenceList
self.cachedDB = ""
- def attachDB(self, filename, asname):
+ def attachDB(self, filename, dbName):
""" attach another database file to the readDataset.
"""
- stmt = "attach '%s' as %s" % (filename, asname)
+ stmt = "attach '%s' as %s" % (filename, dbName)
self.execute(stmt)
- def detachDB(self, asname):
+ def detachDB(self, dbName):
""" detach a database file to the readDataset.
"""
- stmt = "detach %s" % (asname)
+ stmt = "detach %s" % (dbName)
self.execute(stmt)
- def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+ def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
""" import into current RDS the table (with columns destcolumns,
with default all columns) from the database file asname,
using the column specification of ascolumns (default all).
"""
- stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+ stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
if flagged != "":
stmt += " where flag = '%s' " % flagged
self.executeCommit(stmt)
- def getTables(self, asname=""):
+ def getTables(self, dbName=""):
""" get a list of table names in a particular database file.
"""
resultList = []
sql = self.getSqlCursor()
- if asname != "":
- asname += "."
+ if dbName != "":
+ dbName = "%s." % dbName
- stmt = "select name from %ssqlite_master where type='table'" % asname
+ stmt = "select name from %ssqlite_master where type='table'" % dbName
sql.execute(stmt)
results = sql.fetchall()
if "readsize" not in metadata:
raise ReadDatasetError("no readsize parameter defined")
else:
- mysize = metadata["readsize"]
- if "import" in mysize:
- mysize = mysize.split()[0]
+ readSize = metadata["readsize"]
+ if "import" in readSize:
+ readSize = readSize.split()[0]
- return int(mysize)
+ readSize = int(readSize)
+ if readSize < 0:
+ raise ReadDatasetError("readsize is negative")
+
+ return readSize
def getDefaultCacheSize(self):
if row["chrom"] not in results:
results.append(row["chrom"])
else:
- if len(row["chrom"][3:].strip()) < 1:
- continue
-
- if row["chrom"][3:] not in results:
- results.append(row["chrom"][3:])
+ shortName = row["chrom"][3:]
+ if len(shortName.strip()) > 0 and shortName not in results:
+ results.append(shortName)
results.sort()
""" returns the maximum coordinate for reads on a given chromosome.
"""
maxCoord = 0
- sql = self.getSqlCursor()
if doUniqs:
- try:
- sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
- maxCoord = int(sql.fetchall()[0][0])
- except:
- print "couldn't retrieve coordMax for chromosome %s" % chrom
+ maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
if doSplices:
- sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
- try:
- spliceMax = int(sql.fetchall()[0][0])
- if spliceMax > maxCoord:
- maxCoord = spliceMax
- except:
- pass
+ spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
+ maxCoord = max(spliceMax, maxCoord)
if doMulti:
- sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
- try:
- multiMax = int(sql.fetchall()[0][0])
- if multiMax > maxCoord:
- maxCoord = multiMax
- except:
- pass
+ multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
+ maxCoord = max(multiMax, maxCoord)
if verbose:
print "%s maxCoord: %d" % (chrom, maxCoord)
return maxCoord
+ def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
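+ """ return the largest value of startField for reads on chrom in the
+ given table, or 0 if the query fails.
+ """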
+ maxCoord = 0
+ sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
+ sql = self.getSqlCursor()
+ try:
+ sql.execute(sqlStatement)
+ maxCoord = int(sql.fetchall()[0][0])
+ except:
+ print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+ return maxCoord
+
+
def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
"""
- whereClause = []
- resultsDict = {}
-
- if chrom != "" and chrom != self.memChrom:
- whereClause.append("chrom = '%s'" % chrom)
-
- if flag != "":
- if flagLike:
- flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
- whereClause.append(flagLikeClause)
- else:
- whereClause.append("flag = '%s'" % flag)
-
- if start > -1:
- whereClause.append("start > %d" % start)
-
- if stop > -1:
- whereClause.append("stop < %d" % stop)
-
- if len(readLike) > 0:
- readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
- whereClause.append(readIDClause)
-
- if hasMismatch:
- whereClause.append("mismatch != ''")
-
- if strand in ["+", "-"]:
- whereClause.append("sense = '%s'" % strand)
-
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
- groupBy = []
+ whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
if findallOptimize:
- selectClause = ["select start, sense, sum(weight)"]
- groupBy = ["GROUP BY start, sense"]
+ selectQuery = "select start, sense, sum(weight)"
else:
- selectClause = ["select ID, chrom, start, readID"]
- if bothEnds:
- selectClause.append("stop")
-
- if not noSense:
- selectClause.append("sense")
-
- if withWeight:
- selectClause.append("weight")
-
- if withFlag:
- selectClause.append("flag")
+ selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
- if withMismatch:
- selectClause.append("mismatch")
-
- if limit > 0 and not combine5p:
- groupBy.append("LIMIT %d" % limit)
-
- selectQuery = string.join(selectClause, ",")
- groupQuery = string.join(groupBy)
+ groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
if doUniqs:
stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
if doMulti:
sqlQuery = string.join(stmt)
sql.execute(sqlQuery)
+ resultsDict = {}
if findallOptimize:
resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
if self.memBacked:
return resultsDict
- def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
- flag="", withWeight=False, withFlag=False, withMismatch=False,
- withID=False, withChrom=False, withPairID=False, readIDDict=False,
- splitRead=False, hasMismatch=False, flagLike=False, start=-1,
- stop=-1, strand=""):
- """ returns a dictionary of spliced reads in a variety of
- formats and which can be restricted by chromosome or custom-flag.
- Returns unique spliced reads for now.
- """
- whereClause = []
- resultsDict = {}
+ def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
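+ """ build the SQL where clause shared by the read and splice queries;
+ when splice is True the coordinate restrictions use the startL and
+ stopR columns.
+ """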
+ if splice:
+ startText = "startL"
+ stopText = "stopR"
+ else:
+ startText = "start"
+ stopText = "stop"
+ whereClause = []
if chrom != "" and chrom != self.memChrom:
- whereClause = ["chrom = '%s'" % chrom]
+ whereClause.append("chrom = '%s'" % chrom)
if flag != "":
if flagLike:
else:
whereClause.append("flag = '%s'" % flag)
+ if start > -1:
+ whereClause.append("%s > %d" % (startText, start))
+
+ if stop > -1:
+ whereClause.append("%s < %d" % (stopText, stop))
+
+ if len(readLike) > 0:
+ readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
+ whereClause.append(readIDClause)
+
if hasMismatch:
whereClause.append("mismatch != ''")
- if strand != "":
+ if strand in ["+", "-"]:
whereClause.append("sense = '%s'" % strand)
- if start > -1:
- whereClause.append("startL > %d" % start)
-
- if stop > -1:
- whereClause.append("stopR < %d" % stop)
-
if len(whereClause) > 0:
whereStatement = string.join(whereClause, " and ")
whereQuery = "where %s" % whereStatement
else:
whereQuery = ""
- selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+ return whereQuery
+
+
+ def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
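+ """ extend baseSelect with the optional stop, sense, weight, flag and
+ mismatch columns.
+ """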
+
+ selectClause = [baseSelect]
+ if bothEnds:
+ selectClause.append("stop")
+
if not noSense:
selectClause.append("sense")
if withMismatch:
selectClause.append("mismatch")
- selectQuery = string.join(selectClause, " ,")
+ selectQuery = string.join(selectClause, ",")
+
+ return selectQuery
+
+
+ def getReadGroupQuery(self, findallOptimize, limit, combine5p):
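+ """ build the GROUP BY / LIMIT tail of the read query; grouping by
+ start and sense is only done for the findall optimization.
+ """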
+ groupBy = []
+ if findallOptimize:
+ groupBy = ["GROUP BY start, sense"]
+
+ if limit > 0 and not combine5p:
+ groupBy.append("LIMIT %d" % limit)
+
+ groupQuery = string.join(groupBy)
+
+ return groupQuery
+
+
+ def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+ flag="", withWeight=False, withFlag=False, withMismatch=False,
+ withID=False, withChrom=False, withPairID=False, readIDDict=False,
+ splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+ stop=-1, strand=""):
+ """ returns a dictionary of spliced reads in a variety of
+ formats and which can be restricted by chromosome or custom-flag.
+ Returns unique spliced reads for now.
+ """
+ whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
+ selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
+ selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
if self.memBacked:
sql = self.memcon.cursor()
else:
sql.execute(stmt)
currentReadID = ""
currentChrom = ""
+ resultsDict = {}
for row in sql:
pairID = 0
readID = row["readID"]
""" get readID's.
"""
stmt = []
- limitPart = ""
- if limit > 0:
- limitPart = "LIMIT %d" % limit
-
if uniqs:
stmt.append("select readID from uniqs")
else:
selectPart = ""
+ limitPart = ""
+ if limit > 0:
+ limitPart = "LIMIT %d" % limit
+
sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
if self.memBacked:
sql = self.memcon.cursor()
print "getting mismatches from chromosome %s" % (achrom)
snpDict[achrom] = []
- hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
if useSplices and self.dataType == "RNA":
spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
spliceIDList = spliceDict.keys()
- for k in spliceIDList:
- spliceEntry = spliceDict[k][0]
+ for spliceID in spliceIDList:
+ spliceEntry = spliceDict[spliceID][0]
startpos = spliceEntry["startL"]
lefthalf = spliceEntry["stopL"]
rightstart = spliceEntry["startR"]
snpDict[achrom].append([startpos, change_at, change_base, change_from])
+ hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
if achrom not in hitDict.keys():
continue
def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
- useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+ useSplices=False, normalizationFactor=1.0, trackStrand=False,
keepStrand="both", shiftValue=0):
"""return a profile of the chromosome as an array of per-base read coverage....
keepStrand = 'both', 'plusOnly', or 'minusOnly'.
dataType = metadata["dataType"]
scale = 1. / normalizationFactor
shift = {}
- shift['+'] = int(shiftValue)
- shift['-'] = -1 * int(shiftValue)
+ shift["+"] = int(shiftValue)
+ shift["-"] = -1 * int(shiftValue)
if cstop > 0:
lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
import psyco
psyco.full()
except:
- print 'psyco not running'
+ print "psyco not running"
print "altSpliceCounts: version 3.7"
stopDict = {}
resultDict = {}
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
readlen = hitRDS.getReadSize()
hitDict = hitRDS.getSplicesDict(noSense=True)
- outfile = open(outfilename,'w')
+ outfile = open(outfilename, "w")
for chrom in hitDict:
startDict[chrom] = []
resultDict[chrom].sort()
for line in resultDict[chrom]:
- outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n' % line)
+ outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n" % line)
alternative += 1
print chrom, maxIndex, spliceEvent, altSpliceEvent
def bedToRegion(factor, infilename, outfilename):
index = 1
infile = open(infilename)
- outfile = open(outfilename, 'w')
+ outfile = open(outfilename, "w")
for line in infile:
- if 'track' in line:
+ if "track" in line:
continue
+
fields = line.split()
- line = string.join(fields, '\t')
- outfile.write('%s%d\t%s\n' % (factor, index, line))
+ line = string.join(fields, "\t")
+ outfile.write("%s%d\t%s\n" % (factor, index, line))
index += 1
+
infile.close()
outfile.close()
import sys
+import string
print "binstocdf: version 1.1"
if not argv:
argv = sys.argv
- if len(argv) < 2:
- print 'usage: python %s infile outfile' % sys.argv[0]
+ if len(argv) < 3:
+ print "usage: python %s infile outfile" % sys.argv[0]
sys.exit(1)
- infilename = argv[0]
- outfilename = argv[1]
+ infilename = argv[1]
+ outfilename = argv[2]
binToCDF(infilename, outfilename)
def binToCDF(infilename, outfilename):
infile = open(infilename)
- outfile = open(outfilename, 'w')
+ outfile = open(outfilename, "w")
for line in infile:
fields = line.strip().split()
outfile.write(line)
continue
- outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3]))
- cum = 0
+ outputFields = fields[:4]
+ runningTotal = 0
for bin in fields[4:]:
- cum += int(bin)
- percent = 100 * cum / total
- outfile.write('\t%d' % percent)
+ runningTotal += int(bin)
+ percent = 100 * runningTotal / total
+ outputFields.append("%d" % percent)
- outfile.write('\n')
+ outputLine = string.join(outputFields, "\t")
+ outfile.write("%s\n" % outputLine)
infile.close()
outfile.close()
if header.strip() == "":
header = "#\t"
- outfile.write( "%s\t%s\n" % (header, colID))
+ outfile.write("%s\t%s\n" % (header, colID))
values = []
min = 20000000000.
print "usage: python %s bins percent infile" % sys.argv[0]
sys.exit(1)
- bins = int(argv[0])
- percent = int(argv[1])
- infilename = argv[2]
+ bins = int(argv[1])
+ percent = int(argv[2])
+ infilename = argv[3]
- cdfDist(bins, percent, infilename)
+ printCdfDist(bins, percent, infilename)
+
+
+def printCdfDist(bins, percent, infilename):
+ print cdfDist(bins, percent, infilename)
def cdfDist(bins, percent, infilename):
index += 1
infile.close()
- print binsList
+ return binsList
if __name__ == "__main__":
count += float(fields[fieldID])
else:
count += int(fields[fieldID])
- except ValueError:
+ except (ValueError, IndexError):
pass
infile.close()
def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
- firstDict = {}
- firstfile = open(firstfileName)
- for line in firstfile:
- fields = line.strip().split()
- firstDict[fields[1]] = fields[-1]
-
- firstfile.close()
-
- expandedDict = {}
- gidDict = {}
- expandedfile = open(expandedfileName)
- for line in expandedfile:
- fields = line.strip().split()
- expandedDict[fields[1]] = fields[-1]
- gidDict[fields[1]] = fields[0]
-
- expandedfile.close()
+ firstDict = getRPKMDict(firstfileName)
+ gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
if doFraction:
header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
outfile.close()
+def getRPKMDict(rpkmFileName, getGIDDict=False):
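+ """ read a whitespace-delimited RPKM file keyed on its second column
+ with the last column as the value; optionally also return a
+ dictionary mapping that key to the gid in the first column.
+ """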
+ gidDict = {}
+ rpkmDict = {}
+ rpkmFile = open(rpkmFileName)
+ for line in rpkmFile:
+ fields = line.strip().split()
+ rpkmDict[fields[1]] = fields[-1]
+ if getGIDDict:
+ gidDict[fields[1]] = fields[0]
+
+ rpkmFile.close()
+
+ if getGIDDict:
+ return gidDict, rpkmDict
+ else:
+ return rpkmDict
+
+
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)
pass
import sys
+import optparse
import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
print "combinerds: version 1.2"
if not argv:
argv = sys.argv
- if len(argv) < 2:
- print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0]
- #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n'
+ usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+ parser = makeParser(usage)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
sys.exit(1)
- doCache = False
- cachePages = -1
- if '--cache' in argv:
- doCache = True
- try:
- cachePages = int(argv[sys.argv.index('-cache') + 1])
- except:
- pass
-
- datafile = argv[1]
- infileList = []
- for index in range(2, len(argv)):
- if argv[index][0] == '-':
- break
- infileList.append(sys.argv[index])
+ datafile = args[0]
+ infileList = args[1:]
- print "destination RDS: %s" % datafile
+ combinerds(datafile, infileList, options.tableList, options.withFlag, options.doIndex, options.cachePages, options.doInit, options.initRNA)
- if '--initrna' in argv:
- rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA')
- elif '--init' in argv:
- rds = ReadDataset.ReadDataset(datafile, initialize=True)
- withFlag = ''
- if '--flag' in argv:
- withFlag = argv[sys.argv.index('-flag') + 1]
- print "restrict to flag = %s" % withFlag
+def makeParser(usage=""):
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--table", action="append", dest="tablelist")
+ parser.add_option("--init", action="store_true", dest="doInit")
+ parser.add_option("--initrna", action="store_true", dest="initRNA")
+ parser.add_option("--index", action="store_true", dest="doIndex")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--flag", dest="withFlag")
+
+ configParser = getConfigParser()
+ section = "combinerds"
+ doInit = getConfigBoolOption(configParser, section, "doInit", False)
+ initRNA = getConfigBoolOption(configParser, section, "initRNA", False)
+ doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+ cachePages = getConfigOption(configParser, section, "cachePages", None)
+ withFlag = getConfigOption(configParser, section, "withFlag", "")
+
+ parser.set_defaults(tableList=[], doInit=doInit, initRNA=initRNA, doIndex=doIndex, cachePages=cachePages,
+ withFlag=withFlag)
+
+ return parser
+
- rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache)
+def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, cachePages=None, doInit=False, initRNA=False):
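+ """ import the tables of each RDS file in infileList into datafile,
+ optionally restricting reads to the given flag and rebuilding the
+ index when done.
+ """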
+ print "destination RDS: %s" % datafile
+ datasetType="DNA"
+ if initRNA:
+ doInit = True
+ datasetType="RNA"
+
+ doCache = False
+ if cachePages is not None:
+ doCache = True
+ else:
+ cachePages = -1
+
+ rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache, initialize=doInit, datasetType=datasetType)
if cachePages > rds.getDefaultCacheSize():
rds.setDBcache(cachePages)
- cacheVal = cachePages
else:
- cacheVal = rds.getDefaultCacheSize()
-
- doIndex = False
- if '--index' in argv:
- doIndex = True
+ cachePages = rds.getDefaultCacheSize()
- tableList = []
- if '--table' in argv:
- tableList.append(argv[argv.index('-table') + 1])
- else:
+ if tableList == []:
tableList = rds.getTables()
- combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache)
-
+ if withFlag != "":
+ print "restrict to flag = %s" % withFlag
-def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False):
metaDict = rds.getMetadata()
if "numberImports" not in metaDict:
origIndex = 0
- rds.insertMetadata([("numberImports", str(0))])
+ rds.insertMetadata([("numberImports", "0")])
else:
origIndex = int(metaDict["numberImports"])
index = origIndex
for inputfile in infileList:
- asname = "input" + str(index)
- rds.attachDB(inputfile,asname)
+ dbName = "input%s" % str(index)
+ rds.attachDB(inputfile, dbName)
for table in tableList:
print "importing table %s from file %s" % (table, inputfile)
- ascols = "*"
+ dbColumns = "*"
if table == "uniqs":
- ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+ dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
elif table == "multi":
- ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+ dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
elif table == "splices":
- ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname
- elif table == "metadata":
- ascols = "name, value || ' (import_%d)'" % index
- rds.importFromDB(asname, table, ascols)
+ dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
- if table != "metadata":
- rds.importFromDB(asname, table, ascols, withFlag)
+ if table == "metadata":
+ dbColumns = "name, value || ' (import_%d)'" % index
+ rds.importFromDB(dbName, table, dbColumns)
+ else:
+ rds.importFromDB(dbName, table, dbColumns, flagged=withFlag)
- rds.detachDB(asname)
- rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))])
+ rds.detachDB(dbName)
+ rds.insertMetadata([("import_%s" % str(index), "%s %s" % (inputfile, str(tableList)))])
index += 1
rds.updateMetadata("numberImports", index, origIndex)
if doIndex:
print "building index...."
- if cacheVal > 0:
- rds.buildIndex(cacheVal)
+ if cachePages > 0:
+ rds.buildIndex(cachePages)
else:
rds.buildIndex()
commoncodeVersion = 5.6
currentRDSversion = 2.0
+class ErangeError(Exception):
+ pass
+
def getReverseComplement(base):
revComp = {"A": "T",
return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
- hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+ genome = Genome(genomeName, inRAM=inRAM)
if extendGenome != "":
- hg.extendFeatures(extendGenome, replace=replaceModels)
+ genome.extendFeatures(extendGenome, replace=replaceModels)
- geneannotDict = hg.allAnnotInfo()
+ geneannotDict = genome.allAnnotInfo()
return geneannotDict
mergeCount = 0
chromField = int(chromField)
count = 0
- #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
- # exits at the first merge. This is not a problem when the input is sorted by start position, but in
- # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
- # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
- # be merged with A but that will exit the loop and the merge with C will be missed.
for regionEntry in regionList:
if regionEntry[0] == "#":
if "pvalue" in regionEntry:
"""
if shift == "auto":
- shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+ shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
topPos = getPeakPositionList(smoothArray, length)
- peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+ peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
if leftPlus:
numLeftPlus = 0
return peak
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
bestShift = 0
lowestScore = 20000000000
for testShift in xrange(maxShift + 1):
shiftArray = array("f", [0.] * length)
- for read in hitList:
+ for read in readList:
currentpos = read["start"] - start
if read["sense"] == "+":
currentpos += testShift
if (currentpos < 1) or (currentpos >= length):
continue
- if doWeight:
+ if useWeight:
weight = read["weight"]
else:
weight = 1.0
hitIndex += 1
currentpos += 1
- while hitIndex < readlen and currentpos < length:
+ while hitIndex < readlen and currentpos < length:
seqArray[currentpos] += weight
hitIndex += 1
currentpos += 1
if topNucleotide < smoothArray[currentpos]:
topNucleotide = smoothArray[currentpos]
peakList = [currentpos]
- elif topNucleotide == smoothArray[currentpos]:
+ elif topNucleotide == smoothArray[currentpos]:
peakList.append(currentpos)
return peakList
return featuresByChromDict
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
lengthCDS=0, keepSense=False, adjustToNeighbor=True):
""" return a dictionary of gene loci. Can be used to retrieve additional
print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
return locusByChromDict
- genome = genomeObject.genome
- featuresDict = genomeObject.getallGeneFeatures()
- if len(additionalRegionsDict) > 0:
- sortList = []
- for chrom in additionalRegionsDict:
- for region in additionalRegionsDict[chrom]:
- label = region.label
- if label not in sortList:
- sortList.append(label)
-
- if label not in featuresDict:
- featuresDict[label] = []
- sense = "+"
- else:
- sense = featuresDict[label][0][-1]
-
- featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
- for gid in sortList:
- featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
-
+ genomeName = genome.genome
+ featuresDict = getGeneFeatures(genome, additionalRegionsDict)
for gid in featuresDict:
featureList = featuresDict[gid]
newFeatureList = []
distance = upstream
if adjustToNeighbor:
- nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+ nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
if nextGene < distance * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstart -= distance
if downstream > 0:
distance = downstream
if adjustToNeighbor:
- nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+ nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
if nextGene < downstream * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstop += distance
- if lengthCDS > 0:
- if lengthCDS < glen:
- gstop = newFeatureList[0][0] + lengthCDS
+ if 0 < lengthCDS < glen:
+ gstop = newFeatureList[0][0] + lengthCDS
- if lengthCDS < 0:
- if abs(lengthCDS) < glen:
- gstart = newFeatureList[-1][1] + lengthCDS
+ if lengthCDS < 0 and abs(lengthCDS) < glen:
+ gstart = newFeatureList[-1][1] + lengthCDS
else:
if not useCDS and upstream > 0:
if upstreamSpanTSS:
distance = upstream
if adjustToNeighbor:
- nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+ nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
if nextGene < distance * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstop += distance
if downstream > 0:
distance = downstream
if adjustToNeighbor:
- nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+ nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
if nextGene < downstream * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstart -= distance
- if lengthCDS > 0:
- if lengthCDS < glen:
- gstart = newFeatureList[-1][-1] - lengthCDS
+ if 0 < lengthCDS < glen:
+ gstart = newFeatureList[-1][-1] - lengthCDS
- if lengthCDS < 0:
- if abs(lengthCDS) < glen:
- gstop = newFeatureList[0][0] - lengthCDS
+ if lengthCDS < 0 and abs(lengthCDS) < glen:
+ gstop = newFeatureList[0][0] - lengthCDS
- glen = abs(gstop - gstart)
if chrom not in locusByChromDict:
locusByChromDict[chrom] = []
return locusByChromDict
+def getGeneFeatures(genome, additionalRegionsDict):
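+ """ return the gene features dictionary for the genome, folding in any
+ custom regions from additionalRegionsDict and re-sorting the
+ affected entries by start position.
+ """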
+ featuresDict = genome.getallGeneFeatures()
+ if len(additionalRegionsDict) > 0:
+ sortList = []
+ for chrom in additionalRegionsDict:
+ for region in additionalRegionsDict[chrom]:
+ label = region.label
+ if label not in sortList:
+ sortList.append(label)
+
+ if label not in featuresDict:
+ featuresDict[label] = []
+ sense = "+"
+ else:
+ sense = featuresDict[label][0][-1]
+
+ featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+ for gid in sortList:
+ featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+ return featuresDict
+
+
def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
binLength=-1):
rlen = regionTuple[lengthField]
try:
rsense = regionTuple[senseField]
- except:
+ except IndexError:
rsense = "F"
if tagStart > stop:
print time.ctime()
- if doSplices:
- print "getting splices"
- splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
- print "got splices"
-
print "getting uniq reads"
uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
print "got uniqs"
if doSplices:
- for readID in splicesDict:
- theRead = splicesDict[readID]
- read0 = theRead[0]
- del read0[1]
- try:
- uniqDict[readID].append(read0)
- except:
- if len(theRead) == 4:
- read2 = theRead[2]
- del read2[1]
- uniqDict[readID] = [read0,read2]
+ addSplicesToUniqReads(RDS, uniqDict)
if doVerbose:
print len(uniqDict), time.ctime()
outfile = open(outfilename,"w")
-
diffChrom = 0
distal = 0
total = 0
continue
else:
outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
- outfile.write(outline + "\n")
+ print >> outfile, outline
if doVerbose:
print diffChrom, outline
else:
dist = abs(start1 - start2)
-
if minDist < dist < maxDist:
distal += 1
outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
- outfile.write(outline + "\n")
+ print >> outfile, outline
if doVerbose:
print distal, outline
print time.ctime()
+def addSplicesToUniqReads(RDS, uniqDict):
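+ """ merge entries derived from the spliced reads into uniqDict so that
+ splices can take part in read pairing.
+ """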
+ print "getting splices"
+ splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+ print "got splices"
+ for readID in splicesDict:
+ theRead = splicesDict[readID]
+ read0 = theRead[0]
+ del read0[1]
+ try:
+ uniqDict[readID].append(read0)
+ except KeyError:
+ if len(theRead) == 4:
+ read2 = theRead[2]
+ del read2[1]
+ uniqDict[readID] = [read0, read2]
+
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file
import sys
import time
import optparse
+import string
import ReadDataset
-from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
print "farPairs: version 1.4"
outfilename = args[1]
outbedname = args[2]
- farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+ farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
options.cachePages, options.minDist, options.maxDist, options.minCount,
options.label)
def getParser(usage):
parser = optparse.OptionParser(usage=usage)
- parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
parser.add_option("--cache", type="int", dest="cachePages")
parser.add_option("--verbose", action="store_true", dest="doVerbose")
parser.add_option("--minDist", type="int", dest="minDist")
configParser = getConfigParser()
section = "farPairs"
- sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
cachePages = getConfigOption(configParser, section, "cachePages", None)
minDist = getConfigIntOption(configParser, section, "minDist", 1000)
minCount = getConfigIntOption(configParser, section, "minCount", 2)
label = getConfigOption(configParser, section, "label", None)
- parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+ parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
return parser
-def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
+ flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
+
+ outfile = open(outfilename, "w")
+ for region in flagDict:
+ regionConnections = countDuplicatesInList(flagDict[region])
+ for (connectedRegion, count) in regionConnections:
+ if count >= minCount:
+ outline = "%s\t%s\t%d" % (region, connectedRegion, count)
+ print >> outfile, outline
+ if doVerbose:
+ print outline
+
+ outfile.close()
+ if doVerbose:
+ print "finished: ", time.ctime()
+
+
+def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
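+ """ scan each chromosome of the RDS file for distal read pairs, write
+ them to the BED file and return the dictionary of connected
+ region flags.
+ """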
doCache = False
if cachePages is not None:
doCache = True
if doVerbose:
print time.ctime()
-
- total = 0
- outfile = open(outfilename, "w")
+
outbed = open(outbedname, "w")
outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+ outbed.close()
readlen = RDS.getReadSize()
flagDict = {}
if doNotProcessChromosome(chromosome):
continue
- print chromosome
- uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
- if doVerbose:
- print len(uniqDict), time.ctime()
-
- for readID in uniqDict:
- readList = uniqDict[readID]
- if len(readList) == 2:
- total += 1
- start1 = readList[0]["start"]
- flag1 = readList[0]["flag"]
- start2 = readList[1]["start"]
- flag2 = readList[1]["flag"]
-
- if flag1 != flag2:
- dist = abs(start1 - start2)
- startList = [start1, start2]
- stopList = [start1 + readlen, start2 + readlen]
- startList.sort()
- stopList.sort()
- if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
- outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
- outbed.write(outputLine)
- if doVerbose:
- print flag1, flag2, dist
-
- try:
- flagDict[flag1].append((flag2, start1, start2))
- except KeyError:
- flagDict[flag1] = [(flag2, start1, start2)]
-
- try:
- flagDict[flag2].append((flag1, start1, start2))
- except KeyError:
- flagDict[flag2] = [(flag2, start1, start2)]
+ writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
print "%d connected regions" % len(flagDict)
- for region in flagDict:
- flagDict[region].sort()
- regionConnections = {}
- for (region2, start1, start2) in flagDict[region]:
+ return flagDict
+
+
+def doNotProcessChromosome(chrom):
+ return chrom == "chrM"
+
+
+def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
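+ """ write the distal read pairs found on one chromosome to the BED
+ file and record the flags of the connected regions in flagDict.
+ """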
+ outbed = open(outbedname, "a")
+ print chromosome
+ uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+ if doVerbose:
+ print len(uniqDict), time.ctime()
+
+ for readID in uniqDict:
+ readList = uniqDict[readID]
+ if readsAreFarPair(readList, minDist, maxDist):
+ start1 = readList[0]["start"]
+ start2 = readList[1]["start"]
+ startList = [start1, start2]
+ startList.sort()
+ outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
+ outbed.write(outputLine)
+ flag1 = readList[0]["flag"]
+ flag2 = readList[1]["flag"]
+ if doVerbose:
+ print flag1, flag2, abs(start1 - start2)
+
try:
- regionConnections[region2] += 1
+ flagDict[flag1].append(flag2)
except KeyError:
- regionConnections[region2] = 1
+ flagDict[flag1] = [flag2]
- for region2 in regionConnections:
- if regionConnections[region2] >= minCount:
- outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
- if doVerbose:
- print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+ try:
+ flagDict[flag2].append(flag1)
+ except KeyError:
+ flagDict[flag2] = [flag1]
- outfile.close()
outbed.close()
- if doVerbose:
- print "finished: ", time.ctime()
-def doNotProcessChromosome(chrom):
- return chrom == "chrM"
+def readsAreFarPair(readList, minDist, maxDist):
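+ """ return True if the two reads fall in different flagged regions and
+ are separated by a distance between minDist and maxDist.
+ """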
+ isFarPair = False
+ if len(readList) == 2:
+ flag1 = readList[0]["flag"]
+ flag2 = readList[1]["flag"]
+ if flag1 != flag2 and flag1 != "" and flag2 != "":
+ start1 = readList[0]["start"]
+ start2 = readList[1]["start"]
+ dist = abs(start1 - start2)
+ if minDist < dist < maxDist:
+ isFarPair = True
+ return isFarPair
-def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
- readSizes = "%d" % (stopList[0] - startList[0])
- readCoords = "0"
+
+def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
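+ """ return a single BED12-style line describing the read as numPieces
+ blocks of readlen starting at the positions in startList, colored
+ by strand.
+ """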
+ sizes = ["%d" % readlen]
+ coords = ["0"]
leftStart = startList[0] - 1
- rightStop = stopList[-1]
+ rightStop = startList[-1] + readlen
for index in range(1, numPieces):
- readSizes += ",%d" % (stopList[index] - startList[index] + 1)
- readCoords += ",%d" % (startList[index] - startList[0])
+ sizes.append("%d" % (readlen + 1))
+ coords.append("%d" % (startList[index] - startList[0]))
if rsense == "+":
- senseCode = plusSense
+ senseCode = plusSenseColor
else:
- senseCode = minusSense
+ senseCode = minusSenseColor
+ readSizes = string.join(sizes, ",")
+ readCoords = string.join(coords, ",")
outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+
return outline
def makeParser(usage=""):
parser = optparse.OptionParser(usage=usage)
- parser.add_option("--cistype", action="store_false", dest="cistype")
+ parser.add_option("--cistype", dest="cistype")
parser.add_option("--radius", type="int", dest="radius")
configParser = getConfigParser()
def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
- tabfile = open(tabFileName)
- previous = ""
+
+ posList = getPositionList(tabFileName)
+ feats = featuresIntersecting("human", posList, radius, cistype)
+ featkeys = feats.keys()
+ featkeys.sort()
+ for (chrom, pos) in featkeys:
+ print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+
+def getPositionList(tabFileName):
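+ """ return a list of (chrom, midpoint) tuples parsed from the
+ tab-delimited file.
+ """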
+ previous = ""
posList = []
+ tabfile = open(tabFileName)
for line in tabfile:
fields = line.split("\t")
current = fields[0]
chrom = fields[1][3:]
posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
- feats = featuresIntersecting("human", posList, radius, cistype)
- featkeys = feats.keys()
- featkeys.sort()
- for (chrom, pos) in featkeys:
- print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+ return posList
if __name__ == "__main__":
strandfilter=None, combine5p=False):
shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-
- if trimValue is not None:
- trimValue = float(trimValue) / 100.
- trimString = "%2.1f%s" % ((100. * trimValue), "%")
- else:
- trimValue = 0.1
- trimString = "10%"
-
- if not doTrim:
- trimString = "none"
+ doControl = False
+ if mockfile is not None:
+ doControl = True
if doRevBackground:
print "Swapping IP and background to calculate FDR"
pValueType = "back"
- doControl = False
- if mockfile is not None:
- doControl = True
-
doPvalue = True
if ptype is not None:
ptype = ptype.upper()
else:
pValueType = "self"
- if cachePages is not None:
- doCache = True
- else:
- doCache = False
- cachePages = -1
-
if withFlag != "":
print "restrict to flag = %s" % withFlag
stringency = max(stringency, 1.0)
writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+ if cachePages is not None:
+ doCache = True
+ else:
+ doCache = False
+ cachePages = -1
+
if doControl:
print "\ncontrol:"
mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
+ if trimValue is not None:
+ trimValue = float(trimValue) / 100.
+ trimString = "%2.1f%s" % ((100. * trimValue), "%")
+ else:
+ trimValue = 0.1
+ trimString = "10%"
+
+ if not doTrim:
+ trimString = "none"
+
if doAppend:
fileMode = "a"
else:
fileMode = "w"
outfile = open(outfilename, fileMode)
-
outfile.write("#ERANGE %s\n" % versionString)
if doControl:
mockRDSsize = len(mockRDS) / 1000000.
foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
if foldRatio >= minRatio:
- localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+ localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
try:
shiftDict[localshift] += 1
except KeyError:
def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
- normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+ normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
noMulti, doControl, factor, trimValue, outputRegionList=False):
weight = read["weight"]
if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
sumAll = currentTotalWeight
- if normalize:
+ if normalizeToRPM:
sumAll /= rdsSampleSize
regionStart = currentHitList[0]
if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
sumMulti = 0.
#first pass uses getFoldRatio on mockRDS as there may not be control
- foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+ foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
if foldRatio >= minRatio:
# first pass, with absolute numbers
peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
peakScore = peak.smoothArray[bestPos]
numPlus = peak.numPlus
shift = peak.shift
- numLeft = peak.numLeft
- if normalize:
+ numLeft = peak.numLeftPlus
+ if normalizeToRPM:
peakScore /= rdsSampleSize
if doTrim:
sumAll = trimPeak.numHits
numPlus = trimPeak.numPlus
- numLeft = trimPeak.numLeft
- if normalize:
+ numLeft = trimPeak.numLeftPlus
+ if normalizeToRPM:
sumAll /= rdsSampleSize
- foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+ foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
if outputRegionList:
sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
# just in case it changed, use latest data
except:
continue
- # normalize to RPM
- if normalize:
+ if normalizeToRPM:
peakScore /= rdsSampleSize
elif outputRegionList:
sumMulti = currentTotalWeight - currentUniqReadCount
if outputRegionList:
- # normalize to RPM
- if normalize:
+ if normalizeToRPM:
sumMulti /= rdsSampleSize
try:
return footer
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)
import optparse
import ReadDataset
from cistematic.genomes import Genome
-from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
print "geneDownstreamBins: version 2.1"
def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
- bins = 10
- standardMinThresh = standardMinDist / bins
hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
normalizationFactor = 1.0
geneinfoDict = getGeneInfoDict(genome, cache=True)
hg = Genome(genome)
featuresDict = hg.getallGeneFeatures()
-
outfile = open(outfilename, "w")
-
gidList = hg.allGIDs()
gidList.sort()
for gid in gidList:
- symbol = "LOC" + gid
- geneinfo = ""
- featureList = []
try:
- geneinfo = geneinfoDict[gid]
- featureList = featuresDict[gid]
- symbol = geneinfo[0][0]
- except:
+ featuresList = featuresDict[gid]
+ except KeyError:
print gid
+ continue
- if len(featureList) == 0:
+ try:
+ binList, symbol, geneLength, tagCount = getDownstreamBins(hg, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
+ except ErangeError:
continue
- newfeatureList = []
- for (ftype, chrom, start, stop, fsense) in featureList:
- if (start, stop) not in newfeatureList:
- newfeatureList.append((start, stop))
+ tagCount *= normalizationFactor
+ print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
+ outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
+ for binAmount in binList:
+ outfile.write("\t%.2f" % binAmount)
- if chrom not in hitDict:
- continue
+ outfile.write("\n")
- newfeatureList.sort()
- if len(newfeatureList) < 1:
- continue
+ outfile.close()
- glen = standardMinDist
- if fsense == "F":
- nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
- if nextGene < glen * 2:
- glen = nextGene / 2
- if glen < 1:
- glen = 1
+def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
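+ """ bin the reads downstream of gene gid into ten bins spanning
+ standardMinDist; raises ErangeError when no usable features are
+ found, the downstream region is too short, or fewer than two tags
+ land in it.
+ """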
- gstart = newfeatureList[-1][1]
- else:
- nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
- if nextGene < glen * 2:
- glen = nextGene / 2
+ symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
+ geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
+ if geneLength < standardMinDist:
+ raise ErangeError("gene length less than minimum")
- if glen < 1:
- glen = 1
+ binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
+ if tagCount < 2:
+ raise ErangeError("tag count less than minimum")
- gstart = newfeatureList[0][0] - glen
- if gstart < 0:
- gstart = 0
+ return binList, symbol, geneLength, tagCount
- tagCount = 0
- if glen < standardMinDist:
- continue
- binList = [0.] * bins
- for read in hitDict[chrom]:
- tagStart = read["start"]
- weight = read["weight"]
- tagStart -= gstart
- if tagStart >= glen:
- break
-
- if tagStart > 0:
- tagCount += weight
- if fsense == "F":
- # we are relying on python's integer division quirk
- binID = tagStart / standardMinThresh
- binList[binID] += weight
- else:
- rdist = glen - tagStart
- binID = rdist / standardMinThresh
- binList[binID] += weight
-
- if tagCount < 2:
- continue
+def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
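+ """ return the gene symbol, the de-duplicated (start, stop) feature
+ list, the sense and the chromosome; raises ErangeError if there
+ are no features or the chromosome has no reads.
+ """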
+ if len(featureList) == 0:
+ raise ErangeError("no features found")
- tagCount *= normalizationFactor
- print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
- outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
- for binAmount in binList:
- outfile.write("\t%.2f" % binAmount)
+ symbol = "LOC%s" % gid
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[gid]
+ symbol = geneinfo[0][0]
+ except KeyError:
+ print gid
- outfile.write("\n")
+ newfeatureList = []
+ for (ftype, chrom, start, stop, sense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+
+ if len(newfeatureList) < 1:
+ raise ErangeError("no features found")
+
+ if chrom not in chromosomeList:
+ raise ErangeError("chromosome not found in reads")
+
+ newfeatureList.sort()
+
+ return symbol, newfeatureList, sense, chrom
- outfile.close()
+def getGeneStats(genome, gid, minDistance, featureList, sense):
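+ """ return the start coordinate and length of the region downstream of
+ the gene, shortening the length when the neighboring gene is
+ closer than twice that distance.
+ """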
+ geneLength = minDistance
+ if sense == "F":
+ nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
+ geneStart = featureList[-1][1]
+ else:
+ nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
+ geneStart = max(featureList[0][0] - geneLength, 0)
+
+ if nextGene < geneLength * 2:
+ geneLength = nextGene / 2
+
+ geneLength = max(geneLength, 1)
+
+ return geneStart, geneLength
+
+
+def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
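+ """ distribute read weights into ten fixed-width bins measured from
+ geneStart, filling bins left to right for sense "F" and right to
+ left otherwise.
+ """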
+ tagCount = 0
+ bins = 10
+ standardMinThresh = standardMinDist / bins
+ binList = [0.] * bins
+ for read in readList:
+ tagStart = read["start"]
+ if tagStart >= geneLength:
+ break
+
+ tagStart -= geneStart
+ weight = read["weight"]
+ if tagStart > 0:
+ tagCount += weight
+ if sense == "F":
+ # we are relying on python's integer division quirk
+ binID = tagStart / standardMinThresh
+ else:
+ rdist = geneLength - tagStart
+ binID = rdist / standardMinThresh
+
+ binList[binID] += weight
+
+ return binList, tagCount
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file
import sys
import optparse
+import string
from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
import ReadDataset
from cistematic.genomes import Genome
return parser
+
def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
acceptfile=None):
+
+ hg = Genome(genome)
+ gidList = hg.allGIDs()
+ gidList.sort()
if acceptfile is None:
acceptDict = {}
else:
acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+ for chrom in acceptDict:
+ for region in acceptDict[chrom]:
+ if region.label not in gidList:
+ gidList.append(region.label)
+
+ if doFlank:
+ locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+ else:
+ locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+ hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
readlen = hitRDS.getReadSize()
normalizationFactor = 1.0
if normalizeBins:
totalCount = len(hitRDS)
normalizationFactor = totalCount / 1000000.
- hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
- hg = Genome(genome)
- geneinfoDict = getGeneInfoDict(genome, cache=doCache)
- if doFlank:
- locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
- else:
- locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
- gidList = hg.allGIDs()
- gidList.sort()
- for chrom in acceptDict:
- for region in acceptDict[chrom]:
- if region.label not in gidList:
- gidList.append(region.label)
-
(gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+ geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+ writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
- outfile = open(outfilename,'w')
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
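+ """ write one tab-delimited line per gid giving the symbol, total tag
+ count, locus length and per-bin counts, expressed as percentages
+ of the total when normalizeBins is True.
+ """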
+ outfile = open(outfilename, "w")
for gid in gidList:
- if 'FAR' not in gid:
- symbol = 'LOC' + gid
- geneinfo = ''
+ if "FAR" not in gid:
+ symbol = "LOC%s" % gid
+ geneinfo = ""
try:
geneinfo = geneinfoDict[gid]
symbol = geneinfo[0][0]
- except:
+ except KeyError:
pass
else:
symbol = gid
+
if gid in gidBins and gid in gidLen:
tagCount = 0.
for binAmount in gidBins[gid]:
tagCount += binAmount
- outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+ outputList = [gid, symbol, "%.1f" % tagCount, "%d" % gidLen[gid]]
for binAmount in gidBins[gid]:
if normalizeBins:
- if tagCount == 0:
- tagCount = 1
- outfile.write('\t%.1f' % (100. * binAmount / tagCount))
- else:
- outfile.write('\t%.1f' % binAmount)
- outfile.write('\n')
+ try:
+ normalizedValue = 100. * binAmount / tagCount
+ except ZeroDivisionError:
+ normalizedValue = 100. * binAmount
+
+ binAmount = normalizedValue
+
+ outputList.append("%.1f" % binAmount)
+
+ outLine = string.join(outputList, "\t")
+ outfile.write("%s\n" % outLine)
+
outfile.close()
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)
import psyco
psyco.full()
except:
- print 'psyco not running'
+ print "psyco not running"
import sys
import optparse
return parser
-#TODO: Reported user performance issue. Long run times in conditions:
-# small number of reads ~40-50M
-# all features on single chromosome
-#
-# User states has been a long time problem.
-
def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
withUniqs=False, withMulti=False, acceptfile=None,
cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
for line in uniquecounts:
fields = line.strip().split()
# add a pseudo-count here to ease calculations below
- #TODO: figure out why this was done in prior implementation...
uniqueCountDict[fields[0]] = float(fields[-1]) + 1
uniquecounts.close()
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)
def getNovelSNPInfo(genome, snpEntry, hg):
fields = snpEntry.split()
- #TODO: refactor naming. is fields[8] rpkm?
if fields[8].find("N\A") == -1:
return snpEntry
else:
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)
verstring = "makerdsfromblat: version 3.10"
print verstring
+NUM_HEADER_LINES = 5
+
+
def main(argv=None):
if not argv:
argv = sys.argv
verbose=False, cachePages=100000, geneDataFileName="",
propertyList=[]):
- delimiter = "|"
- minIntron = 10
- maxBorder = 0
- index = 0
- insertSize = 100000
-
+ writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
if forceRNA:
print "forcing datatype to RNA"
dataType = "RNA"
- if dataType == "RNA":
- genedatafile = open(geneDataFileName)
-
- writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
-
geneDict = {}
mapDict = {}
if dataType == "RNA" and not forceRNA:
+ genedatafile = open(geneDataFileName)
for line in genedatafile:
fields = line.strip().split("\t")
blockCount = int(fields[7])
# make some assumptions based on first read
infile = open(filename, "r")
- for arg in range(6):
+ for arg in range(NUM_HEADER_LINES):
line = infile.readline()
+ line = infile.readline()
fields = line.split()
readsize = int(fields[10])
pairedTest = fields[9][-2:]
rds.insertMetadata([("blat_mapped", "True")])
minReadScore = readsize - readsize/25 - 1
- trim = -4
+ maxBorder = 0
if dataType == "RNA":
+ trim = -4
maxBorder = readsize + trim
infile = open(filename, "r")
index = uIndex = mIndex = sIndex = lIndex = 0
bestScore = 0
# skip headers
- for arg in range(5):
+ for arg in range(NUM_HEADER_LINES):
line = infile.readline()
+ insertSize = 100000
+ delimiter = "|"
+ minIntron = 10
for line in infile:
lIndex += 1
fields = line.strip().split()
flip=False, verbose=False, stripSpace=False, cachePages=100000,
propertyList=[]):
- delimiter = "|"
+ writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
+ geneDict = {}
dataType = "DNA"
if genedatafilename is not None:
dataType = "RNA"
genedatafile = open(genedatafilename)
-
-
- forcePair = False
- if forceID is not None:
- forcePair = True
- else:
- forceID = 0
-
- maxBorder = 0
- index = 0
- insertSize = 100000
-
- writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
-
- geneDict = {}
- mapDict = {}
- if dataType == "RNA":
for line in genedatafile:
fields = line.strip().split("\t")
blockCount = int(fields[7])
totalLength += exonLengths[index]
geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
- mapDict[uname] = []
genedatafile.close()
fields = line.split()
readsize = len(fields[5])
pairedTest = fields[0][-2:]
+ forcePair = False
+ if forceID is not None:
+ forcePair = True
+ else:
+ forceID = 0
+
paired = False
if pairedTest in ["/1", "/2"] or forcePair:
print "assuming reads are paired"
paired = True
-
print "read size: %d bp" % readsize
if init:
rds.insertMetadata([("readsize", readsize)])
infile.close()
- trim = -4
+ maxBorder = 0
if dataType == "RNA":
+ trim = -4
maxBorder = readsize + trim
infile = open(filename, "r")
mInsertList = []
sInsertList = []
index = uIndex = mIndex = sIndex = lIndex = 0
+ delimiter = "|"
+ insertSize = 100000
for line in infile:
lIndex += 1
if stripSpace:
insertSize = 100000
geneDict = {}
- mapDict = {}
- seenSpliceList = []
if dataType == 'RNA':
genedatafile = open(geneDataFileName)
for line in genedatafile:
chrom = fields[1]
sense = fields[2]
chromstarts = fields[8][:-1].split(',')
- chromstops = fields[9][:-1].split(',')
- exonLengths = []
- totalLength = 0
for index in range(blockCount):
chromstarts[index] = int(chromstarts[index])
- chromstops[index] = int(chromstops[index])
- exonLengths.append(chromstops[index] - chromstarts[index])
- totalLength += exonLengths[index]
- geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
- mapDict[uname] = []
+ geneDict[uname] = (sense, blockCount, chrom, chromstarts)
+
genedatafile.close()
rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
print '%d unique reads' % index
infile.close()
+ seenSpliceList = []
if dataType == 'RNA':
print 'mapping splices...'
index = 0
print fields
continue
- (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+ (sense, blockCount, chrom, chromstarts) = geneDict[model]
if extended:
if 'F' in thepos:
rsense = '+'
from cistematic.genomes import Genome
+class GeneSymbolAndCount():
+
+ def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
+ self.symbol = symbol
+ self.uniqueCount = uniqueCount
+ self.spliceCount = spliceCount
+
+
def main(argv=None):
if not argv:
argv = sys.argv
def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
hg = Genome(genome)
- gidDict = {}
- gidList = []
- uniqueCountDict = {}
- spliceCountDict = {}
+ gidDict = getGeneData(uniquefilecount, splicefilecount)
+ gidList = gidDict.keys()
+ gidList.sort()
+
+ outfile = open(outfilename, "w")
+ for gid in gidList:
+ featureList = hg.getGeneFeatures((genome, gid))
+ featuresizesum, splicearea = getStats(featureList, splicelead)
+ fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+ geneData = gidDict[gid]
+ uniqueCount = geneData.uniqueCount
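+        # expected splice reads: scale the unique count up by the exonic coverage fraction, then subtract the unique reads themselves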
+ expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
+
+ # this p-value is based on the observed unique count, not the expected total count
+ # nor the multi-read adjusted count
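+        # equivalently, the chance that at least one of the uniqueCount uniformly placed reads lands within splicelead bases of a junction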
+ pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
+ symbol = geneData.symbol
+ spliceCount = geneData.spliceCount
+ print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
+ outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
+ outfile.close()
+
+
+def getGeneData(uniquefilecount, splicefilecount):
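+    # build a gid -> GeneSymbolAndCount map: symbol and unique count from the unique count file, splice count from the splice count file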
+ gidDict = {}
uniquefile = open(uniquefilecount)
for line in uniquefile:
fields = line.strip().split()
- gidDict[fields[0]] = fields[1]
- gidList.append(fields[0])
- uniqueCountDict[fields[0]] = int(fields[2])
+ geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
+ gidDict[fields[0]] = geneData
+ uniquefile.close()
splicefile = open(splicefilecount)
for line in splicefile:
fields = line.strip().split()
- spliceCountDict[fields[0]] = int(fields[2])
+ gidDict[fields[0]].spliceCount = int(fields[2])
- outfile = open(outfilename,'w')
+ splicefile.close()
- gidList.sort()
- for gid in gidList:
- symbol = gidDict[gid]
- featureList = hg.getGeneFeatures((genome, gid))
- newfeatureList = []
- featuresizesum = 0
- for (ftype, chrom, start, stop, sense) in featureList:
- if (start, stop) not in newfeatureList:
- newfeatureList.append((start, stop))
- featuresizesum += stop - start + 1
+ return gidDict
- if featuresizesum < 1:
- featuresizesum = 1
- splicearea = (len(newfeatureList) - 1) * splicelead
- if splicearea < splicelead:
- splicearea = 0
+def getStats(featureList, splicelead):
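+    # featuresizesum: total bases covered by the gene's distinct features; splicearea: bases available to junction reads (splicelead per internal junction)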
+ newfeatureList = []
+ featuresizesum = 0
+ for (ftype, chrom, start, stop, sense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+ featuresizesum += stop - start + 1
- fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
- expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+ if featuresizesum < 1:
+ featuresizesum = 1
- # this p-value is based on the observed unique count, not the expected total count
- # nor the multi-read adjusted count
- pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
- print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
- outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+ splicearea = (len(newfeatureList) - 1) * splicelead
+ if splicearea < splicelead:
+ splicearea = 0
- outfile.close()
+ return featuresizesum, splicearea
if __name__ == "__main__":
#
import sys
+from commoncode import regionsOverlap
print "siteintersects: version 2.1"
+class Site():
+ def __init__(self, line, doExpanded=False):
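+        # expanded lines: fields[1] = chrN, fields[2] = start, fields[3] = stop; compact lines: fields[0] = chrN:start-stop; the leading chr prefix is stripped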
+ fields = line.strip().split()
+ if doExpanded:
+ self.chrom = fields[1][3:]
+ self.start = int(fields[2])
+ self.stop = int(fields[3])
+            self.fieldList = fields[4:]
+ else:
+ (chromosome, pos) = fields[0].split(":")
+ self.chrom = chromosome[3:]
+ (start, stop) = pos.split("-")
+ self.start = int(start)
+ self.stop = int(stop)
+            self.fieldList = fields[1:]
+
+
+
def main(argv=None):
if not argv:
argv = sys.argv
siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
- siteDict = {}
- file1Dict = {}
+ siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+ commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
- infile1count = 0
- infile = open(sitefilename1)
+ if doReject and siteRejectFilename is not None:
+ writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+ print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
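+    # parse the site file into chrom -> [(start, stop, rest)]; when rejects are requested, also keep the original lines keyed for reject output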
+ siteDict = {}
+ rejectSiteDict = {}
+ processedLineCount = 0
+ infile = open(sitefilename)
infile.readline()
for line in infile:
- if line[0] == "#":
+ if doNotProcessLine(line):
continue
- infile1count += 1
- fields = line.strip().split()
- if doExpanded:
- chrom = fields[1][3:]
- start = int(fields[2])
- stop = int(fields[3])
- rest = fields[4:]
- else:
- (chrom, pos) = fields[0].split(":")
- chrom = chrom[3:]
- (start, stop) = pos.split("-")
- start = int(start)
- stop = int(stop)
- rest = fields[1:]
+ processedLineCount += 1
+ site = Site(line, doExpanded)
+ chrom = site.chrom
+ start = site.start
+ stop = site.stop
+ rest = site.fieldList
try:
siteDict[chrom].append((start, stop, rest))
- except:
+ except KeyError:
siteDict[chrom] = [(start, stop, rest)]
if doReject:
- file1Dict[str((chrom, start, stop, rest))] = line
+ rejectSiteDict[str((chrom, start, stop, rest))] = line
infile.close()
+ print "file1: %d" % processedLineCount
- print "file1: %d" % infile1count
+ return siteDict, rejectSiteDict
- infile2count = 0
- infile = open(sitefilename2)
+
+def doNotProcessLine(line):
+ return line[0] == "#"
+
+
+def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
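+    # scan the second site file, write sites that overlap the first file's sites to outfilename, optionally write non-overlapping ones to rejectFilename, and return the number of common sites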
+ processedLineCount = 0
+ infile = open(siteFilename)
infile.readline()
commonSites = 0
- unique2List = []
+ if doReject and rejectFilename is not None:
+ rejectfile = open(rejectFilename, "w")
+ else:
+        doReject = False
+
outfile = open(outfilename, "w")
for line in infile:
- if line[0] == "#":
+ if doNotProcessLine(line):
continue
- infile2count += 1
- fields = line.strip().split()
- if doExpanded:
- chrom = fields[1][3:]
- start = int(fields[2])
- stop = int(fields[3])
- rest = fields[4:]
- else:
- (chrom, pos) = fields[0].split(":")
- chrom = chrom[3:]
- (start, stop) = pos.split("-")
- rest = str(fields[1:])
-
- start = int(start)
- stop = int(stop)
- mid = start + abs(stop - start)/2
+ processedLineCount += 1
+ site = Site(line, doExpanded)
+ chrom = site.chrom
+ start = site.start
+ stop = site.stop
if chrom not in siteDict:
if doReject:
- unique2List.append(line)
- continue
+ rejectfile.write(line)
+
+ continue
- twoNotCommon = True
+ siteNotCommon = True
for (rstart, rstop, rline) in siteDict[chrom]:
- rsize = abs(rstart - rstop) /2
- rmid = rstart + abs(rstop - rstart)/2
- if abs(mid - rmid) < rsize:
+ if regionsOverlap(start, stop, rstart, rstop):
commonSites += 1
- if twoNotCommon:
- outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
- twoNotCommon = False
+ if siteNotCommon:
+ outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
+ siteNotCommon = False
try:
if doReject:
- del file1Dict[str((chrom, rstart, rstop, rline))]
- except:
+ del rejectSiteDict[str((chrom, rstart, rstop, rline))]
+ except KeyError:
pass
- if doReject and twoNotCommon:
- unique2List.append(line)
+ if doReject and siteNotCommon:
+ rejectfile.write(line)
- outfile.close()
+ if doReject:
+ rejectfile.close()
- print "file2: %d" % infile2count
+ outfile.close()
- if doReject:
- reject1file = open(reject1filename, "w")
- reject2file = open(reject2filename, "w")
+ print "file2: %d" % processedLineCount
- for key in file1Dict:
- reject1file.write(file1Dict[key])
+ return commonSites
- for line in unique2List:
- reject2file.write(line)
- reject1file.close()
- reject2file.close()
+def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
+ rejectfile = open(siteRejectFilename, "w")
- print commonSites
+ for key in rejectSiteDict:
+ rejectfile.write(rejectSiteDict[key])
+ rejectfile.close()
+
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)
--- /dev/null
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import bedtoregion
+
+
+class TestBedToRegion(unittest.TestCase):
+
+ testBedFile = "erangeTestBedFile"
+ testRegionFile = "erangeTestRegionFile"
+
+
+ def setUp(self):
+ bedfile = open(self.testBedFile, "w")
+ bedfile.write("tab\tdelimited\tfields\n")
+ bedfile.write("space delimited fields\n")
+ bedfile.write("track\n")
+ bedfile.write("line after track will not be processed\n")
+ bedfile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.testBedFile)
+ except OSError:
+ print "bed file does not exist"
+
+ try:
+ os.remove(self.testRegionFile)
+ except OSError:
+ print "region file does not exist"
+
+
+ def testBedToRegion(self):
+ bedtoregion.bedToRegion("regionLabel", self.testBedFile, self.testRegionFile)
+ resultFile = open(self.testRegionFile)
+ regionList = resultFile.readlines()
+ self.assertEquals(2, len(regionList))
+ self.assertEquals("regionLabel1\ttab\tdelimited\tfields\n", regionList[0])
+ self.assertEquals("regionLabel2\tspace\tdelimited\tfields\n", regionList[1])
+
+
+ def testMain(self):
+ argv = ["bedtoregion"]
+ self.assertRaises(SystemExit, bedtoregion.main, argv)
+ argv = ["bedtoregion", "regionLabel", self.testBedFile, self.testRegionFile]
+ bedtoregion.main(argv)
+ resultFile = open(self.testRegionFile)
+ self.assertEquals(2, len(resultFile.readlines()))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestBedToRegion))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import binstocdf
+
+
+class TestBinsToCdf(unittest.TestCase):
+
+ testInputFile = "erangeTestBinFile"
+ testOutputFile = "erangeTestCDFFile"
+
+
+ def setUp(self):
+ binfile = open(self.testInputFile, "w")
+ binfile.write("field1\tfield2\t10\tfield4\t2\t3\n")
+ binfile.write("field1\tfield2\t20\tfield4\t1\t4\n")
+ binfile.write("field1\tfield2\t0\tfield4\n")
+ binfile.write("too short\n")
+ binfile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.testInputFile)
+ except OSError:
+ print "bin file does not exist"
+
+ try:
+ os.remove(self.testOutputFile)
+ except OSError:
+ print "cdf file does not exist"
+
+
+ def testBinsToCdf(self):
+ binstocdf.binToCDF(self.testInputFile, self.testOutputFile)
+ resultFile = open(self.testOutputFile)
+ resultList = resultFile.readlines()
+ self.assertEquals(3, len(resultList))
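+        # bin counts are rewritten as cumulative percentages of the line total in field 3: (2, 3) of 10 -> (20, 50), (1, 4) of 20 -> (5, 25)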
+ self.assertEquals("field1\tfield2\t10\tfield4\t20\t50\n", resultList[0])
+ self.assertEquals("field1\tfield2\t20\tfield4\t5\t25\n", resultList[1])
+ self.assertEquals("field1\tfield2\t0\tfield4\n", resultList[2])
+
+
+ def testMain(self):
+ argv = ["binstocdf"]
+ self.assertRaises(SystemExit, binstocdf.main, argv)
+ argv = ["binstocdf", self.testInputFile, self.testOutputFile]
+ binstocdf.main(argv)
+ resultFile = open(self.testOutputFile)
+ self.assertEquals(3, len(resultFile.readlines()))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestBinsToCdf))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import cdfdist
+
+
+class TestCdfDist(unittest.TestCase):
+
+ testInputFile = "erangeTestCDFFile"
+
+
+ def setUp(self):
+ cdffile = open(self.testInputFile, "w")
+ cdffile.write("line1 30 60\n")
+ cdffile.write("line2 90 99\n")
+ cdffile.write("line3 5 80\n")
+ cdffile.write("line4 10 14\n")
+ cdffile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.testInputFile)
+ except OSError:
+ print "cdf file does not exist"
+
+
+    def testCdfDist(self):
+ bins = 2
+ self.assertEquals([2, 2], cdfdist.cdfDist(bins, 10, self.testInputFile))
+ self.assertEquals([1, 2], cdfdist.cdfDist(bins, 50, self.testInputFile))
+ self.assertEquals([1, 0], cdfdist.cdfDist(bins, 89, self.testInputFile))
+ self.assertEquals([0, 1], cdfdist.cdfDist(bins, 91, self.testInputFile))
+
+
+ def testMain(self):
+ bins = 2
+ percent = 50
+ argv = ["cdfdist"]
+ self.assertRaises(SystemExit, cdfdist.main, argv)
+ argv = ["cdfdist", bins, percent, self.testInputFile]
+ cdfdist.main(argv)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestCdfDist))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
snpDB = "%s/dbSNP128.db" % dbPath
altSnpDB = "%s/snp129cDNA.db" % dbPath
+ inputFileName = "testChkSNP_input.txt"
+ outputFileName = "testChkSNP_output.txt"
def setUp(self):
pass
def testChkSNPFile(self):
- inputFileName = "testChkSNP_input.txt"
- infile = open(inputFileName, "w")
+ infile = open(self.inputFileName, "w")
infile.write("# header line\n")
snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
infile.write("%s\n" % snpEntry)
infile.write("%s\n" % snpEntry)
infile.close()
- outputFileName = "testChkSNP_output.txt"
-
- chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName)
- outfile = open(outputFileName, "r")
+ chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName)
+ outfile = open(self.outputFileName, "r")
line = outfile.readline()
result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
self.assertEquals(result, line)
line = outfile.readline()
self.assertEquals(result, line)
outfile.close()
- os.remove(outputFileName)
+ os.remove(self.outputFileName)
- chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB])
- outfile = open(outputFileName, "r")
+ chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName, snpDBList=[self.altSnpDB])
+ outfile = open(self.outputFileName, "r")
line = outfile.readline()
result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
self.assertEquals(result, line)
self.assertEquals(result, line)
outfile.close()
- os.remove(inputFileName)
- os.remove(outputFileName)
+ os.remove(self.inputFileName)
+ os.remove(self.outputFileName)
def testMain(self):
- inputFileName = "testChkSNP_input.txt"
- infile = open(inputFileName, "w")
+ infile = open(self.inputFileName, "w")
infile.write("# header line\n")
snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
infile.write("%s\n" % snpEntry)
infile.write("%s\n" % snpEntry)
infile.close()
- outputFileName = "testChkSNP_output.txt"
-
- argv = ["chksnp", self.snpDB, inputFileName, outputFileName]
+ argv = ["chksnp", self.snpDB, self.inputFileName, self.outputFileName]
chksnp.main(argv)
- outfile = open(outputFileName, "r")
+ outfile = open(self.outputFileName, "r")
line = outfile.readline()
result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
self.assertEquals(result, line)
line = outfile.readline()
self.assertEquals(result, line)
outfile.close()
- os.remove(outputFileName)
+
+ os.remove(self.inputFileName)
+ os.remove(self.outputFileName)
+
def testChkSNP(self):
snpPropertiesList = []
--- /dev/null
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import colsum
+
+
+class TestColsum(unittest.TestCase):
+
+ testInputFile = "erangeTestColsumFile"
+
+
+ def setUp(self):
+ colfile = open(self.testInputFile, "w")
+ colfile.write("line1 30 60.5\n")
+ colfile.write("line2 90 99\n")
+ colfile.write("line3 5 80\n")
+ colfile.write("line4 10 1\n")
+ colfile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.testInputFile)
+ except OSError:
+            print "colsum file does not exist"
+
+
+    def testColsum(self):
+ self.assertEquals(0, colsum.colsum(0, self.testInputFile))
+ self.assertEquals(135, colsum.colsum(1, self.testInputFile))
+ self.assertEquals(240.5, colsum.colsum(2, self.testInputFile))
+ self.assertEquals(0, colsum.colsum(3, self.testInputFile))
+
+
+ def testMain(self):
+ field = 1
+ argv = ["colsum"]
+ self.assertRaises(SystemExit, colsum.main, argv)
+ argv = ["colsum", field, self.testInputFile]
+ colsum.main(argv)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestColsum))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
import string
from array import array
from erange import commoncode
+from erange import Region
from cistematic.genomes import Genome
regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
testfile.write(regionEntry)
testfile.close()
- result = {"1": [(10, 20, 10)]}
- self.assertEquals(result, commoncode.getMergedRegions("regionTestFile"))
+ result = commoncode.getMergedRegions("regionTestFile")
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
os.remove("regionTestFile")
regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
regionList = [regionEntry]
- result = {"1": [(10, 20, 10)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
- result = {"1": [(5, 25, 20)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5))
- result = {"1": [(12, 18, 6)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2))
- result = {"chr1": [(10, 20, 10)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True))
+ result = commoncode.getMergedRegionsFromList(regionList)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, pad=5)
+ self.assertEquals(5, result["1"][0].start)
+ self.assertEquals(25, result["1"][0].stop)
+ self.assertEquals(20, result["1"][0].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, pad=-2)
+ self.assertEquals(12, result["1"][0].start)
+ self.assertEquals(18, result["1"][0].stop)
+ self.assertEquals(6, result["1"][0].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, fullChrom=True)
+ self.assertEquals(10, result["chr1"][0].start)
+ self.assertEquals(20, result["chr1"][0].stop)
+ self.assertEquals(10, result["chr1"][0].length)
regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
regionList = [regionEntry]
- result = {"1": [(10, 20, 10)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2))
+ result = commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
regionList = [regionEntry]
regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
regionList.append(regionEntry)
- result = {"1": [(10, 40, 30)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
- result = {"1": [(10, 20, 10), (15, 40, 25)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False))
- result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True))
+ result = commoncode.getMergedRegionsFromList(regionList)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(40, result["1"][0].stop)
+ self.assertEquals(30, result["1"][0].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, doMerge=False)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(15, result["1"][1].start)
+ self.assertEquals(40, result["1"][1].stop)
+ self.assertEquals(25, result["1"][1].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals("1", result["1"][0].label)
+ self.assertEquals(15, result["1"][1].start)
+ self.assertEquals(40, result["1"][1].stop)
+ self.assertEquals(25, result["1"][1].length)
+ self.assertEquals("2", result["1"][1].label)
regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
regionList = [regionEntry]
regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
regionList.append(regionEntry)
- result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2))
+ result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals("1\tspacer", result["1"][0].label)
+ self.assertEquals(15, result["1"][1].start)
+ self.assertEquals(40, result["1"][1].stop)
+ self.assertEquals(25, result["1"][1].length)
+ self.assertEquals("2\tspacer2", result["1"][1].label)
regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
regionList = [regionEntry]
regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
regionList.append(regionEntry)
- result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
- result = {"1": [(10, 2040, 2030)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000))
- result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5))
- result = {"1": [(2030, 2040, 10)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12))
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1))
+ result = commoncode.getMergedRegionsFromList(regionList)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(2030, result["1"][1].start)
+ self.assertEquals(2040, result["1"][1].stop)
+ self.assertEquals(10, result["1"][1].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, maxDist=3000)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(2040, result["1"][0].stop)
+ self.assertEquals(2030, result["1"][0].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, minHits=5)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(2030, result["1"][1].start)
+ self.assertEquals(2040, result["1"][1].stop)
+ self.assertEquals(10, result["1"][1].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+ self.assertEquals(2030, result["1"][0].start)
+ self.assertEquals(2040, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+        result = commoncode.getMergedRegionsFromList(regionList, returnTop=1)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
regionList = [regionEntry]
regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
regionList.append(regionEntry)
- result = {"2": [(15, 40, 25)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+ result = commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12)
+ self.assertEquals(15, result["2"][0].start)
+ self.assertEquals(40, result["2"][0].stop)
+ self.assertEquals(25, result["2"][0].length)
self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
regionList = [regionEntry]
- result = {"1": [(10, 20, 10)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
- result = {"1": [(10, 20, 10, 3, 40)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
- result = {"1": [("1", 10, 20, 10, 3, 40)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+ result = commoncode.getMergedRegionsFromList(regionList)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+ self.assertEquals("1", result["1"][0].label)
+
regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
regionList.append(regionEntry)
- result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+ self.assertEquals("1", result["1"][0].label)
+ self.assertEquals(15, result["2"][0].start)
+ self.assertEquals(40, result["2"][0].stop)
+ self.assertEquals(25, result["2"][0].length)
+ self.assertEquals(32, result["2"][0].peakPos)
+ self.assertEquals(17, result["2"][0].peakHeight)
+ self.assertEquals("2", result["2"][0].label)
+
regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
regionList.append(regionEntry)
- result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(40, result["1"][0].stop)
+ self.assertEquals(30, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+ self.assertEquals("3", result["1"][0].label)
+ self.assertEquals(15, result["2"][0].start)
+ self.assertEquals(40, result["2"][0].stop)
+ self.assertEquals(25, result["2"][0].length)
+ self.assertEquals(32, result["2"][0].peakPos)
+ self.assertEquals(17, result["2"][0].peakHeight)
+ self.assertEquals("2", result["2"][0].label)
+
regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
regionList.append(regionEntry)
- result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
- result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)],
- "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)]
- }
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False))
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(40, result["1"][0].stop)
+ self.assertEquals(30, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+ self.assertEquals("3", result["1"][0].label)
+ self.assertEquals(15, result["2"][0].start)
+ self.assertEquals(88, result["2"][0].stop)
+ self.assertEquals(73, result["2"][0].length)
+ self.assertEquals(32, result["2"][0].peakPos)
+ self.assertEquals(17, result["2"][0].peakHeight)
+ self.assertEquals("4", result["2"][0].label)
+
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+ self.assertEquals("1", result["1"][0].label)
+ self.assertEquals(15, result["1"][1].start)
+ self.assertEquals(40, result["1"][1].stop)
+ self.assertEquals(25, result["1"][1].length)
+ self.assertEquals(32, result["1"][1].peakPos)
+ self.assertEquals(17, result["1"][1].peakHeight)
+ self.assertEquals("3", result["1"][1].label)
+ self.assertEquals(15, result["2"][0].start)
+ self.assertEquals(40, result["2"][0].stop)
+ self.assertEquals(25, result["2"][0].length)
+ self.assertEquals(32, result["2"][0].peakPos)
+ self.assertEquals(17, result["2"][0].peakHeight)
+ self.assertEquals("2", result["2"][0].label)
+ self.assertEquals(65, result["2"][1].start)
+ self.assertEquals(88, result["2"][1].stop)
+ self.assertEquals(23, result["2"][1].length)
+ self.assertEquals(72, result["2"][1].peakPos)
+ self.assertEquals(7, result["2"][1].peakHeight)
+ self.assertEquals("4", result["2"][1].label)
regionList = ["# comment"]
regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
regionList.append(regionEntry)
- result = {"1": [(10, 20, 10, 3, 40)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
regionList = ["# pvalue"]
regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
regionList.append(regionEntry)
- result = {"1": [(10, 20, 10, 3, 40)]}
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
regionList = ["# readShift"]
regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
regionList.append(regionEntry)
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
regionList = ["# pvalue readShift"]
regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
regionList.append(regionEntry)
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
- self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
+ result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+ self.assertEquals(10, result["1"][0].start)
+ self.assertEquals(20, result["1"][0].stop)
+ self.assertEquals(10, result["1"][0].length)
+ self.assertEquals(3, result["1"][0].peakPos)
+ self.assertEquals(40, result["1"][0].peakHeight)
+
#Test fails - the header line is required if there are fields after the peak which isn't so good
#self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
#TODO: write test
def testFindPeak(self):
hitList = []
- result = ([], 0.0, array("f"), 0.0)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
+ result = commoncode.findPeak(hitList, 0, 0)
+ self.assertEquals([], result.topPos)
+ self.assertEquals(0.0, result.numHits)
+ self.assertEquals(array("f"), result.smoothArray)
+ self.assertEquals(0.0, result.numPlus)
hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
- result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
- result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True))
- result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto"))
- result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
+ result = commoncode.findPeak(hitList, 0, 10)
+ self.assertEquals([6, 7], result.topPos)
+ self.assertEquals(1.0, result.numHits)
+ self.assertEquals(array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+ self.assertEquals(1.0, result.numPlus)
+
+ result = commoncode.findPeak(hitList, 0, 10, doWeight=True)
+ self.assertEquals([6, 7], result.topPos)
+ self.assertEquals(0.5, result.numHits)
+ self.assertEquals(array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), result.smoothArray)
+ self.assertEquals(0.5, result.numPlus)
+
+ result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+ self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+ self.assertEquals(0.0, result.numHits)
+ self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+ self.assertEquals(0.0, result.numPlus)
+        self.assertEquals(6, result.shift)
hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
- result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
+ result = commoncode.findPeak(hitList, 0, 10, shift=3)
+ self.assertEquals([7], result.topPos)
+ self.assertEquals(1.0, result.numHits)
+ self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+ self.assertEquals(1.0, result.numPlus)
+ self.assertEquals(3, result.shift)
hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
- result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
- result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
- self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True))
+ result = commoncode.findPeak(hitList, 0, 10, leftPlus=True)
+ self.assertEquals([6, 7], result.topPos)
+ self.assertEquals(1.0, result.numHits)
+ self.assertEquals(array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+ self.assertEquals(1.0, result.numPlus)
+ self.assertEquals(1.0, result.numLeftPlus)
+
+ result = commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3)
+ self.assertEquals([7], result.topPos)
+ self.assertEquals(1.0, result.numHits)
+ self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+ self.assertEquals(1.0, result.numPlus)
+ self.assertEquals(1.0, result.numLeftPlus)
+ self.assertEquals(3, result.shift)
#TODO: write test
}
self.assertEquals(result, complementDict)
- regionDict = {"I": [("new feature", 100, 150, 50)]}
+ additionalRegion = Region.Region(100, 150)
+ additionalRegion.label = "new feature"
+ regionDict = {"I": [additionalRegion]}
featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
result = (100, 150, "new feature", "+", "custom")
self.assertEquals(result, featureDict["I"][0])
+
def testGetLocusByChromDict(self):
self.assertTrue(chrom in self.celegansChroms)
self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
- regionDict = {"I": [("new region", 100, 150, 50)]}
+ additionalRegion = Region.Region(100, 150)
+ additionalRegion.label = "new region"
+ regionDict = {"I": [additionalRegion]}
locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
import sys
import unittest
import testAnalyzeGO
+import testBedToRegion
+import testBinsToCdf
+import testCdfDist
import testChksnp
+import testColsum
import testCommoncode
+import testFeatureIntersects
import testGeneMrnaCounts
import testGeneMrnaCountsWeighted
#import testGetFasta
import testmakebedfromrds
#import testMakeGraphs
import testMakeRdsFromBam
+import testMakeSiteTrack
import testMakeSNPTrack
import testMarkLinkers
import testPeak
suite = unittest.TestSuite()
suite.addTest(testAnalyzeGO.suite())
+ suite.addTest(testBedToRegion.suite())
+ suite.addTest(testBinsToCdf.suite())
+ suite.addTest(testCdfDist.suite())
suite.addTest(testChksnp.suite())
+ suite.addTest(testColsum.suite())
suite.addTest(testCommoncode.suite())
+ suite.addTest(testFeatureIntersects.suite())
suite.addTest(testGeneMrnaCounts.suite())
suite.addTest(testGeneMrnaCountsWeighted.suite())
#suite.addTest(testGetFasta.suite())
suite.addTest(testmakebedfromrds.suite())
#suite.addTest(testMakeGraphs.suite())
suite.addTest(testMakeRdsFromBam.suite())
+ suite.addTest(testMakeSiteTrack.suite())
suite.addTest(testMakeSNPTrack.suite())
suite.addTest(testMarkLinkers.suite())
suite.addTest(testPeak.suite())
--- /dev/null
+
+import unittest
+import os
+from erange import featureIntersects
+
+
+class TestFeatureIntersects(unittest.TestCase):
+
+ testInputFile = "erangeTestFeatureIntersects"
+
+
+ def setUp(self):
+ cdffile = open(self.testInputFile, "w")
+ cdffile.write("line1\tchr1\t30\t60\n")
+ cdffile.write("line2\tchr1\t90\t99\n")
+ cdffile.write("line3\tchr1\t5\t80\n")
+ cdffile.write("line4\tchr2\t10\t14\n")
+ cdffile.write("line4\tnot to be processed\n")
+ cdffile.write("line5\tchr2\t10\t14\n")
+ cdffile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.testInputFile)
+ except OSError:
+ print "position file does not exist"
+
+
+ #TODO: write test
+ def testFeatureIntersects(self):
+ pass
+
+
+ #TODO: write test
+ def testGetPositionList(self):
+ pass
+
+
+ #TODO: write test
+ def testMain(self):
+        argv = ["featureIntersects"]
+ self.assertRaises(SystemExit, featureIntersects.main, argv)
+        argv = ["featureIntersects", self.testInputFile]
+ featureIntersects.main(argv)
+
+
+ def testMakeParser(self):
+ parser = featureIntersects.makeParser("")
+ argv = []
+ (options, args) = parser.parse_args(argv)
+ self.assertEqual([], args)
+ argv = [self.testInputFile]
+ (options, args) = parser.parse_args(argv)
+ self.assertEqual(self.testInputFile, args[0])
+ self.assertEqual("TFBSCONSSITES", options.cistype)
+ self.assertEqual(100, options.radius)
+ argv = [self.testInputFile, "--cistype", "test", "--radius", "50"]
+ (options, args) = parser.parse_args(argv)
+ self.assertEqual("test", options.cistype)
+ self.assertEqual(50, options.radius)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestFeatureIntersects))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import makesitetrack
+
+
+class TestMakeSiteTrack(unittest.TestCase):
+
+ testInputFile = "erangeTestSiteFile"
+ testOutputFile = "erangeTestBedFile"
+
+
+ def setUp(self):
+ self.createCompactInputFile()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.testInputFile)
+ except OSError:
+ print "site file does not exist"
+
+ try:
+ os.remove(self.testOutputFile)
+ except OSError:
+ print "bed file does not exist"
+
+
+ def createStandardInputFile(self):
+ siteFile = open(self.testInputFile, "w")
+ siteFile.write("#comment line\n")
+ siteFile.write("name\tchr1\t100\t175\t11\t+\t-\n")
+ siteFile.close()
+
+
+ def createCompactInputFile(self):
+ siteFile = open(self.testInputFile, "w")
+ siteFile.write("#comment line\n")
+ siteFile.write("chr1:100-175\t11\t+\t-\n")
+ siteFile.close()
+
+
+ def testMakeSiteTrack(self):
+ self.createCompactInputFile()
+ makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile)
+ bedfile = open(self.testOutputFile)
+ header = 'track name="" visibility=4 itemRgb="On"\n'
+ self.assertEquals(header, bedfile.readline())
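+        # the bed entry's stop is one base past the site stop (175 -> 176)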
+ result = "chr1\t100\t176\t-1\t11\t+\t-\t-\t0,0,0\n"
+ self.assertEquals(result, bedfile.readline())
+
+ makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, append=True, noHeader=True)
+ bedfile = open(self.testOutputFile)
+ header = bedfile.readline()
+ firstEntry = bedfile.readline()
+ self.assertEquals(firstEntry, bedfile.readline())
+
+
+ def testNonCompactMakeSiteTrack(self):
+ try:
+ os.remove(self.testInputFile)
+ except OSError:
+ print "site file does not exist"
+
+ self.createStandardInputFile()
+ makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, compact=False, color="255,255,255")
+ bedfile = open(self.testOutputFile)
+ bedfile.readline()
+ result = "chr1\t100\t176\t-1\t1.0\t+\t-\t-\t255,255,255\n"
+ self.assertEquals(result, bedfile.readline())
+
+
+ def testMain(self):
+ self.createCompactInputFile()
+ argv = ["makesitetrack"]
+ self.assertRaises(SystemExit, makesitetrack.main, argv)
+ argv = ["makesitetrack", self.testInputFile, self.testOutputFile]
+ makesitetrack.main(argv)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestMakeSiteTrack))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
chrom = "chr%s" % multiReadList[index][1]
reweighList.append((round(score,3), chrom, start, theID))
- #TODO: Is this right? If match index is 0 are we doing nothing?
if theMatch > 0:
RDS.reweighMultireads(reweighList)
fixedPair += 1
if __name__ == "__main__":
- main(sys.argv)
\ No newline at end of file
+ main(sys.argv)