From: Sean Upchurch Date: Fri, 7 Jan 2011 18:05:34 +0000 (-0800) Subject: Release version for Erange 4.0a X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=commitdiff_plain;h=77dccd7c98d8cdb60caaf178b1123df71ea662c9 Release version for Erange 4.0a --- diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py index bfb48fd..730c20b 100644 --- a/MakeBamFromRds.py +++ b/MakeBamFromRds.py @@ -22,13 +22,13 @@ import pysam import ReadDataset from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption +VERSION = "1.0" def main(argv=None): if not argv: argv = sys.argv - verstring = "makeBamFromRds: version 1.0" - print verstring + print "makeBamFromRds: version %s" % VERSION doPairs = False @@ -134,7 +134,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True, for chrom in chromRemoveList: chromList.remove(chrom) - header = {"HD": {"VN": "1.0"}} + header = {"HD": {"VN": VERSION}} if referenceSequenceList: header["SQ"] = referenceSequenceList diff --git a/ReadDataset.py b/ReadDataset.py index 5ff60e2..56e28a8 100644 --- a/ReadDataset.py +++ b/ReadDataset.py @@ -153,42 +153,42 @@ class ReadDataset(): self.cachedDB = "" - def attachDB(self, filename, asname): + def attachDB(self, filename, dbName): """ attach another database file to the readDataset. """ - stmt = "attach '%s' as %s" % (filename, asname) + stmt = "attach '%s' as %s" % (filename, dbName) self.execute(stmt) - def detachDB(self, asname): + def detachDB(self, dbName): """ detach a database file to the readDataset. """ - stmt = "detach %s" % (asname) + stmt = "detach %s" % (dbName) self.execute(stmt) - def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""): + def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""): """ import into current RDS the table (with columns destcolumns, with default all columns) from the database file asname, using the column specification of ascolumns (default all). 
""" - stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table) + stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table) if flagged != "": stmt += " where flag = '%s' " % flagged self.executeCommit(stmt) - def getTables(self, asname=""): + def getTables(self, dbName=""): """ get a list of table names in a particular database file. """ resultList = [] sql = self.getSqlCursor() - if asname != "": - asname += "." + if dbName != "": + dbName = "%s." % dbName - stmt = "select name from %ssqlite_master where type='table'" % asname + stmt = "select name from %ssqlite_master where type='table'" % dbName sql.execute(stmt) results = sql.fetchall() @@ -291,11 +291,15 @@ class ReadDataset(): if "readsize" not in metadata: raise ReadDatasetError("no readsize parameter defined") else: - mysize = metadata["readsize"] - if "import" in mysize: - mysize = mysize.split()[0] + readSize = metadata["readsize"] + if "import" in readSize: + readSize = readSize.split()[0] - return int(mysize) + readSize = int(readSize) + if readSize < 0: + raise ReadDatasetError("readsize is negative") + + return readSize def getDefaultCacheSize(self): @@ -317,11 +321,9 @@ class ReadDataset(): if row["chrom"] not in results: results.append(row["chrom"]) else: - if len(row["chrom"][3:].strip()) < 1: - continue - - if row["chrom"][3:] not in results: - results.append(row["chrom"][3:]) + shortName = row["chrom"][3:] + if len(shortName.strip()) > 0 and shortName not in results: + results.append(shortName) results.sort() @@ -333,32 +335,17 @@ class ReadDataset(): """ returns the maximum coordinate for reads on a given chromosome. 
""" maxCoord = 0 - sql = self.getSqlCursor() if doUniqs: - try: - sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom) - maxCoord = int(sql.fetchall()[0][0]) - except: - print "couldn't retrieve coordMax for chromosome %s" % chrom + maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs") if doSplices: - sql.execute("select max(startR) from splices where chrom = '%s'" % chrom) - try: - spliceMax = int(sql.fetchall()[0][0]) - if spliceMax > maxCoord: - maxCoord = spliceMax - except: - pass + spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR") + maxCoord = max(spliceMax, maxCoord) if doMulti: - sql.execute("select max(start) from multi where chrom = '%s'" % chrom) - try: - multiMax = int(sql.fetchall()[0][0]) - if multiMax > maxCoord: - maxCoord = multiMax - except: - pass + multiMax = self.getMaxStartCoordinateInTable(chrom, "multi") + maxCoord = max(multiMax, maxCoord) if verbose: print "%s maxCoord: %d" % (chrom, maxCoord) @@ -366,6 +353,19 @@ class ReadDataset(): return maxCoord + def getMaxStartCoordinateInTable(self, chrom, table, startField="start"): + maxCoord = 0 + sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom) + sql = self.getSqlCursor() + try: + sql.execute(sqlStatement) + maxCoord = int(sql.fetchall()[0][0]) + except: + print "couldn't retrieve coordMax for chromosome %s" % chrom + + return maxCoord + + def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="", flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False, withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False, @@ -378,67 +378,14 @@ class ReadDataset(): Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict """ - whereClause = [] - resultsDict = {} - - if chrom != "" and chrom != self.memChrom: - whereClause.append("chrom = '%s'" % chrom) - - if flag != "": - if flagLike: - flagLikeClause = 
string.join(['flag LIKE "%', flag, '%"'], "") - whereClause.append(flagLikeClause) - else: - whereClause.append("flag = '%s'" % flag) - - if start > -1: - whereClause.append("start > %d" % start) - - if stop > -1: - whereClause.append("stop < %d" % stop) - - if len(readLike) > 0: - readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") - whereClause.append(readIDClause) - - if hasMismatch: - whereClause.append("mismatch != ''") - - if strand in ["+", "-"]: - whereClause.append("sense = '%s'" % strand) - - if len(whereClause) > 0: - whereStatement = string.join(whereClause, " and ") - whereQuery = "where %s" % whereStatement - else: - whereQuery = "" - groupBy = [] + whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike) if findallOptimize: - selectClause = ["select start, sense, sum(weight)"] - groupBy = ["GROUP BY start, sense"] + selectQuery = "select start, sense, sum(weight)" else: - selectClause = ["select ID, chrom, start, readID"] - if bothEnds: - selectClause.append("stop") - - if not noSense: - selectClause.append("sense") - - if withWeight: - selectClause.append("weight") - - if withFlag: - selectClause.append("flag") + selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds) - if withMismatch: - selectClause.append("mismatch") - - if limit > 0 and not combine5p: - groupBy.append("LIMIT %d" % limit) - - selectQuery = string.join(selectClause, ",") - groupQuery = string.join(groupBy) + groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p) if doUniqs: stmt = [selectQuery, "from uniqs", whereQuery, groupQuery] if doMulti: @@ -498,6 +445,7 @@ class ReadDataset(): sqlQuery = string.join(stmt) sql.execute(sqlQuery) + resultsDict = {} if findallOptimize: resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql] if self.memBacked: @@ -562,20 +510,17 @@ class ReadDataset(): 
return resultsDict - def getSplicesDict(self, noSense=False, fullChrom=False, chrom="", - flag="", withWeight=False, withFlag=False, withMismatch=False, - withID=False, withChrom=False, withPairID=False, readIDDict=False, - splitRead=False, hasMismatch=False, flagLike=False, start=-1, - stop=-1, strand=""): - """ returns a dictionary of spliced reads in a variety of - formats and which can be restricted by chromosome or custom-flag. - Returns unique spliced reads for now. - """ - whereClause = [] - resultsDict = {} + def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False): + if splice: + startText = "startL" + stopText = "stopR" + else: + startText = "start" + stopText = "stop" + whereClause = [] if chrom != "" and chrom != self.memChrom: - whereClause = ["chrom = '%s'" % chrom] + whereClause.append("chrom = '%s'" % chrom) if flag != "": if flagLike: @@ -584,25 +529,37 @@ class ReadDataset(): else: whereClause.append("flag = '%s'" % flag) + if start > -1: + whereClause.append("%s > %d" % (startText, start)) + + if stop > -1: + whereClause.append("%s < %d" % (stopText, stop)) + + if len(readLike) > 0: + readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") + whereClause.append(readIDClause) + if hasMismatch: whereClause.append("mismatch != ''") - if strand != "": + if strand in ["+", "-"]: whereClause.append("sense = '%s'" % strand) - if start > -1: - whereClause.append("startL > %d" % start) - - if stop > -1: - whereClause.append("stopR < %d" % stop) - if len(whereClause) > 0: whereStatement = string.join(whereClause, " and ") whereQuery = "where %s" % whereStatement else: whereQuery = "" - selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"] + return whereQuery + + + def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False): + + selectClause = [baseSelect] + if bothEnds: + selectClause.append("stop") + if not noSense: 
selectClause.append("sense") @@ -615,7 +572,36 @@ class ReadDataset(): if withMismatch: selectClause.append("mismatch") - selectQuery = string.join(selectClause, " ,") + selectQuery = string.join(selectClause, ",") + + return selectQuery + + + def getReadGroupQuery(self, findallOptimize, limit, combine5p): + groupBy = [] + if findallOptimize: + groupBy = ["GROUP BY start, sense"] + + if limit > 0 and not combine5p: + groupBy.append("LIMIT %d" % limit) + + groupQuery = string.join(groupBy) + + return groupQuery + + + def getSplicesDict(self, noSense=False, fullChrom=False, chrom="", + flag="", withWeight=False, withFlag=False, withMismatch=False, + withID=False, withChrom=False, withPairID=False, readIDDict=False, + splitRead=False, hasMismatch=False, flagLike=False, start=-1, + stop=-1, strand=""): + """ returns a dictionary of spliced reads in a variety of + formats and which can be restricted by chromosome or custom-flag. + Returns unique spliced reads for now. + """ + whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True) + selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID" + selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch) if self.memBacked: sql = self.memcon.cursor() else: @@ -625,6 +611,7 @@ class ReadDataset(): sql.execute(stmt) currentReadID = "" currentChrom = "" + resultsDict = {} for row in sql: pairID = 0 readID = row["readID"] @@ -796,10 +783,6 @@ class ReadDataset(): """ get readID's. 
""" stmt = [] - limitPart = "" - if limit > 0: - limitPart = "LIMIT %d" % limit - if uniqs: stmt.append("select readID from uniqs") @@ -814,6 +797,10 @@ class ReadDataset(): else: selectPart = "" + limitPart = "" + if limit > 0: + limitPart = "LIMIT %d" % limit + sqlQuery = "%s group by readID %s" % (selectPart, limitPart) if self.memBacked: sql = self.memcon.cursor() @@ -845,12 +832,11 @@ class ReadDataset(): print "getting mismatches from chromosome %s" % (achrom) snpDict[achrom] = [] - hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True) if useSplices and self.dataType == "RNA": spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True) spliceIDList = spliceDict.keys() - for k in spliceIDList: - spliceEntry = spliceDict[k][0] + for spliceID in spliceIDList: + spliceEntry = spliceDict[spliceID][0] startpos = spliceEntry["startL"] lefthalf = spliceEntry["stopL"] rightstart = spliceEntry["startR"] @@ -881,6 +867,7 @@ class ReadDataset(): snpDict[achrom].append([startpos, change_at, change_base, change_from]) + hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True) if achrom not in hitDict.keys(): continue @@ -910,7 +897,7 @@ class ReadDataset(): def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True, - useSplices=False, normalizationFactor = 1.0, trackStrand=False, + useSplices=False, normalizationFactor=1.0, trackStrand=False, keepStrand="both", shiftValue=0): """return a profile of the chromosome as an array of per-base read coverage.... keepStrand = 'both', 'plusOnly', or 'minusOnly'. @@ -925,8 +912,8 @@ class ReadDataset(): dataType = metadata["dataType"] scale = 1. 
/ normalizationFactor shift = {} - shift['+'] = int(shiftValue) - shift['-'] = -1 * int(shiftValue) + shift["+"] = int(shiftValue) + shift["-"] = -1 * int(shiftValue) if cstop > 0: lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen diff --git a/altSpliceCounts.py b/altSpliceCounts.py index 12077c1..c59dc92 100755 --- a/altSpliceCounts.py +++ b/altSpliceCounts.py @@ -2,7 +2,7 @@ try: import psyco psyco.full() except: - print 'psyco not running' + print "psyco not running" print "altSpliceCounts: version 3.7" @@ -56,13 +56,13 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000): stopDict = {} resultDict = {} - hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) if cachePages > hitRDS.getDefaultCacheSize(): hitRDS.setDBcache(cachePages) readlen = hitRDS.getReadSize() hitDict = hitRDS.getSplicesDict(noSense=True) - outfile = open(outfilename,'w') + outfile = open(outfilename, "w") for chrom in hitDict: startDict[chrom] = [] @@ -155,7 +155,7 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000): resultDict[chrom].sort() for line in resultDict[chrom]: - outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n' % line) + outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n" % line) alternative += 1 print chrom, maxIndex, spliceEvent, altSpliceEvent diff --git a/bedtoregion.py b/bedtoregion.py index 3bcd554..a6a344d 100755 --- a/bedtoregion.py +++ b/bedtoregion.py @@ -21,14 +21,16 @@ def main(argv=None): def bedToRegion(factor, infilename, outfilename): index = 1 infile = open(infilename) - outfile = open(outfilename, 'w') + outfile = open(outfilename, "w") for line in infile: - if 'track' in line: + if "track" in line: continue + fields = line.split() - line = string.join(fields, '\t') - outfile.write('%s%d\t%s\n' % (factor, index, line)) + line = 
string.join(fields, "\t") + outfile.write("%s%d\t%s\n" % (factor, index, line)) index += 1 + infile.close() outfile.close() diff --git a/binstocdf.py b/binstocdf.py index 63aa955..27b48e2 100755 --- a/binstocdf.py +++ b/binstocdf.py @@ -1,4 +1,5 @@ import sys +import string print "binstocdf: version 1.1" @@ -6,19 +7,19 @@ def main(argv=None): if not argv: argv = sys.argv - if len(argv) < 2: - print 'usage: python %s infile outfile' % sys.argv[0] + if len(argv) < 3: + print "usage: python %s infile outfile" % sys.argv[0] sys.exit(1) - infilename = argv[0] - outfilename = argv[1] + infilename = argv[1] + outfilename = argv[2] binToCDF(infilename, outfilename) def binToCDF(infilename, outfilename): infile = open(infilename) - outfile = open(outfilename, 'w') + outfile = open(outfilename, "w") for line in infile: fields = line.strip().split() @@ -30,14 +31,15 @@ def binToCDF(infilename, outfilename): outfile.write(line) continue - outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3])) - cum = 0 + outputFields = fields[:4] + runningTotal = 0 for bin in fields[4:]: - cum += int(bin) - percent = 100 * cum / total - outfile.write('\t%d' % percent) + runningTotal += int(bin) + percent = 100 * runningTotal / total + outputFields.append("%d" % percent) - outfile.write('\n') + outputLine = string.join(outputFields, "\t") + outfile.write("%s\n" % outputLine) infile.close() outfile.close() diff --git a/buildMatrix.py b/buildMatrix.py index c7b6dd0..a49120c 100755 --- a/buildMatrix.py +++ b/buildMatrix.py @@ -78,7 +78,7 @@ def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM, if header.strip() == "": header = "#\t" - outfile.write( "%s\t%s\n" % (header, colID)) + outfile.write("%s\t%s\n" % (header, colID)) values = [] min = 20000000000. 
diff --git a/cdfdist.py b/cdfdist.py index 7166244..52963b7 100755 --- a/cdfdist.py +++ b/cdfdist.py @@ -8,11 +8,15 @@ def main(argv=None): print "usage: python %s bins percent infile" % sys.argv[0] sys.exit(1) - bins = int(argv[0]) - percent = int(argv[1]) - infilename = argv[2] + bins = int(argv[1]) + percent = int(argv[2]) + infilename = argv[3] - cdfDist(bins, percent, infilename) + printCdfDist(bins, percent, infilename) + + +def printCdfDist(bins, percent, infilename): + print cdfDist(bins, percent, infilename) def cdfDist(bins, percent, infilename): @@ -30,7 +34,7 @@ def cdfDist(bins, percent, infilename): index += 1 infile.close() - print binsList + return binsList if __name__ == "__main__": diff --git a/colsum.py b/colsum.py index f6d1ff9..643aaea 100755 --- a/colsum.py +++ b/colsum.py @@ -28,7 +28,7 @@ def colsum(fieldID, filename): count += float(fields[fieldID]) else: count += int(fields[fieldID]) - except ValueError: + except (ValueError, IndexError): pass infile.close() diff --git a/combineRPKMs.py b/combineRPKMs.py index ead4e1b..77f30d6 100755 --- a/combineRPKMs.py +++ b/combineRPKMs.py @@ -51,23 +51,8 @@ def makeParser(usage=""): def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False): - firstDict = {} - firstfile = open(firstfileName) - for line in firstfile: - fields = line.strip().split() - firstDict[fields[1]] = fields[-1] - - firstfile.close() - - expandedDict = {} - gidDict = {} - expandedfile = open(expandedfileName) - for line in expandedfile: - fields = line.strip().split() - expandedDict[fields[1]] = fields[-1] - gidDict[fields[1]] = fields[0] - - expandedfile.close() + firstDict = getRPKMDict(firstfileName) + gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True) if doFraction: header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n" @@ -97,5 +82,23 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do outfile.close() +def 
getRPKMDict(rpkmFileName, getGIDDict=False): + gidDict = {} + rpkmDict = {} + rpkmFile = open(rpkmFileName) + for line in rpkmFile: + fields = line.strip().split() + rpkmDict[fields[1]] = fields[-1] + if getGIDDict: + gidDict[fields[1]] = fields[0] + + rpkmFile.close() + + if getGIDDict: + return gidDict, rpkmDict + else: + return rpkmDict + + if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/combinerds.py b/combinerds.py index 4c826b0..2878423 100755 --- a/combinerds.py +++ b/combinerds.py @@ -10,7 +10,9 @@ except: pass import sys +import optparse import ReadDataset +from commoncode import getConfigParser, getConfigOption, getConfigBoolOption print "combinerds: version 1.2" @@ -19,97 +21,107 @@ def main(argv=None): if not argv: argv = sys.argv - if len(argv) < 2: - print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0] - #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n' + usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] 
[-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0] + parser = makeParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage sys.exit(1) - doCache = False - cachePages = -1 - if '--cache' in argv: - doCache = True - try: - cachePages = int(argv[sys.argv.index('-cache') + 1]) - except: - pass - - datafile = argv[1] - infileList = [] - for index in range(2, len(argv)): - if argv[index][0] == '-': - break - infileList.append(sys.argv[index]) + datafile = args[0] + infileList = args[1:] - print "destination RDS: %s" % datafile + combinerds(datafile, infileList, options.tableList, options.withFlag, options.doIndex, options.cachePages, options.doInit, options.initRNA) - if '--initrna' in argv: - rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA') - elif '--init' in argv: - rds = ReadDataset.ReadDataset(datafile, initialize=True) - withFlag = '' - if '--flag' in argv: - withFlag = argv[sys.argv.index('-flag') + 1] - print "restrict to flag = %s" % withFlag +def makeParser(): + usage = __doc__ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--table", action="append", dest="tablelist") + parser.add_option("--init", action="store_true", dest="doInit") + parser.add_option("--initrna", action="store_true", dest="initRNA") + parser.add_option("--index", action="store_true", dest="doIndex") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--flag", dest="withFlag") + + configParser = getConfigParser() + section = "combinerds" + doInit = getConfigBoolOption(configParser, section, "doInit", False) + initRNA = getConfigBoolOption(configParser, section, "initRNA", False) + doIndex = getConfigBoolOption(configParser, section, "doIndex", False) + cachePages = getConfigOption(configParser, section, "cachePages", None) + withFlag = getConfigOption(configParser, section, "withFlag", "") + + parser.set_defaults(tableList=[], doInit=doInit, 
initRNA=initRNA, doIndex=doIndex, cachePages=cachePages, + withFlag=withFlag) + + return parser + - rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache) +def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, cachePages=None, doInit=False, initRNA=False): + print "destination RDS: %s" % datafile + datasetType="DNA" + if initRNA: + doInit = True + datasetType="RNA" + + doCache = False + if cachePages is not None: + doCache = True + else: + cachePages = -1 + + rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache, initialize=doInit, datasetType=datasetType) if cachePages > rds.getDefaultCacheSize(): rds.setDBcache(cachePages) - cacheVal = cachePages else: - cacheVal = rds.getDefaultCacheSize() - - doIndex = False - if '--index' in argv: - doIndex = True + cachePages = rds.getDefaultCacheSize() - tableList = [] - if '--table' in argv: - tableList.append(argv[argv.index('-table') + 1]) - else: + if tableList == []: tableList = rds.getTables() - combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache) - + if withFlag != "": + print "restrict to flag = %s" % withFlag -def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False): metaDict = rds.getMetadata() if "numberImports" not in metaDict: origIndex = 0 - rds.insertMetadata([("numberImports", str(0))]) + rds.insertMetadata([("numberImports", "0")]) else: origIndex = int(metaDict["numberImports"]) index = origIndex for inputfile in infileList: - asname = "input" + str(index) - rds.attachDB(inputfile,asname) + dbName = "input%s" % str(index) + rds.attachDB(inputfile, dbName) for table in tableList: print "importing table %s from file %s" % (table, inputfile) - ascols = "*" + dbColumns = "*" if table == "uniqs": - ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname + dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" 
% dbName elif table == "multi": - ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname + dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName elif table == "splices": - ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname - elif table == "metadata": - ascols = "name, value || ' (import_%d)'" % index - rds.importFromDB(asname, table, ascols) + dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName - if table != "metadata": - rds.importFromDB(asname, table, ascols, withFlag) + if table == "metadata": + dbColumns = "name, value || ' (import_%d)'" % index + rds.importFromDB(dbName, table, dbColumns) + else: + rds.importFromDB(dbName, table, dbColumns, withFlag) - rds.detachDB(asname) - rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))]) + rds.detachDB(dbName) + rds.insertMetadata([("import_%s" % str(index), "%s %s" % (inputfile, str(tableList)))]) index += 1 rds.updateMetadata("numberImports", index, origIndex) if doIndex: print "building index...." 
- if cacheVal > 0: - rds.buildIndex(cacheVal) + if cachePages > 0: + rds.buildIndex(cachePages) else: rds.buildIndex() diff --git a/commoncode.py b/commoncode.py index 821936a..f526daa 100755 --- a/commoncode.py +++ b/commoncode.py @@ -17,6 +17,9 @@ import Region commoncodeVersion = 5.6 currentRDSversion = 2.0 +class ErangeError(Exception): + pass + def getReverseComplement(base): revComp = {"A": "T", @@ -65,12 +68,12 @@ def getGeneAnnotDict(genome, inRAM=False): return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM) -def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False): - hg = Genome(genome, inRAM=inRAM) +def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False): + genome = Genome(genomeName, inRAM=inRAM) if extendGenome != "": - hg.extendFeatures(extendGenome, replace=replaceModels) + genome.extendFeatures(extendGenome, replace=replaceModels) - geneannotDict = hg.allAnnotInfo() + geneannotDict = genome.allAnnotInfo() return geneannotDict @@ -189,11 +192,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, mergeCount = 0 chromField = int(chromField) count = 0 - #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it - # exits at the first merge. This is not a problem when the input is sorted by start position, but in - # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there - # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will - # be merged with A but that will exit the loop and the merge with C will be missed. 
for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -336,7 +334,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, """ if shift == "auto": - shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift) + shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift) seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus) @@ -346,7 +344,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0 topPos = getPeakPositionList(smoothArray, length) - peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift) + peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift) if leftPlus: numLeftPlus = 0 @@ -366,12 +364,12 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, return peak -def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): +def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75): bestShift = 0 lowestScore = 20000000000 for testShift in xrange(maxShift + 1): shiftArray = array("f", [0.] 
* length) - for read in hitList: + for read in readList: currentpos = read["start"] - start if read["sense"] == "+": currentpos += testShift @@ -381,7 +379,7 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): if (currentpos < 1) or (currentpos >= length): continue - if doWeight: + if useWeight: weight = read["weight"] else: weight = 1.0 @@ -432,7 +430,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left hitIndex += 1 currentpos += 1 - while hitIndex < readlen and currentpos < length: + while hitIndex < readlen and currentpos < length: seqArray[currentpos] += weight hitIndex += 1 currentpos += 1 @@ -450,7 +448,7 @@ def getPeakPositionList(smoothArray, length): if topNucleotide < smoothArray[currentpos]: topNucleotide = smoothArray[currentpos] peakList = [currentpos] - elif topNucleotide == smoothArray[currentpos]: + elif topNucleotide == smoothArray[currentpos]: peakList.append(currentpos) return peakList @@ -562,7 +560,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= return featuresByChromDict -def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, +def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False, lengthCDS=0, keepSense=False, adjustToNeighbor=True): """ return a dictionary of gene loci. 
Can be used to retrieve additional @@ -594,27 +592,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" return locusByChromDict - genome = genomeObject.genome - featuresDict = genomeObject.getallGeneFeatures() - if len(additionalRegionsDict) > 0: - sortList = [] - for chrom in additionalRegionsDict: - for region in additionalRegionsDict[chrom]: - label = region.label - if label not in sortList: - sortList.append(label) - - if label not in featuresDict: - featuresDict[label] = [] - sense = "+" - else: - sense = featuresDict[label][0][-1] - - featuresDict[label].append(("custom", chrom, region.start, region.stop, sense)) - - for gid in sortList: - featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) - + genomeName = genome.genome + featuresDict = getGeneFeatures(genome, additionalRegionsDict) for gid in featuresDict: featureList = featuresDict[gid] newFeatureList = [] @@ -647,34 +626,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, distance = upstream if adjustToNeighbor: - nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2) + nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2) if nextGene < distance * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstart -= distance if downstream > 0: distance = downstream if adjustToNeighbor: - nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2) + nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2) if nextGene < downstream * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstop += distance - if lengthCDS > 0: - if lengthCDS < glen: - gstop = newFeatureList[0][0] + lengthCDS + if 0 < lengthCDS < glen: + gstop = newFeatureList[0][0] + lengthCDS - if lengthCDS < 0: - if abs(lengthCDS) < glen: - 
gstart = newFeatureList[-1][1] + lengthCDS + if lengthCDS < 0 and abs(lengthCDS) < glen: + gstart = newFeatureList[-1][1] + lengthCDS else: if not useCDS and upstream > 0: if upstreamSpanTSS: @@ -692,36 +665,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, distance = upstream if adjustToNeighbor: - nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2) + nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2) if nextGene < distance * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstop += distance if downstream > 0: distance = downstream if adjustToNeighbor: - nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2) + nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2) if nextGene < downstream * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstart -= distance - if lengthCDS > 0: - if lengthCDS < glen: - gstart = newFeatureList[-1][-1] - lengthCDS + if 0 < lengthCDS < glen: + gstart = newFeatureList[-1][-1] - lengthCDS - if lengthCDS < 0: - if abs(lengthCDS) < glen: - gstop = newFeatureList[0][0] - lengthCDS + if lengthCDS < 0 and abs(lengthCDS) < glen: + gstop = newFeatureList[0][0] - lengthCDS - glen = abs(gstop - gstart) if chrom not in locusByChromDict: locusByChromDict[chrom] = [] @@ -736,6 +702,30 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, return locusByChromDict +def getGeneFeatures(genome, additionalRegionsDict): + featuresDict = genome.getallGeneFeatures() + if len(additionalRegionsDict) > 0: + sortList = [] + for chrom in additionalRegionsDict: + for region in additionalRegionsDict[chrom]: + label = region.label + if label not in sortList: + sortList.append(label) + + if label not in featuresDict: + featuresDict[label] = [] + sense = "+" + else: + sense = featuresDict[label][0][-1] + + featuresDict[label].append(("custom", chrom, 
region.start, region.stop, sense)) + + for gid in sortList: + featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) + + return featuresDict + + def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1, binLength=-1): @@ -798,7 +788,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], rlen = regionTuple[lengthField] try: rsense = regionTuple[senseField] - except: + except IndexError: rsense = "F" if tagStart > stop: diff --git a/distalPairs.py b/distalPairs.py index 5bc2532..be40310 100755 --- a/distalPairs.py +++ b/distalPairs.py @@ -85,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa print time.ctime() - if doSplices: - print "getting splices" - splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True) - print "got splices" - print "getting uniq reads" uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True) print "got uniqs" if doSplices: - for readID in splicesDict: - theRead = splicesDict[readID] - read0 = theRead[0] - del read0[1] - try: - uniqDict[readID].append(read0) - except: - if len(theRead) == 4: - read2 = theRead[2] - del read2[1] - uniqDict[readID] = [read0,read2] + addSplicesToUniqReads(RDS, uniqDict) if doVerbose: print len(uniqDict), time.ctime() outfile = open(outfilename,"w") - diffChrom = 0 distal = 0 total = 0 @@ -132,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa continue else: outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2) - outfile.write(outline + "\n") + print >> outfile, outline if doVerbose: print diffChrom, outline else: dist = abs(start1 - start2) - if minDist < dist < maxDist: distal += 1 outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist) - 
outfile.write(outline + "\n") + print >> outfile, outline if doVerbose: print distal, outline @@ -157,5 +140,22 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa print time.ctime() +def addSplicesToUniqReads(RDS, uniqDict): + print "getting splices" + splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True) + print "got splices" + for readID in splicesDict: + theRead = splicesDict[readID] + read0 = theRead[0] + del read0[1] + try: + uniqDict[readID].append(read0) + except: + if len(theRead) == 4: + read2 = theRead[2] + del read2[1] + uniqDict[readID] = [read0,read2] + + if __name__ == "__main__": main(sys.argv) \ No newline at end of file diff --git a/farPairs.py b/farPairs.py index 00cc918..712e60e 100644 --- a/farPairs.py +++ b/farPairs.py @@ -14,8 +14,9 @@ except: import sys import time import optparse +import string import ReadDataset -from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption +from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList print "farPairs: version 1.4" @@ -38,14 +39,13 @@ def main(argv=None): outfilename = args[1] outbedname = args[2] - farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose, + farPairs(rdsfile, outfilename, outbedname, options.doVerbose, options.cachePages, options.minDist, options.maxDist, options.minCount, options.label) def getParser(usage): parser = optparse.OptionParser(usage=usage) - parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly") parser.add_option("--cache", type="int", dest="cachePages") parser.add_option("--verbose", action="store_true", dest="doVerbose") parser.add_option("--minDist", type="int", dest="minDist") @@ -55,7 +55,6 @@ def getParser(usage): configParser = getConfigParser section = "farPairs" - sameChromOnly = getConfigBoolOption(configParser, section, 
"sameChromOnly", False) doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False) cachePages = getConfigOption(configParser, section, "cachePages", None) minDist = getConfigIntOption(configParser, section, "minDist", 1000) @@ -63,15 +62,33 @@ def getParser(usage): minCount = getConfigIntOption(configParser, section, "minCount", 2) label = getConfigOption(configParser, section, "label", None) - parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages, + parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages, minDist=minDist, maxDist=maxDist, minCount=minCount, label=label) return parser -def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False, +def farPairs(rdsfile, outfilename, outbedname, doVerbose=False, cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None): + flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose) + + outfile = open(outfilename, "w") + for region in flagDict: + regionConnections = countDuplicatesInList(flagDict[region]) + for (connectedRegion, count) in regionConnections: + if count >= minCount: + outline = "%s\t%s\t%d" % (region, connectedRegion, count) + print >> outfile, outline + if doVerbose: + print outline + + outfile.close() + if doVerbose: + print "finished: ", time.ctime() + + +def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose): doCache = False if cachePages is not None: doCache = True @@ -86,11 +103,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa if doVerbose: print time.ctime() - - total = 0 - outfile = open(outfilename, "w") + outbed = open(outbedname, "w") outbed.write('track name="%s distal pairs" color=0,255,0\n' % label) + outbed.close() readlen = RDS.getReadSize() flagDict = {} @@ -98,84 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa if 
doNotProcessChromosome(chromosome): continue - print chromosome - uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True) - if doVerbose: - print len(uniqDict), time.ctime() - - for readID in uniqDict: - readList = uniqDict[readID] - if len(readList) == 2: - total += 1 - start1 = readList[0]["start"] - flag1 = readList[0]["flag"] - start2 = readList[1]["start"] - flag2 = readList[1]["flag"] - - if flag1 != flag2: - dist = abs(start1 - start2) - startList = [start1, start2] - stopList = [start1 + readlen, start2 + readlen] - startList.sort() - stopList.sort() - if flag1 != "" and flag2 != "" and minDist < dist < maxDist: - outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0") - outbed.write(outputLine) - if doVerbose: - print flag1, flag2, dist - - try: - flagDict[flag1].append((flag2, start1, start2)) - except KeyError: - flagDict[flag1] = [(flag2, start1, start2)] - - try: - flagDict[flag2].append((flag1, start1, start2)) - except KeyError: - flagDict[flag2] = [(flag2, start1, start2)] + writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist) print "%d connected regions" % len(flagDict) - for region in flagDict: - flagDict[region].sort() - regionConnections = {} - for (region2, start1, start2) in flagDict[region]: + return flagDict + + +def doNotProcessChromosome(chrom): + return chrom == "chrM" + + +def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist): + outbed = open(outbedname, "a") + print chromosome + uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True) + if doVerbose: + print len(uniqDict), time.ctime() + + for readID in uniqDict: + readList = uniqDict[readID] + if readsAreFarPair(readList, minDist, maxDist): + start1 = readList[0]["start"] + start2 = readList[1]["start"] + startList = [start1, start2] + 
startList.sort() + outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0") + outbed.write(outputLine) + flag1 = readList[0]["flag"] + flag2 = readList[1]["flag"] + if doVerbose: + print flag1, flag2, abs(start1 - start2) + try: - regionConnections[region2] += 1 + flagDict[flag1].append(flag2) except KeyError: - regionConnections[region2] = 1 + flagDict[flag1] = [flag2] - for region2 in regionConnections: - if regionConnections[region2] >= minCount: - outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2])) - if doVerbose: - print "%s\t%s\t%d" % (region, region2, regionConnections[region2]) + try: + flagDict[flag2].append(flag1) + except KeyError: + flagDict[flag2] = [flag1] - outfile.close() outbed.close() - if doVerbose: - print "finished: ", time.ctime() -def doNotProcessChromosome(chrom): - return chrom == "chrM" +def readsAreFarPair(readList, minDist, maxDist): + isFarPair = False + if len(readList) == 2: + flag1 = readList[0]["flag"] + flag2 = readList[1]["flag"] + if flag1 != flag2 and flag1 != "" and flag2 != "": + start1 = readList[0]["start"] + start2 = readList[1]["start"] + dist = abs(start1 - start2) + if minDist < dist < maxDist: + isFarPair = True + return isFarPair -def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense): - readSizes = "%d" % (stopList[0] - startList[0]) - readCoords = "0" + +def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor): + sizes = ["%d" % readlen] + coords = ["0"] leftStart = startList[0] - 1 - rightStop = stopList[-1] + rightStop = startList[-1] for index in range(1, numPieces): - readSizes += ",%d" % (stopList[index] - startList[index] + 1) - readCoords += ",%d" % (startList[index] - startList[0]) + sizes.append("%d" % (readlen + 1)) + coords.append("%d" % (startList[index] - startList[0])) if rsense == "+": - senseCode = plusSense + senseCode = plusSenseColor else: 
- senseCode = minusSense + senseCode = minusSenseColor + readSizes = string.join(sizes, ",") + readCoords = string.join(coords, ",") outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords) + return outline diff --git a/featureIntersects.py b/featureIntersects.py index cd357cc..c24b79e 100755 --- a/featureIntersects.py +++ b/featureIntersects.py @@ -38,7 +38,7 @@ def main(argv=None): def makeParser(usage=""): parser = optparse.OptionParser(usage=usage) - parser.add_option("--cistype", action="store_false", dest="cistype") + parser.add_option("--cistype", dest="cistype") parser.add_option("--radius", type="int", dest="radius") configParser = getConfigParser() @@ -52,10 +52,19 @@ def makeParser(usage=""): def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100): - tabfile = open(tabFileName) - previous = "" + + posList = getPositionList(tabFileName) + feats = featuresIntersecting("human", posList, radius, cistype) + featkeys = feats.keys() + featkeys.sort() + for (chrom, pos) in featkeys: + print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)])) + +def getPositionList(tabFileName): + previous = "" posList = [] + tabfile = open(tabFileName) for line in tabfile: fields = line.split("\t") current = fields[0] @@ -66,11 +75,7 @@ def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100): chrom = fields[1][3:] posList.append((chrom, (int(fields[2]) + int(fields[3]))/2)) - feats = featuresIntersecting("human", posList, radius, cistype) - featkeys = feats.keys() - featkeys.sort() - for (chrom, pos) in featkeys: - print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)])) + return posList if __name__ == "__main__": diff --git a/findall.py b/findall.py index d20608f..ec75efb 100755 --- a/findall.py +++ b/findall.py @@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= 
strandfilter=None, combine5p=False): shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings) - - if trimValue is not None: - trimValue = float(trimValue) / 100. - trimString = "%2.1f%s" % ((100. * trimValue), "%") - else: - trimValue = 0.1 - trimString = "10%" - - if not doTrim: - trimString = "none" + doControl = False + if mockfile is not None: + doControl = True if doRevBackground: print "Swapping IP and background to calculate FDR" pValueType = "back" - doControl = False - if mockfile is not None: - doControl = True - doPvalue = True if ptype is not None: ptype = ptype.upper() @@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= else: pValueType = "self" - if cachePages is not None: - doCache = True - else: - doCache = False - cachePages = -1 - if withFlag != "": print "restrict to flag = %s" % withFlag @@ -242,6 +225,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= stringency = max(stringency, 1.0) writeLog(logfilename, versionString, string.join(sys.argv[1:])) + if cachePages is not None: + doCache = True + else: + doCache = False + cachePages = -1 + if doControl: print "\ncontrol:" mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache) @@ -258,13 +247,22 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= if cachePages > hitRDS.getDefaultCacheSize(): hitRDS.setDBcache(cachePages) + if trimValue is not None: + trimValue = float(trimValue) / 100. + trimString = "%2.1f%s" % ((100. * trimValue), "%") + else: + trimValue = 0.1 + trimString = "10%" + + if not doTrim: + trimString = "none" + if doAppend: fileMode = "a" else: fileMode = "w" outfile = open(outfilename, fileMode) - outfile.write("#ERANGE %s\n" % versionString) if doControl: mockRDSsize = len(mockRDS) / 1000000. 
@@ -464,7 +462,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio) if foldRatio >= minRatio: - localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True) + localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True) try: shiftDict[localshift] += 1 except KeyError: @@ -535,7 +533,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti, - normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, + normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, noMulti, doControl, factor, trimValue, outputRegionList=False): @@ -558,7 +556,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, weight = read["weight"] if abs(pos - previousHit) > maxSpacing or pos == maxCoord: sumAll = currentTotalWeight - if normalize: + if normalizeToRPM: sumAll /= rdsSampleSize regionStart = currentHitList[0] @@ -567,7 +565,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen: sumMulti = 0. 
#first pass uses getFoldRatio on mockRDS as there may not be control - foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll) + foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll) if foldRatio >= minRatio: # first pass, with absolute numbers peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue) @@ -577,8 +575,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, peakScore = peak.smoothArray[bestPos] numPlus = peak.numPlus shift = peak.shift - numLeft = peak.numLeft - if normalize: + numLeft = peak.numLeftPlus + if normalizeToRPM: peakScore /= rdsSampleSize if doTrim: @@ -605,11 +603,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, sumAll = trimPeak.numHits numPlus = trimPeak.numPlus - numLeft = trimPeak.numLeft - if normalize: + numLeft = trimPeak.numLeftPlus + if normalizeToRPM: sumAll /= rdsSampleSize - foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio) + foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) if outputRegionList: sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True) # just in case it changed, use latest data @@ -619,16 +617,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, except: continue - # normalize to RPM - if normalize: + if normalizeToRPM: peakScore /= rdsSampleSize elif outputRegionList: sumMulti = currentTotalWeight - currentUniqReadCount if outputRegionList: - # normalize to RPM - if normalize: + if normalizeToRPM: sumMulti /= rdsSampleSize try: @@ 
-759,4 +755,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, return footer if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/geneDownstreamBins.py b/geneDownstreamBins.py index 4ce97e4..91db4a1 100755 --- a/geneDownstreamBins.py +++ b/geneDownstreamBins.py @@ -14,7 +14,7 @@ import sys import optparse import ReadDataset from cistematic.genomes import Genome -from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption +from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError print "geneDownstreamBins: version 2.1" @@ -53,8 +53,6 @@ def makeParser(usage=""): def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False): - bins = 10 - standardMinThresh = standardMinDist / bins hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) normalizationFactor = 1.0 @@ -66,95 +64,114 @@ def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCac geneinfoDict = getGeneInfoDict(genome, cache=True) hg = Genome(genome) featuresDict = hg.getallGeneFeatures() - outfile = open(outfilename, "w") - gidList = hg.allGIDs() gidList.sort() for gid in gidList: - symbol = "LOC" + gid - geneinfo = "" - featureList = [] try: - geneinfo = geneinfoDict[gid] - featureList = featuresDict[gid] - symbol = geneinfo[0][0] - except: + featuresList = featuresDict[gid] + except KeyError: print gid - if len(featureList) == 0: + try: + binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist) + except ErangeError: continue - newfeatureList = [] - for (ftype, chrom, start, stop, fsense) in featureList: - if (start, stop) not in newfeatureList: - newfeatureList.append((start, stop)) + tagCount *= normalizationFactor + print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList)) + outfile.write("%s\t%s\t%.2f\t%d" % 
(gid, symbol, tagCount, geneLength)) + for binAmount in binList: + outfile.write("\t%.2f" % binAmount) - if chrom not in hitDict: - continue + outfile.write("\n") - newfeatureList.sort() - if len(newfeatureList) < 1: - continue + outfile.close() - glen = standardMinDist - if fsense == "F": - nextGene = hg.rightGeneDistance((genome, gid), glen * 2) - if nextGene < glen * 2: - glen = nextGene / 2 - if glen < 1: - glen = 1 +def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist): - gstart = newfeatureList[-1][1] - else: - nextGene = hg.leftGeneDistance((genome, gid), glen * 2) - if nextGene < glen * 2: - glen = nextGene / 2 + symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys()) + geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense) + if geneLength < standardMinDist: + raise ErangeError("gene length less than minimum") - if glen < 1: - glen = 1 + binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense) + if tagCount < 2: + raise ErangeError("tag count less than minimum") - gstart = newfeatureList[0][0] - glen - if gstart < 0: - gstart = 0 + return binList, symbol, geneLength, tagCount - tagCount = 0 - if glen < standardMinDist: - continue - binList = [0.] 
* bins - for read in hitDict[chrom]: - tagStart = read["start"] - weight = read["weight"] - tagStart -= gstart - if tagStart >= glen: - break - - if tagStart > 0: - tagCount += weight - if fsense == "F": - # we are relying on python's integer division quirk - binID = tagStart / standardMinThresh - binList[binID] += weight - else: - rdist = glen - tagStart - binID = rdist / standardMinThresh - binList[binID] += weight - - if tagCount < 2: - continue +def getFeatureList(gid, geneinfoDict, featureList, chromosomeList): + if len(featureList) == 0: + raise ErangeError("no features found") - tagCount *= normalizationFactor - print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList)) - outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen)) - for binAmount in binList: - outfile.write("\t%.2f" % binAmount) + symbol = "LOC%s" % gid + geneinfo = "" + try: + geneinfo = geneinfoDict[gid] + symbol = geneinfo[0][0] + except KeyError: + print gid - outfile.write("\n") + newfeatureList = [] + for (ftype, chrom, start, stop, sense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + + if len(newfeatureList) < 1: + raise ErangeError("no features found") + + if chrom not in chromosomeList: + raise ErangeError("chromosome not found in reads") + + newfeatureList.sort() + + return symbol, newfeatureList, sense, chrom - outfile.close() +def getGeneStats(genome, gid, minDistance, featureList, sense): + geneLength = minDistance + if sense == "F": + nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2) + geneStart = featureList[-1][1] + else: + nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2) + geneStart = max(featureList[0][0] - geneLength, 0) + + if nextGene < geneLength * 2: + geneLength = nextGene / 2 + + geneLength = max(geneLength, 1) + + return geneStart, geneLength + + +def getBinList(readList, standardMinDist, geneStart, geneLength, sense): + tagCount = 0 + bins = 10 + 
standardMinThresh = standardMinDist / bins + binList = [0.] * bins + for read in readList: + tagStart = read["start"] + if tagStart >= geneLength: + break + + tagStart -= geneStart + weight = read["weight"] + if tagStart > 0: + tagCount += weight + if sense == "F": + # we are relying on python's integer division quirk + binID = tagStart / standardMinThresh + else: + rdist = geneLength - tagStart + binID = rdist / standardMinThresh + + binList[binID] += weight + + return binList, tagCount if __name__ == "__main__": main(sys.argv) \ No newline at end of file diff --git a/geneLocusBins.py b/geneLocusBins.py index 6a1efc4..49745dc 100755 --- a/geneLocusBins.py +++ b/geneLocusBins.py @@ -12,6 +12,7 @@ except: import sys import optparse +import string from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption import ReadDataset from cistematic.genomes import Genome @@ -92,68 +93,78 @@ def getParser(usage): return parser + def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None): + + hg = Genome(genome) + gidList = hg.allGIDs() + gidList.sort() if acceptfile is None: acceptDict = {} else: acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + for chrom in acceptDict: + for region in acceptDict[chrom]: + if region.label not in gidList: + gidList.append(region.label) + + if doFlank: + locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor) + else: + locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) - hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) + hitDict = 
hitRDS.getReadsDict(doMulti=True, findallOptimize=True) readlen = hitRDS.getReadSize() normalizationFactor = 1.0 if normalizeBins: totalCount = len(hitRDS) normalizationFactor = totalCount / 1000000. - hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) - - hg = Genome(genome) - geneinfoDict = getGeneInfoDict(genome, cache=doCache) - if doFlank: - locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor) - else: - locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) - - gidList = hg.allGIDs() - gidList.sort() - for chrom in acceptDict: - for region in acceptDict[chrom]: - if region.label not in gidList: - gidList.append(region.label) - (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False) + geneinfoDict = getGeneInfoDict(genome, cache=doCache) + writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins) - outfile = open(outfilename,'w') +def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True): + outfile = open(outfilename, "w") for gid in gidList: - if 'FAR' not in gid: - symbol = 'LOC' + gid - geneinfo = '' + if "FAR" not in gid: + symbol = "LOC%s" % gid + geneinfo = "" try: geneinfo = geneinfoDict[gid] symbol = geneinfo[0][0] - except: + except KeyError: pass else: symbol = gid + if gid in gidBins and gid in gidLen: tagCount = 0. for binAmount in gidBins[gid]: tagCount += binAmount - outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid])) + + outputList = [gid, symbol, tagCount, gidLen[gid]] for binAmount in gidBins[gid]: if normalizeBins: - if tagCount == 0: - tagCount = 1 - outfile.write('\t%.1f' % (100. * binAmount / tagCount)) - else: - outfile.write('\t%.1f' % binAmount) - outfile.write('\n') + try: + normalizedValue = 100. 
* binAmount / tagCount + except ZeroDivisionError: + normalizedValue = 100. * binAmount + + binAmount = normalizedValue + + outputList.append("%.1f" % binAmount) + + outLine = string.join(outputList, "\t") + outfile.write("%s\n" % outLine) + outfile.close() if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py index 38e853a..5299d27 100755 --- a/geneMrnaCountsWeighted.py +++ b/geneMrnaCountsWeighted.py @@ -2,7 +2,7 @@ try: import psyco psyco.full() except: - print 'psyco not running' + print "psyco not running" import sys import optparse @@ -68,12 +68,6 @@ def makeParser(usage=""): return parser -#TODO: Reported user performance issue. Long run times in conditions: -# small number of reads ~40-50M -# all features on single chromosome -# -# User states has been a long time problem. - def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True, withUniqs=False, withMulti=False, acceptfile=None, cachePages=None, doVerbose=False, extendGenome="", replaceModels=False): @@ -206,7 +200,6 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, for line in uniquecounts: fields = line.strip().split() # add a pseudo-count here to ease calculations below - #TODO: figure out why this was done in prior implementation... uniqueCountDict[fields[0]] = float(fields[-1]) + 1 uniquecounts.close() @@ -271,4 +264,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict): if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/getNovelSNPs.py b/getNovelSNPs.py index 4091281..e95d726 100755 --- a/getNovelSNPs.py +++ b/getNovelSNPs.py @@ -68,7 +68,6 @@ def doNotProcessLine(line): def getNovelSNPInfo(genome, snpEntry, hg): fields = snpEntry.split() - #TODO: refactor naming. is fields[8] rpkm? 
if fields[8].find("N\A") == -1: return snpEntry else: @@ -93,4 +92,4 @@ def getNovelSNPInfo(genome, snpEntry, hg): if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/makerdsfromblat.py b/makerdsfromblat.py index 37576ca..1752040 100755 --- a/makerdsfromblat.py +++ b/makerdsfromblat.py @@ -20,6 +20,9 @@ import ReadDataset verstring = "makerdsfromblat: version 3.10" print verstring +NUM_HEADER_LINES = 5 + + def main(argv=None): if not argv: argv = sys.argv @@ -98,24 +101,15 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True, verbose=False, cachePages=100000, geneDataFileName="", propertyList=[]): - delimiter = "|" - minIntron = 10 - maxBorder = 0 - index = 0 - insertSize = 100000 - + writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:])) if forceRNA: print "forcing datatype to RNA" dataType = "RNA" - if dataType == "RNA": - genedatafile = open(geneDataFileName) - - writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:])) - geneDict = {} mapDict = {} if dataType == "RNA" and not forceRNA: + genedatafile = open(geneDataFileName) for line in genedatafile: fields = line.strip().split("\t") blockCount = int(fields[7]) @@ -164,9 +158,10 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True, # make some assumptions based on first read infile = open(filename, "r") - for arg in range(6): + for arg in range(NUM_HEADER_LINES): line = infile.readline() + line = infile.readline() fields = line.split() readsize = int(fields[10]) pairedTest = fields[9][-2:] @@ -186,8 +181,9 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True, rds.insertMetadata([("blat_mapped", "True")]) minReadScore = readsize - readsize/25 - 1 - trim = -4 + maxBorder = 0 if dataType == "RNA": + trim = -4 maxBorder = readsize + trim infile = open(filename, "r") @@ -199,9 +195,12 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True, index = uIndex 
= mIndex = sIndex = lIndex = 0 bestScore = 0 # skip headers - for arg in range(5): + for arg in range(NUM_HEADER_LINES): line = infile.readline() + insertSize = 100000 + delimiter = "|" + minIntron = 10 for line in infile: lIndex += 1 fields = line.strip().split() diff --git a/makerdsfrombowtie.py b/makerdsfrombowtie.py index 14be260..eadb66b 100755 --- a/makerdsfrombowtie.py +++ b/makerdsfrombowtie.py @@ -87,29 +87,13 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr flip=False, verbose=False, stripSpace=False, cachePages=100000, propertyList=[]): - delimiter = "|" + writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:])) + geneDict = {} dataType = "DNA" if genedatafilename is not None: dataType = "RNA" genedatafile = open(genedatafilename) - - - forcePair = False - if forceID is not None: - forcePair = True - else: - forceID = 0 - - maxBorder = 0 - index = 0 - insertSize = 100000 - - writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:])) - - geneDict = {} - mapDict = {} - if dataType == "RNA": for line in genedatafile: fields = line.strip().split("\t") blockCount = int(fields[7]) @@ -130,7 +114,6 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr totalLength += exonLengths[index] geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths) - mapDict[uname] = [] genedatafile.close() @@ -164,12 +147,17 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr fields = line.split() readsize = len(fields[5]) pairedTest = fields[0][-2:] + forcePair = False + if forceID is not None: + forcePair = True + else: + forceID = 0 + paired = False if pairedTest in ["/1", "/2"] or forcePair: print "assuming reads are paired" paired = True - print "read size: %d bp" % readsize if init: rds.insertMetadata([("readsize", readsize)]) @@ -184,8 +172,9 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr 
infile.close() - trim = -4 + maxBorder = 0 if dataType == "RNA": + trim = -4 maxBorder = readsize + trim infile = open(filename, "r") @@ -195,6 +184,8 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr mInsertList = [] sInsertList = [] index = uIndex = mIndex = sIndex = lIndex = 0 + delimiter = "|" + insertSize = 100000 for line in infile: lIndex += 1 if stripSpace: diff --git a/makerdsfromeland2.py b/makerdsfromeland2.py index 66209ee..b11848d 100755 --- a/makerdsfromeland2.py +++ b/makerdsfromeland2.py @@ -106,8 +106,6 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", insertSize = 100000 geneDict = {} - mapDict = {} - seenSpliceList = [] if dataType == 'RNA': genedatafile = open(geneDataFileName) for line in genedatafile: @@ -120,17 +118,11 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", chrom = fields[1] sense = fields[2] chromstarts = fields[8][:-1].split(',') - chromstops = fields[9][:-1].split(',') - exonLengths = [] - totalLength = 0 for index in range(blockCount): chromstarts[index] = int(chromstarts[index]) - chromstops[index] = int(chromstops[index]) - exonLengths.append(chromstops[index] - chromstarts[index]) - totalLength += exonLengths[index] - geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths) - mapDict[uname] = [] + geneDict[uname] = (sense, blockCount, chrom, chromstarts) + genedatafile.close() rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True) @@ -322,6 +314,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", print '%d unique reads' % index infile.close() + seenSpliceList = [] if dataType == 'RNA': print 'mapping splices...' 
index = 0 @@ -441,7 +434,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", print fields continue - (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model] + (sense, blockCount, chrom, chromstarts) = geneDict[model] if extended: if 'F' in thepos: rsense = '+' diff --git a/predictSpliceCount.py b/predictSpliceCount.py index bfd6e1c..2df39a9 100755 --- a/predictSpliceCount.py +++ b/predictSpliceCount.py @@ -8,6 +8,14 @@ import sys from cistematic.genomes import Genome +class GeneSymbolAndCount(): + + def __init__(self, symbol="", uniqueCount=0, spliceCount=0): + self.symbol = symbol + self.uniqueCount = uniqueCount + self.spliceCount = spliceCount + + def main(argv=None): if not argv: argv = sys.argv @@ -31,53 +39,65 @@ def main(argv=None): def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename): hg = Genome(genome) - gidDict = {} - gidList = [] - uniqueCountDict = {} - spliceCountDict = {} + gidDict = getGeneData(uniquefilecount, splicefilecount) + gidList = gidDict.keys() + gidList.sort() + + outfile = open(outfilename, "w") + for gid in gidList: + featureList = hg.getGeneFeatures((genome, gid)) + featuresizesum, splicearea = getStats(featureList, splicelead) + fractionCoverage = featuresizesum / float(splicearea + featuresizesum) + geneData = gidDict[gid] + uniqueCount = geneData.uniqueCount + expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount + + # this p-value is based on the observed unique count, not the expected total count + # nor the multi-read adjusted count + pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount) + symbol = geneData.symbol + spliceCount = geneData.spliceCount + print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount) + outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)) + outfile.close() + + +def getGeneData(uniquefilecount, 
splicefilecount): + gidDict = {} uniquefile = open(uniquefilecount) for line in uniquefile: fields = line.strip().split() - gidDict[fields[0]] = fields[1] - gidList.append(fields[0]) - uniqueCountDict[fields[0]] = int(fields[2]) + geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2])) + gidDict[fields[0]] = geneData + uniquefile.close() splicefile = open(splicefilecount) for line in splicefile: fields = line.strip().split() - spliceCountDict[fields[0]] = int(fields[2]) + gidDict[fields[0]].spliceCount = int(fields[2]) - outfile = open(outfilename,'w') + splicefile.close() - gidList.sort() - for gid in gidList: - symbol = gidDict[gid] - featureList = hg.getGeneFeatures((genome, gid)) - newfeatureList = [] - featuresizesum = 0 - for (ftype, chrom, start, stop, sense) in featureList: - if (start, stop) not in newfeatureList: - newfeatureList.append((start, stop)) - featuresizesum += stop - start + 1 + return gidDict - if featuresizesum < 1: - featuresizesum = 1 - splicearea = (len(newfeatureList) - 1) * splicelead - if splicearea < splicelead: - splicearea = 0 +def getStats(featureList, splicelead): + newfeatureList = [] + featuresizesum = 0 + for (ftype, chrom, start, stop, sense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + featuresizesum += stop - start + 1 - fractionCoverage = featuresizesum / float(splicearea + featuresizesum) - expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid] + if featuresizesum < 1: + featuresizesum = 1 - # this p-value is based on the observed unique count, not the expected total count - # nor the multi-read adjusted count - pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid]) - print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]) - outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])) + splicearea = 
(len(newfeatureList) - 1) * splicelead
+    if splicearea < splicelead:
+        splicearea = 0
 
-    outfile.close()
+    return featuresizesum, splicearea
 
 
 if __name__ == "__main__":
diff --git a/siteintersects.py b/siteintersects.py
index 4826a13..e012bb4 100755
--- a/siteintersects.py
+++ b/siteintersects.py
@@ -4,10 +4,29 @@
 #
 
 import sys
+from commoncode import regionsOverlap
 
 print "siteintersects: version 2.1"
 
 
+class Site():
+    def __init__(self, line, doExpanded=False):
+        fields = line.strip().split()
+        if doExpanded:
+            self.chrom = fields[1][3:]
+            self.start = int(fields[2])
+            self.stop = int(fields[3])
+            self.fieldList = fields[4:]
+        else:
+            (chromosome, pos) = fields[0].split(":")
+            self.chrom = chromosome[3:]
+            (start, stop) = pos.split("-")
+            self.start = int(start)
+            self.stop = int(stop)
+            self.fieldList = fields[1:]
+
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -33,115 +52,114 @@ def main(argv=None):
 
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
 
-    siteDict = {}
-    file1Dict = {}
+    siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+    commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
 
-    infile1count = 0
-    infile = open(sitefilename1)
+    if doReject and siteRejectFilename is not None:
+        writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+    print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
+    siteDict = {}
+    rejectSiteDict = {}
+    processedLineCount = 0
+    infile = open(sitefilename)
     infile.readline()
     for line in infile:
-        if line[0] == "#":
+        if 
doNotProcessLine(line): continue - infile1count += 1 - fields = line.strip().split() - if doExpanded: - chrom = fields[1][3:] - start = int(fields[2]) - stop = int(fields[3]) - rest = fields[4:] - else: - (chrom, pos) = fields[0].split(":") - chrom = chrom[3:] - (start, stop) = pos.split("-") - start = int(start) - stop = int(stop) - rest = fields[1:] + processedLineCount += 1 + site = Site(line, doExpanded) + chrom = site.chrom + start = site.start + stop = site.stop + rest = site.fieldList try: siteDict[chrom].append((start, stop, rest)) - except: + except KeyError: siteDict[chrom] = [(start, stop, rest)] if doReject: - file1Dict[str((chrom, start, stop, rest))] = line + rejectSiteDict[str((chrom, start, stop, rest))] = line infile.close() + print "file1: %d" % processedLineCount - print "file1: %d" % infile1count + return siteDict, rejectSiteDict - infile2count = 0 - infile = open(sitefilename2) + +def doNotProcessLine(line): + return line[0] == "#" + + +def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject): + processedLineCount = 0 + infile = open(siteFilename) infile.readline() commonSites = 0 - unique2List = [] + if doReject and rejectFilename is not None: + rejectfile = open(rejectFilename, "w") + else: + doReject=False + outfile = open(outfilename, "w") for line in infile: - if line[0] == "#": + if doNotProcessLine(line): continue - infile2count += 1 - fields = line.strip().split() - if doExpanded: - chrom = fields[1][3:] - start = int(fields[2]) - stop = int(fields[3]) - rest = fields[4:] - else: - (chrom, pos) = fields[0].split(":") - chrom = chrom[3:] - (start, stop) = pos.split("-") - rest = str(fields[1:]) - - start = int(start) - stop = int(stop) - mid = start + abs(stop - start)/2 + processedLineCount += 1 + site = Site(line, doExpanded) + chrom = site.chrom + start = site.start + stop = site.stop if chrom not in siteDict: if doReject: - unique2List.append(line) - continue + rejectfile.write(line) 
+ + continue - twoNotCommon = True + siteNotCommon = True for (rstart, rstop, rline) in siteDict[chrom]: - rsize = abs(rstart - rstop) /2 - rmid = rstart + abs(rstop - rstart)/2 - if abs(mid - rmid) < rsize: + if regionsOverlap(start, stop, rstart, rstop): commonSites += 1 - if twoNotCommon: - outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest)) - twoNotCommon = False + if siteNotCommon: + outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList)) + siteNotCommon = False try: if doReject: - del file1Dict[str((chrom, rstart, rstop, rline))] - except: + del rejectSiteDict[str((chrom, rstart, rstop, rline))] + except KeyError: pass - if doReject and twoNotCommon: - unique2List.append(line) + if doReject and siteNotCommon: + rejectfile.write(line) - outfile.close() + if doReject: + rejectfile.close() - print "file2: %d" % infile2count + outfile.close() - if doReject: - reject1file = open(reject1filename, "w") - reject2file = open(reject2filename, "w") + print "file2: %d" % processedLineCount - for key in file1Dict: - reject1file.write(file1Dict[key]) + return commonSites - for line in unique2List: - reject2file.write(line) - reject1file.close() - reject2file.close() +def writeRejectSiteFile(siteRejectFilename, rejectSiteDict): + rejectfile = open(siteRejectFilename, "w") - print commonSites + for key in rejectSiteDict: + rejectfile.write(rejectSiteDict[key]) + rejectfile.close() + if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/test/testBedToRegion.py b/test/testBedToRegion.py new file mode 100644 index 0000000..b5b4c03 --- /dev/null +++ b/test/testBedToRegion.py @@ -0,0 +1,65 @@ +''' +Created on Dec 2, 2010 + +@author: sau +''' +import unittest +import os +from erange import bedtoregion + + +class TestBedToRegion(unittest.TestCase): + + 
testBedFile = "erangeTestBedFile" + testRegionFile = "erangeTestRegionFile" + + + def setUp(self): + bedfile = open(self.testBedFile, "w") + bedfile.write("tab\tdelimited\tfields\n") + bedfile.write("space delimited fields\n") + bedfile.write("track\n") + bedfile.write("line after track will not be processed\n") + bedfile.close() + + + def tearDown(self): + try: + os.remove(self.testBedFile) + except OSError: + print "bed file does not exist" + + try: + os.remove(self.testRegionFile) + except OSError: + print "region file does not exist" + + + def testBedToRegion(self): + bedtoregion.bedToRegion("regionLabel", self.testBedFile, self.testRegionFile) + resultFile = open(self.testRegionFile) + regionList = resultFile.readlines() + self.assertEquals(2, len(regionList)) + self.assertEquals("regionLabel1\ttab\tdelimited\tfields\n", regionList[0]) + self.assertEquals("regionLabel2\tspace\tdelimited\tfields\n", regionList[1]) + + + def testMain(self): + argv = ["bedtoregion"] + self.assertRaises(SystemExit, bedtoregion.main, argv) + argv = ["bedtoregion", "regionLabel", self.testBedFile, self.testRegionFile] + bedtoregion.main(argv) + resultFile = open(self.testRegionFile) + self.assertEquals(2, len(resultFile.readlines())) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestBedToRegion)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testBinsToCdf.py b/test/testBinsToCdf.py new file mode 100644 index 0000000..688e688 --- /dev/null +++ b/test/testBinsToCdf.py @@ -0,0 +1,66 @@ +''' +Created on Dec 2, 2010 + +@author: sau +''' +import unittest +import os +from erange import binstocdf + + +class TestBinsToCdf(unittest.TestCase): + + testInputFile = "erangeTestBinFile" + testOutputFile = "erangeTestCDFFile" + + + def setUp(self): + binfile = open(self.testInputFile, "w") + binfile.write("field1\tfield2\t10\tfield4\t2\t3\n") + 
binfile.write("field1\tfield2\t20\tfield4\t1\t4\n") + binfile.write("field1\tfield2\t0\tfield4\n") + binfile.write("too short\n") + binfile.close() + + + def tearDown(self): + try: + os.remove(self.testInputFile) + except OSError: + print "bin file does not exist" + + try: + os.remove(self.testOutputFile) + except OSError: + print "cdf file does not exist" + + + def testBinsToCdf(self): + binstocdf.binToCDF(self.testInputFile, self.testOutputFile) + resultFile = open(self.testOutputFile) + resultList = resultFile.readlines() + self.assertEquals(3, len(resultList)) + self.assertEquals("field1\tfield2\t10\tfield4\t20\t50\n", resultList[0]) + self.assertEquals("field1\tfield2\t20\tfield4\t5\t25\n", resultList[1]) + self.assertEquals("field1\tfield2\t0\tfield4\n", resultList[2]) + + + def testMain(self): + argv = ["binstocdf"] + self.assertRaises(SystemExit, binstocdf.main, argv) + argv = ["binstocdf", self.testInputFile, self.testOutputFile] + binstocdf.main(argv) + resultFile = open(self.testOutputFile) + self.assertEquals(3, len(resultFile.readlines())) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestBinsToCdf)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testCdfDist.py b/test/testCdfDist.py new file mode 100644 index 0000000..9048749 --- /dev/null +++ b/test/testCdfDist.py @@ -0,0 +1,58 @@ +''' +Created on Dec 2, 2010 + +@author: sau +''' +import unittest +import os +from erange import cdfdist + + +class TestCdfDist(unittest.TestCase): + + testInputFile = "erangeTestCDFFile" + + + def setUp(self): + cdffile = open(self.testInputFile, "w") + cdffile.write("line1 30 60\n") + cdffile.write("line2 90 99\n") + cdffile.write("line3 5 80\n") + cdffile.write("line4 10 14\n") + cdffile.close() + + + def tearDown(self): + try: + os.remove(self.testInputFile) + except OSError: + print "cdf file does not exist" + + + def 
testBinsToCdf(self): + bins = 2 + self.assertEquals([2, 2], cdfdist.cdfDist(bins, 10, self.testInputFile)) + self.assertEquals([1, 2], cdfdist.cdfDist(bins, 50, self.testInputFile)) + self.assertEquals([1, 0], cdfdist.cdfDist(bins, 89, self.testInputFile)) + self.assertEquals([0, 1], cdfdist.cdfDist(bins, 91, self.testInputFile)) + + + def testMain(self): + bins = 2 + percent = 50 + argv = ["cdfdist"] + self.assertRaises(SystemExit, cdfdist.main, argv) + argv = ["cdfdist", bins, percent, self.testInputFile] + cdfdist.main(argv) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestCdfDist)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testChksnp.py b/test/testChksnp.py index b41fe65..0fbbb7b 100644 --- a/test/testChksnp.py +++ b/test/testChksnp.py @@ -23,6 +23,8 @@ class TestChksnp(unittest.TestCase): snpDB = "%s/dbSNP128.db" % dbPath altSnpDB = "%s/snp129cDNA.db" % dbPath + inputFileName = "testChkSNP_input.txt" + outputFileName = "testChkSNP_output.txt" def setUp(self): pass @@ -33,8 +35,7 @@ class TestChksnp(unittest.TestCase): def testChkSNPFile(self): - inputFileName = "testChkSNP_input.txt" - infile = open(inputFileName, "w") + infile = open(self.inputFileName, "w") infile.write("# header line\n") snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t") infile.write("%s\n" % snpEntry) @@ -42,10 +43,8 @@ class TestChksnp(unittest.TestCase): infile.write("%s\n" % snpEntry) infile.close() - outputFileName = "testChkSNP_output.txt" - - chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName) - outfile = open(outputFileName, "r") + chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName) + outfile = open(self.outputFileName, "r") line = outfile.readline() result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n" self.assertEquals(result, line) @@ -53,10 +52,10 @@ class TestChksnp(unittest.TestCase): 
line = outfile.readline() self.assertEquals(result, line) outfile.close() - os.remove(outputFileName) + os.remove(self.outputFileName) - chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB]) - outfile = open(outputFileName, "r") + chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName, snpDBList=[self.altSnpDB]) + outfile = open(self.outputFileName, "r") line = outfile.readline() result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n" self.assertEquals(result, line) @@ -65,13 +64,12 @@ class TestChksnp(unittest.TestCase): self.assertEquals(result, line) outfile.close() - os.remove(inputFileName) - os.remove(outputFileName) + os.remove(self.inputFileName) + os.remove(self.outputFileName) def testMain(self): - inputFileName = "testChkSNP_input.txt" - infile = open(inputFileName, "w") + infile = open(self.inputFileName, "w") infile.write("# header line\n") snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t") infile.write("%s\n" % snpEntry) @@ -79,11 +77,9 @@ class TestChksnp(unittest.TestCase): infile.write("%s\n" % snpEntry) infile.close() - outputFileName = "testChkSNP_output.txt" - - argv = ["chksnp", self.snpDB, inputFileName, outputFileName] + argv = ["chksnp", self.snpDB, self.inputFileName, self.outputFileName] chksnp.main(argv) - outfile = open(outputFileName, "r") + outfile = open(self.outputFileName, "r") line = outfile.readline() result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n" self.assertEquals(result, line) @@ -91,7 +87,10 @@ class TestChksnp(unittest.TestCase): line = outfile.readline() self.assertEquals(result, line) outfile.close() - os.remove(outputFileName) + + os.remove(self.inputFileName) + os.remove(self.outputFileName) + def testChkSNP(self): snpPropertiesList = [] diff --git a/test/testColsum.py b/test/testColsum.py new file mode 100644 index 0000000..5c149e7 --- /dev/null +++ b/test/testColsum.py @@ -0,0 +1,56 @@ +''' +Created on Dec 2, 2010 + +@author: sau +''' +import unittest +import 
os +from erange import colsum + + +class TestColsum(unittest.TestCase): + + testInputFile = "erangeTestColsumFile" + + + def setUp(self): + colfile = open(self.testInputFile, "w") + colfile.write("line1 30 60.5\n") + colfile.write("line2 90 99\n") + colfile.write("line3 5 80\n") + colfile.write("line4 10 1\n") + colfile.close() + + + def tearDown(self): + try: + os.remove(self.testInputFile) + except OSError: + print "cdf file does not exist" + + + def testBinsToCdf(self): + self.assertEquals(0, colsum.colsum(0, self.testInputFile)) + self.assertEquals(135, colsum.colsum(1, self.testInputFile)) + self.assertEquals(240.5, colsum.colsum(2, self.testInputFile)) + self.assertEquals(0, colsum.colsum(3, self.testInputFile)) + + + def testMain(self): + field = 1 + argv = ["colsum"] + self.assertRaises(SystemExit, colsum.main, argv) + argv = ["colsum", field, self.testInputFile] + colsum.main(argv) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestColsum)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testCommoncode.py b/test/testCommoncode.py index 43eb96b..ce8be55 100644 --- a/test/testCommoncode.py +++ b/test/testCommoncode.py @@ -8,6 +8,7 @@ import os import string from array import array from erange import commoncode +from erange import Region from cistematic.genomes import Genome @@ -122,8 +123,10 @@ class TestCommoncode(unittest.TestCase): regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") testfile.write(regionEntry) testfile.close() - result = {"1": [(10, 20, 10)]} - self.assertEquals(result, commoncode.getMergedRegions("regionTestFile")) + result = commoncode.getMergedRegions("regionTestFile") + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) os.remove("regionTestFile") @@ -132,108 +135,277 @@ class 
TestCommoncode(unittest.TestCase): regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") regionList = [regionEntry] - result = {"1": [(10, 20, 10)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList)) - result = {"1": [(5, 25, 20)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5)) - result = {"1": [(12, 18, 6)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2)) - result = {"chr1": [(10, 20, 10)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True)) + result = commoncode.getMergedRegionsFromList(regionList) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + + result = commoncode.getMergedRegionsFromList(regionList, pad=5) + self.assertEquals(5, result["1"][0].start) + self.assertEquals(25, result["1"][0].stop) + self.assertEquals(20, result["1"][0].length) + + result = commoncode.getMergedRegionsFromList(regionList, pad=-2) + self.assertEquals(12, result["1"][0].start) + self.assertEquals(18, result["1"][0].stop) + self.assertEquals(6, result["1"][0].length) + + result = commoncode.getMergedRegionsFromList(regionList, fullChrom=True) + self.assertEquals(10, result["chr1"][0].start) + self.assertEquals(20, result["chr1"][0].stop) + self.assertEquals(10, result["chr1"][0].length) regionEntry = string.join(["1", "chr1:10-20", "5"], "\t") regionList = [regionEntry] - result = {"1": [(10, 20, 10)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2)) + result = commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") regionList = [regionEntry] regionEntry = 
string.join(["2", "chr1", "15", "40", "10"], "\t") regionList.append(regionEntry) - result = {"1": [(10, 40, 30)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList)) - result = {"1": [(10, 20, 10), (15, 40, 25)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False)) - result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True)) + result = commoncode.getMergedRegionsFromList(regionList) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(40, result["1"][0].stop) + self.assertEquals(30, result["1"][0].length) + + result = commoncode.getMergedRegionsFromList(regionList, doMerge=False) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(15, result["1"][1].start) + self.assertEquals(40, result["1"][1].stop) + self.assertEquals(25, result["1"][1].length) + + result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals("1", result["1"][0].label) + self.assertEquals(15, result["1"][1].start) + self.assertEquals(40, result["1"][1].stop) + self.assertEquals(25, result["1"][1].length) + self.assertEquals("2", result["1"][1].label) regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t") regionList = [regionEntry] regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t") regionList.append(regionEntry) - result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2)) + result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, 
keepLabel=True, chromField=2) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals("1\tspacer", result["1"][0].label) + self.assertEquals(15, result["1"][1].start) + self.assertEquals(40, result["1"][1].stop) + self.assertEquals(25, result["1"][1].length) + self.assertEquals("2\tspacer2", result["1"][1].label) regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") regionList = [regionEntry] regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t") regionList.append(regionEntry) - result = {"1": [(10, 20, 10), (2030, 2040, 10)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList)) - result = {"1": [(10, 2040, 2030)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000)) - result = {"1": [(10, 20, 10), (2030, 2040, 10)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5)) - result = {"1": [(2030, 2040, 10)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12)) - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1)) + result = commoncode.getMergedRegionsFromList(regionList) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(2030, result["1"][1].start) + self.assertEquals(2040, result["1"][1].stop) + self.assertEquals(10, result["1"][1].length) + + result = commoncode.getMergedRegionsFromList(regionList, maxDist=3000) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(2040, result["1"][0].stop) + self.assertEquals(2030, result["1"][0].length) + + result = commoncode.getMergedRegionsFromList(regionList, minHits=5) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + 
self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        result = commoncode.getMergedRegionsFromList(regionList, returnTop=1)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"2": [(15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+        result = commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
 
         self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, 
result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + self.assertEquals("1", result["1"][0].label) + regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t") regionList.append(regionEntry) - result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)) + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + self.assertEquals("1", result["1"][0].label) + self.assertEquals(15, result["2"][0].start) + self.assertEquals(40, result["2"][0].stop) + self.assertEquals(25, result["2"][0].length) + self.assertEquals(32, result["2"][0].peakPos) + self.assertEquals(17, result["2"][0].peakHeight) + self.assertEquals("2", result["2"][0].label) + regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t") regionList.append(regionEntry) - result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]} - self.assertEquals(result, 
commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)) + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(40, result["1"][0].stop) + self.assertEquals(30, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + self.assertEquals("3", result["1"][0].label) + self.assertEquals(15, result["2"][0].start) + self.assertEquals(40, result["2"][0].stop) + self.assertEquals(25, result["2"][0].length) + self.assertEquals(32, result["2"][0].peakPos) + self.assertEquals(17, result["2"][0].peakHeight) + self.assertEquals("2", result["2"][0].label) + regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t") regionList.append(regionEntry) - result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)) - result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)], - "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)] - } - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False)) + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(40, result["1"][0].stop) + self.assertEquals(30, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + self.assertEquals("3", result["1"][0].label) + self.assertEquals(15, result["2"][0].start) + self.assertEquals(88, result["2"][0].stop) + self.assertEquals(73, result["2"][0].length) + self.assertEquals(32, result["2"][0].peakPos) + self.assertEquals(17, result["2"][0].peakHeight) + self.assertEquals("4", result["2"][0].label) + + result = 
commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + self.assertEquals("1", result["1"][0].label) + self.assertEquals(15, result["1"][1].start) + self.assertEquals(40, result["1"][1].stop) + self.assertEquals(25, result["1"][1].length) + self.assertEquals(32, result["1"][1].peakPos) + self.assertEquals(17, result["1"][1].peakHeight) + self.assertEquals("3", result["1"][1].label) + self.assertEquals(15, result["2"][0].start) + self.assertEquals(40, result["2"][0].stop) + self.assertEquals(25, result["2"][0].length) + self.assertEquals(32, result["2"][0].peakPos) + self.assertEquals(17, result["2"][0].peakHeight) + self.assertEquals("2", result["2"][0].label) + self.assertEquals(65, result["2"][1].start) + self.assertEquals(88, result["2"][1].stop) + self.assertEquals(23, result["2"][1].length) + self.assertEquals(72, result["2"][1].peakPos) + self.assertEquals(7, result["2"][1].peakHeight) + self.assertEquals("4", result["2"][1].label) regionList = ["# comment"] regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t") regionList.append(regionEntry) - result = {"1": [(10, 20, 10, 3, 40)]} - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + regionList = ["# pvalue"] regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t") regionList.append(regionEntry) - result = {"1": [(10, 20, 10, 3, 40)]} - 
self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)) + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + regionList = ["# readShift"] regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t") regionList.append(regionEntry) - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)) + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + regionList = ["# pvalue readShift"] regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t") regionList.append(regionEntry) - 
self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) - self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)) + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + + result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1) + self.assertEquals(10, result["1"][0].start) + self.assertEquals(20, result["1"][0].stop) + self.assertEquals(10, result["1"][0].length) + self.assertEquals(3, result["1"][0].peakPos) + self.assertEquals(40, result["1"][0].peakHeight) + #Test fails - the header line is required if there are fields after the peak which isn't so good #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True)) @@ -274,28 +446,61 @@ class TestCommoncode(unittest.TestCase): #TODO: write test def testFindPeak(self): hitList = [] - result = ([], 0.0, array("f"), 0.0) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 0)) + result = commoncode.findPeak(hitList, 0, 0) + self.assertEquals([], result.topPos) + self.assertEquals(0.0, result.numHits) + self.assertEquals(array("f"), result.smoothArray) + self.assertEquals(0.0, result.numPlus) hitList= [{"start": 4, "sense": "+", "weight": 0.5}] - result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 10)) - result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True)) - result = ([0, 1, 2, 3, 4, 
5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto")) - result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True)) + result = commoncode.findPeak(hitList, 0, 10) + self.assertEquals([6, 7], result.topPos) + self.assertEquals(1.0, result.numHits) + self.assertEquals(array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray) + self.assertEquals(1.0, result.numPlus) + + result = commoncode.findPeak(hitList, 0, 10, doWeight=True) + self.assertEquals([6, 7], result.topPos) + self.assertEquals(0.5, result.numHits) + self.assertEquals(array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), result.smoothArray) + self.assertEquals(0.5, result.numPlus) + + result = commoncode.findPeak(hitList, 0, 10, shift="auto") + self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos) + self.assertEquals(0.0, result.numHits) + self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray) + self.assertEquals(0.0, result.numPlus) + + result = commoncode.findPeak(hitList, 0, 10, shift="auto") + self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos) + self.assertEquals(0.0, result.numHits) + self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray) + self.assertEquals(0.0, result.numPlus) + self.assertEquals(6, result.shift) hitList= [{"start": 4, "sense": "+", "weight": 0.5}] - result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 
10, shift=3, returnShift=True)) + result = commoncode.findPeak(hitList, 0, 10, shift=3) + self.assertEquals([7], result.topPos) + self.assertEquals(1.0, result.numHits) + self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray) + self.assertEquals(1.0, result.numPlus) + self.assertEquals(3, result.shift) hitList= [{"start": 4, "sense": "+", "weight": 0.5}] - result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True)) - result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3) - self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True)) + result = commoncode.findPeak(hitList, 0, 10, leftPlus=True) + self.assertEquals([6, 7], result.topPos) + self.assertEquals(1.0, result.numHits) + self.assertEquals(array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray) + self.assertEquals(1.0, result.numPlus) + self.assertEquals(1.0, result.numLeftPlus) + + result = commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3) + self.assertEquals([7], result.topPos) + self.assertEquals(1.0, result.numHits) + self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray) + self.assertEquals(1.0, result.numPlus) + self.assertEquals(1.0, result.numLeftPlus) + self.assertEquals(3, result.shift) #TODO: write test @@ -354,10 +559,13 @@ class TestCommoncode(unittest.TestCase): } self.assertEquals(result, complementDict) - regionDict = {"I": [("new feature", 100, 150, 50)]} + additionalRegion = Region.Region(100, 150) + 
additionalRegion.label = "new feature" + regionDict = {"I": [additionalRegion]} featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict) result = (100, 150, "new feature", "+", "custom") self.assertEquals(result, featureDict["I"][0]) + def testGetLocusByChromDict(self): @@ -382,7 +590,9 @@ class TestCommoncode(unittest.TestCase): self.assertTrue(chrom in self.celegansChroms) self.assertEquals(firstLoci[chrom], locusDict[chrom][0]) - regionDict = {"I": [("new region", 100, 150, 50)]} + additionalRegion = Region.Region(100, 150) + additionalRegion.label = "new region" + regionDict = {"I": [additionalRegion]} locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict) self.assertEquals((100, 150, "new region", 50), locusDict["I"][0]) locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True) diff --git a/test/testErange.py b/test/testErange.py index 4455fff..d993fad 100644 --- a/test/testErange.py +++ b/test/testErange.py @@ -11,8 +11,13 @@ Created on Sep 8, 2010 import sys import unittest import testAnalyzeGO +import testBedToRegion +import testBinsToCdf +import testCdfDist import testChksnp +import testColsum import testCommoncode +import testFeatureIntersects import testGeneMrnaCounts import testGeneMrnaCountsWeighted #import testGetFasta @@ -23,6 +28,7 @@ import testMakeBamFromRds import testmakebedfromrds #import testMakeGraphs import testMakeRdsFromBam +import testMakeSiteTrack import testMakeSNPTrack import testMarkLinkers import testPeak @@ -41,8 +47,13 @@ def main(argv=None): suite = unittest.TestSuite() suite.addTest(testAnalyzeGO.suite()) + suite.addTest(testBedToRegion.suite()) + suite.addTest(testBinsToCdf.suite()) + suite.addTest(testCdfDist.suite()) suite.addTest(testChksnp.suite()) + suite.addTest(testColsum.suite()) suite.addTest(testCommoncode.suite()) + suite.addTest(testFeatureIntersects.suite()) 
suite.addTest(testGeneMrnaCounts.suite()) suite.addTest(testGeneMrnaCountsWeighted.suite()) #suite.addTest(testGetFasta.suite()) @@ -53,6 +64,7 @@ def main(argv=None): suite.addTest(testmakebedfromrds.suite()) #suite.addTest(testMakeGraphs.suite()) suite.addTest(testMakeRdsFromBam.suite()) + suite.addTest(testMakeSiteTrack.suite()) suite.addTest(testMakeSNPTrack.suite()) suite.addTest(testMarkLinkers.suite()) suite.addTest(testPeak.suite()) diff --git a/test/testFeatureIntersects.py b/test/testFeatureIntersects.py new file mode 100644 index 0000000..9633f8c --- /dev/null +++ b/test/testFeatureIntersects.py @@ -0,0 +1,73 @@ + +import unittest +import os +from erange import featureIntersects + + +class TestFeatureIntersects(unittest.TestCase): + + testInputFile = "erangeTestFeatureIntersects" + + + def setUp(self): + cdffile = open(self.testInputFile, "w") + cdffile.write("line1\tchr1\t30\t60\n") + cdffile.write("line2\tchr1\t90\t99\n") + cdffile.write("line3\tchr1\t5\t80\n") + cdffile.write("line4\tchr2\t10\t14\n") + cdffile.write("line4\tnot to be processed\n") + cdffile.write("line5\tchr2\t10\t14\n") + cdffile.close() + + + def tearDown(self): + try: + os.remove(self.testInputFile) + except OSError: + print "position file does not exist" + + + #TODO: write test + def testFeatureIntersects(self): + pass + + + #TODO: write test + def testGetPositionList(self): + pass + + + #TODO: write test + def testMain(self): + argv = ["cdfdist"] + self.assertRaises(SystemExit, featureIntersects.main, argv) + argv = ["cdfdist", self.testInputFile] + featureIntersects.main(argv) + + + def testMakeParser(self): + parser = featureIntersects.makeParser("") + argv = [] + (options, args) = parser.parse_args(argv) + self.assertEqual([], args) + argv = [self.testInputFile] + (options, args) = parser.parse_args(argv) + self.assertEqual(self.testInputFile, args[0]) + self.assertEqual("TFBSCONSSITES", options.cistype) + self.assertEqual(100, options.radius) + argv = [self.testInputFile, 
"--cistype", "test", "--radius", "50"] + (options, args) = parser.parse_args(argv) + self.assertEqual("test", options.cistype) + self.assertEqual(50, options.radius) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestFeatureIntersects)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testMakeSiteTrack.py b/test/testMakeSiteTrack.py new file mode 100644 index 0000000..7e609e7 --- /dev/null +++ b/test/testMakeSiteTrack.py @@ -0,0 +1,94 @@ +''' +Created on Dec 2, 2010 + +@author: sau +''' +import unittest +import os +from erange import makesitetrack + + +class TestMakeSiteTrack(unittest.TestCase): + + testInputFile = "erangeTestSiteFile" + testOutputFile = "erangeTestBedFile" + + + def setUp(self): + self.createCompactInputFile() + + + def tearDown(self): + try: + os.remove(self.testInputFile) + except OSError: + print "site file does not exist" + + try: + os.remove(self.testOutputFile) + except OSError: + print "bed file does not exist" + + + def createStandardInputFile(self): + siteFile = open(self.testInputFile, "w") + siteFile.write("#comment line\n") + siteFile.write("name\tchr1\t100\t175\t11\t+\t-\n") + siteFile.close() + + + def createCompactInputFile(self): + siteFile = open(self.testInputFile, "w") + siteFile.write("#comment line\n") + siteFile.write("chr1:100-175\t11\t+\t-\n") + siteFile.close() + + + def testMakeSiteTrack(self): + self.createCompactInputFile() + makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile) + bedfile = open(self.testOutputFile) + header = 'track name="" visibility=4 itemRgb="On"\n' + self.assertEquals(header, bedfile.readline()) + result = "chr1\t100\t176\t-1\t11\t+\t-\t-\t0,0,0\n" + self.assertEquals(result, bedfile.readline()) + + makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, append=True, noHeader=True) + bedfile = open(self.testOutputFile) + 
header = bedfile.readline() + firstEntry = bedfile.readline() + self.assertEquals(firstEntry, bedfile.readline()) + + + def testNonCompactMakeSiteTrack(self): + try: + os.remove(self.testInputFile) + except OSError: + print "site file does not exist" + + self.createStandardInputFile() + makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, compact=False, color="255,255,255") + bedfile = open(self.testOutputFile) + bedfile.readline() + result = "chr1\t100\t176\t-1\t1.0\t+\t-\t-\t255,255,255\n" + self.assertEquals(result, bedfile.readline()) + + + def testMain(self): + self.createCompactInputFile() + argv = ["makesitetrack"] + self.assertRaises(SystemExit, makesitetrack.main, argv) + argv = ["makesitetrack", self.testInputFile, self.testOutputFile] + makesitetrack.main(argv) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestMakeSiteTrack)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/weighMultireads.py b/weighMultireads.py index f4b1691..9fd041d 100755 --- a/weighMultireads.py +++ b/weighMultireads.py @@ -168,7 +168,6 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False): chrom = "chr%s" % multiReadList[index][1] reweighList.append((round(score,3), chrom, start, theID)) - #TODO: Is this right? If match index is 0 are we doing nothing? if theMatch > 0: RDS.reweighMultireads(reweighList) fixedPair += 1 @@ -330,4 +329,4 @@ def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False): if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv)