From: Sean Upchurch Date: Fri, 8 Oct 2010 23:32:13 +0000 (-0700) Subject: snapshot of 4.0a development. initial git repo commit X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=commitdiff_plain;h=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 snapshot of 4.0a development. initial git repo commit --- 5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py new file mode 100644 index 0000000..935a04e --- /dev/null +++ b/MakeBamFromRds.py @@ -0,0 +1,281 @@ +""" +MakeBamFromRds + +Converts ERANGE RDS zero based file to Bam zero based format. + +Usage: python MakeBamFromRDS.py rdsFile bamFile [options] + +""" + +try: + import psyco + psyco.full() +except: + pass + +import sys +import re +import optparse +import random +import pysam +from commoncode import readDataset + + +def main(argv=None): + if not argv: + argv = sys.argv + + verstring = "MakeBamFromRds: version 1.0" + print verstring + + doPairs = False + + usage = "usage: python %prog rdsFile bamFile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--nouniq", action="store_false", dest="withUniqs") + parser.add_option("--nomulti", action="store_false", dest="withMulti") + parser.add_option("--splices", action="store_true", dest="doSplices") + parser.add_option("--flag", dest="withFlag") + parser.add_option("--flaglike", action="store_true", dest="useFlagLike") + parser.add_option("--pairs", action="store_true", dest="doPairs") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--enforceChr", action="store_true", dest="enforceChr") + parser.add_option("--chrom", action="append", dest="chromList") + parser.add_option("--fasta", dest="fastaFileName") + parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, + doPairs=False, withFlag="", useFlagLike=False, enforceChr=False, + doCache=False, cachePages=100000, fastaFileName="", + chromList=[]) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) + + rdsfile = args[0] + outfilename = args[1] + + allChrom = True + if options.chromList: + allChrom = False + + makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti, + options.doSplices, doPairs, options.withFlag, options.useFlagLike, + options.enforceChr, allChrom, options.doCache, options.cachePages, + options.chromList, options.fastaFileName) + + +def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True, + doSplices=False, doPairs=False, withFlag="", + useFlagLike=False, enforceChr=False, allChrom=True, + doCache=False, cachePages=100000, chromList=[], fastaFileName=""): + + if not withUniqs and not withMulti and not doSplices: + print "must be outputting at least one of uniqs, multi, or -splices - exiting" + sys.exit(1) + + print "\nsample:" + RDS = readDataset(rdsfile, verbose = True, cache=doCache) + + if cachePages > RDS.getDefaultCacheSize(): + RDS.setDBcache(cachePages) + + readlength = RDS.getReadSize() + + if allChrom: + if withUniqs: + chromList = RDS.getChromosomes() + elif withMulti: + chromList = RDS.getChromosomes(table="multi") + else: + chromList = RDS.getChromosomes(table="splices") + + chromList.sort() + + fastaSequenceDict = {} + if fastaFileName: + fastafile = open(fastaFileName) + fastaSequenceDict = getFastaSequenceDictionary(fastaFileName) + fastafile.close() + + referenceSequenceList = [] + chromRemoveList = [] + for chromosome in chromList: + if doNotOutputChromosome(chromosome, enforceChr): + chromRemoveList.append(chromosome) + 
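+            # chromosomes flagged here are removed from chromList once this loop completes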
else: + chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices) + referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)} + referenceSequenceList.append(referenceDataDict) + + for chrom in chromRemoveList: + chromList.remove(chrom) + + header = {"HD": {"VN": "1.0"}} + if referenceSequenceList: + header["SQ"] = referenceSequenceList + + outfile = pysam.Samfile(outfilename, "wb", header=header) + + totalWrites = 0 + noncanonicalSplices = 0 + for chrom in chromList: + index = 0 + print "chromosome %s" % (chrom) + if withUniqs or withMulti: + hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True, + withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, + flagLike=useFlagLike, entryDict=True) + + for read in hitDict[chrom]: + writeBAMEntry(outfile, chrom, read, readlength) + index += 1 + + if doSplices: + numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict) + index += numSpliceReadsWritten + noncanonicalSplices += noncanonical + + print index + totalWrites += index + + outfile.close() + print "%d total reads written" % totalWrites + print "%d non-canonical splices" % noncanonicalSplices + + +def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}): + index = 0 + noncanonicalSplices = 0 + spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True) + if chrom not in spliceDict: + pass + else: + for read in spliceDict[chrom]: + if fastaSequenceDict.has_key(chrom): + read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"]) + noncanonicalSplices += noncanonical + + writeBAMEntry(outfile, chrom, read, readlength) + index += 1 + + return index, noncanonicalSplices + + +def writeBAMEntry(outfile, chrom, outputDict, readlength): + tagList = [] + alignedRead = pysam.AlignedRead() + alignedRead.qname = outputDict["readID"] + if outputDict["sense"] == "-": + alignedRead.is_reverse = True + + alignedRead.rname = outfile.references.index(chrom) + + if outputDict.has_key("startL"): + startL = outputDict["startL"] + stopL = outputDict["stopL"] + startR = outputDict["startR"] + stopR = outputDict["stopR"] + alignedRead.pos = startL + alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)] + tagList.append(("XS", outputDict["sense"])) + else: + alignedRead.pos = outputDict["start"] + alignedRead.cigar = [(0, readlength)] + + if outputDict.has_key("pairID"): + pairID = outputDict["pairID"] + if pairID == "1": + alignedRead.is_read1 = True + alignedRead.is_proper_pair = True + elif pairID == "2": + alignedRead.is_read2 = True + alignedRead.is_proper_pair = True + else: + pass + + if outputDict.has_key("mismatch"): + mismatchTag = getMismatches(outputDict["mismatch"]) + if mismatchTag: + tagList.append(("MD", mismatchTag)) + + if tagList: + alignedRead.tags = tagList + + outfile.write(alignedRead) + + +def getMismatches(mismatchString): + mismatch = "" + positions = re.findall("\d+", mismatchString) + nucleotides = re.findall("([ACGTN])\d+", mismatchString) + for index in range(0, len(positions)): + mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index]) + + return mismatch + + +def doNotOutputChromosome(chrom, enforceChr): + result = False + + if chrom == 
"chrM": + result = True + + if enforceChr and ("chr" not in chrom): + result = True + + return result + + +def getFastaSequenceDictionary(fastaFileName): + fastaSeqDict = {} + fchrom = "" + fseq = "" + + fastafile = open(fastaFileName) + for line in fastafile: + if line[0] == ">": + if fchrom != "": + fastaSeqDict[fchrom] = fseq + + fseq = "" + fchrom = line[1:-1] + else: + fseq += line.strip() + + fastafile.close() + + return fastaSeqDict + + +def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""): + spliceSense = {"GTAG": "+", + "GCAG": "+", + "ATAC": "+", + "CTAC": "-", + "CTGC": "-", + "GTAT": "-" + } + + noncanonical = 0 + intronstart = stopLeft + intronlen = startRight - stopLeft + leftJunctionSig =fastaSequence[intronstart:intronstart+2] + rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen] + spliceJunction = leftJunctionSig + rightJunctionSig + spliceJunction = spliceJunction.upper() + if spliceSense.has_key(spliceJunction): + sense = spliceSense[spliceJunction] + else: + noncanonical += 1 + senses = ["+", "-"] + random.shuffle(senses) + sense = senses[0] + + return sense, noncanonical + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/MakeRdsFromBam.py b/MakeRdsFromBam.py new file mode 100644 index 0000000..e9df847 --- /dev/null +++ b/MakeRdsFromBam.py @@ -0,0 +1,397 @@ +""" +MakeRdsFromBam + +Created on Jun 3, 2010 + +@author: sau +""" + +try: + import psyco + psyco.full() +except: + pass + +import sys, string, optparse, re +import pysam +from commoncode import readDataset, writeLog + +verstring = "%prog: version 1.0" + + +def main(argv=None): + if not argv: + argv = sys.argv + + print verstring + + usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options]\ + \ninput reads must be sorted to properly record multireads" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--append", action="store_false", dest="init", + help="append to existing rds file [default: create new]") + parser.add_option("--RNA", action="store_true", dest="rnaDataType", + help="set data type to RNA [default: DNA]") + parser.add_option("-S", "--sam", action="store_true", dest="useSamFile", + help="input file is in sam format") + parser.add_option("--index", action="store_true", dest="doIndex", + help="index the output rds file") + parser.add_option("--cache", type="int", dest="cachePages", + help="number of cache pages to use [default: 100000") + parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount", + help="multi counts over this value are discarded [default: 10]") + parser.add_option("--rawreadID", action="store_false", dest="trimReadID", + help="use the raw read names") + parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000, + maxMultiReadCount=10, rnaDataType=False, trimReadID=True) + + (options, args) = parser.parse_args(argv[1:]) + + try: + label = args[0] + except IndexError: + print "no label specified - see --help for usage" + sys.exit(1) + + try: + samFileName = args[1] + except IndexError: + print "no samfile specified - see --help for usage" + sys.exit(1) + + try: + outDbName = args[2] + except IndexError: + print "no outrdsfile specified - see --help for usage" + sys.exit(1) + + makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile, + options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID) + + +def makeRdsFromBam(label, samFileName, outDbName, 
init=True, doIndex=False, useSamFile=False, + cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True): + + if useSamFile: + fileMode = "r" + else: + fileMode = "rb" + + try: + samfile = pysam.Samfile(samFileName, fileMode) + except ValueError: + print "samfile index not found" + sys.exit(1) + + if rnaDataType: + dataType = "RNA" + else: + dataType = "DNA" + + writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:])) + + rds = readDataset(outDbName, init, dataType, verbose=True) + if not init and doIndex: + try: + if rds.hasIndex(): + rds.dropIndex() + except: + pass + + if "sam_mapped" not in rds.getMetadata(): + rds.insertMetadata([("sam_mapped", "True")]) + + defaultCacheSize = rds.getDefaultCacheSize() + + if cachePages > defaultCacheSize: + if init: + rds.setDBcache(cachePages, default=True) + else: + rds.setDBcache(cachePages) + + propertyList = [] + for arg in sys.argv: + if "::" in arg: + (pname, pvalue) = arg.strip().split("::") + propertyList.append((pname, pvalue)) + + if len(propertyList) > 0: + rds.insertMetadata(propertyList) + + countReads = {"unmapped": 0, + "total": 0, + "unique": 0, + "multi": 0, + "multiDiscard": 0, + "splice": 0 + } + + readsize = 0 + insertSize = 100000 + + uniqueInsertList = [] + multiInsertList = [] + spliceInsertList = [] + + processedEntryDict = {} + uniqueReadDict = {} + multiReadDict = {} + spliceReadDict = {} + + samFileIterator = samfile.fetch(until_eof=True) + + for read in samFileIterator: + if read.is_unmapped: + countReads["unmapped"] += 1 + continue + + if readsize == 0: + take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip) + readsize = sum([length for op,length in read.cigar if op in take]) + if init: + rds.insertMetadata([("readsize", readsize)]) + + #Build the read dictionaries + try: + readSequence = read.seq + except KeyError: + readSequence = "" + + pairReadSuffix = getPairedReadNumberSuffix(read) + readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix) + if trimReadID: + rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix) + else: + rdsEntryName = read.qname + + if processedEntryDict.has_key(readName): + if isSpliceEntry(read.cigar): + if spliceReadDict.has_key(readName): + del spliceReadDict[readName] + else: + if uniqueReadDict.has_key(readName): + del uniqueReadDict[readName] + + if multiReadDict.has_key(readName): + (read, priorCount, rdsEntryName) = multiReadDict[readName] + count = priorCount + 1 + multiReadDict[readName] = (read, count, rdsEntryName) + else: + multiReadDict[readName] = (read, 1, rdsEntryName) + else: + processedEntryDict[readName] = "" + if isSpliceEntry(read.cigar): + spliceReadDict[readName] = (read,rdsEntryName) + else: + uniqueReadDict[readName] = (read, rdsEntryName) + + if countReads["total"] % insertSize == 0: + for entry in uniqueReadDict.keys(): + (readData, rdsEntryName) = uniqueReadDict[entry] + chrom = samfile.getrname(readData.rname) + uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize)) + countReads["unique"] += 1 + + for entry in spliceReadDict.keys(): + (readData, rdsEntryName) = spliceReadDict[entry] + chrom = samfile.getrname(readData.rname) + spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize)) + countReads["splice"] += 1 + + for entry in multiReadDict.keys(): + (readData, count, rdsEntryName) = multiReadDict[entry] + chrom = samfile.getrname(readData.rname) + if count > maxMultiReadCount: + countReads["multiDiscard"] += 1 + else: + 
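+                        # accepted multiread: getRDSEntry records it with weight 1.0/count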
multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) + countReads["multi"] += 1 + + rds.insertUniqs(uniqueInsertList) + rds.insertMulti(multiInsertList) + uniqueInsertList = [] + uniqueReadDict = {} + multiInsertList = [] + multiReadDict = {} + if dataType == "RNA": + rds.insertSplices(spliceInsertList) + spliceInsertList = [] + spliceReadDict = {} + + print ".", + sys.stdout.flush() + processedEntryDict = {} + + countReads["total"] += 1 + + if len(uniqueReadDict.keys()) > 0: + for entry in uniqueReadDict.keys(): + (readData, rdsEntryName) = uniqueReadDict[entry] + chrom = samfile.getrname(readData.rname) + uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize)) + countReads["unique"] += 1 + + rds.insertUniqs(uniqueInsertList) + + if len(multiReadDict.keys()) > 0: + for entry in multiReadDict.keys(): + (readData, count, rdsEntryName) = multiReadDict[entry] + chrom = samfile.getrname(readData.rname) + if count > maxMultiReadCount: + countReads["multiDiscard"] += 1 + else: + multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) + countReads["multi"] += 1 + + countReads["multi"] += len(multiInsertList) + + if len(spliceReadDict.keys()) > 0 and dataType == "RNA": + for entry in spliceReadDict.keys(): + (readData, rdsEntryName) = spliceReadDict[entry] + chrom = samfile.getrname(readData.rname) + spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize)) + countReads["splice"] += 1 + + rds.insertSplices(spliceInsertList) + + countString = "\n%d unmapped reads discarded" % countReads["unmapped"] + countString += "\t%d unique reads" % countReads["unique"] + countString += "\t%d multi reads" % countReads["multi"] + countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount) + if dataType == "RNA": + countString += "\t%d spliced reads" % countReads["splice"] + + print countString.replace("\t", "\n") + + writeLog("%s.log" % outDbName, verstring, countString) + + if doIndex: + print "building index...." 
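+        # use the larger user-supplied cache for index building when it exceeds the default page cache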
+ if cachePages > defaultCacheSize: + rds.setDBcache(cachePages) + rds.buildIndex(cachePages) + else: + rds.buildIndex(defaultCacheSize) + + +def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1): + start = int(alignedRead.pos) + stop = int(start+readSize) + sense = getReadSense(alignedRead.is_reverse) + try: + mismatchTag = alignedRead.opt("MD") + mismatches = getMismatches(mismatchTag, alignedRead.seq, sense) + except KeyError: + mismatches = "" + + return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches) + + +def getRDSSpliceEntry(alignedRead, readName, chrom, readSize): + (readName, chrom, start, stop, sense, weight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize) + startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar) + + return (readName, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches) + + +def getPairedReadNumberSuffix(read): + readSuffix = "" + if not isPairedRead(read): + return "" + + if read.is_read1: + readSuffix = "/1" + elif read.is_read2: + readSuffix = "/2" + + return readSuffix + + +def isPairedRead(read): + return read.is_proper_pair and (read.is_read1 or read.is_read2) + + +def isSpliceEntry(cigarTupleList): + isSplice = False + for operation,length in cigarTupleList: + if operation == 3: + isSplice = True + break + + return isSplice + + +def getReadSense(reverse): + if reverse: + sense = "-" + else: + sense = "+" + + return sense + + +def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False): + output = [] + deletionMarker = "^" + position = 0 + + lengths = re.findall("\d+", mismatchTag) + mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag) + + for mismatchEntry in range(len(mismatchSequences)): + mismatch = mismatchSequences[mismatchEntry] + position = position + int(lengths[mismatchEntry]) + if string.find(mismatch, deletionMarker) == 0: + continue + + try: + if querySequence: + genomicNucleotide = querySequence[position] + else: + genomicNucleotide = "N" + + if sense == "-": + mismatch = getComplementNucleotide(mismatch) + genomicNucleotide = getComplementNucleotide(genomicNucleotide) + + elandCompatiblePosition = int(position + 1) + output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide)) + position += 1 + except IndexError: + if logErrors: + errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position) + writeLog("MakeRdsFromBamError.log", "1.0", errorMessage) + + return "" + + return string.join(output, ",") + + +def getComplementNucleotide(nucleotide): + complement = {"A": "T", + "T": "A", + "C": "G", + "G": "C", + "N": "N" + } + + return complement[nucleotide] + + +def getSpliceBounds(start, readsize, cigarTupleList): + stopR = int(start + readsize) + offset = 0 + + for operation,length in cigarTupleList: + if operation == 3: + stopL = int(start + offset) + startR = int(stopL + length) + + return start, startR, stopL, stopR + else: + offset += length + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/ReadDataset.py b/ReadDataset.py new file mode 100644 index 0000000..ef80d65 --- /dev/null +++ b/ReadDataset.py @@ -0,0 +1,1246 @@ +""" +Created on Jul 1, 2010 + +@author: sau +""" + +import sqlite3 as sqlite +import string +import tempfile +import shutil +import os +from os import environ +from array import array +from commoncode import getReverseComplement + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = 
environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + +tempfile.tempdir = cisTemp +currentRDSVersion = "1.1" + + +class ReadDatasetError(Exception): + pass + + +class ReadDataset(): + """ Class for storing reads from experiments. Assumes that custom scripts + will translate incoming data into a format that can be inserted into the + class using the insert* methods. Default class subtype ('DNA') includes + tables for unique and multireads, whereas 'RNA' subtype also includes a + splices table. + """ + + def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, + cache=False, reportCount=True): + """ creates an rds datafile if initialize is set to true, otherwise + will append to existing tables. datasetType can be either 'DNA' or 'RNA'. + """ + self.dbcon = "" + self.memcon = "" + self.dataType = "" + self.rdsVersion = currentRDSVersion + self.memBacked = False + self.memChrom = "" + self.memCursor = "" + self.cachedDBFile = "" + + if cache: + if verbose: + print "caching ...." + + self.cacheDB(datafile) + dbFile = self.cachedDBFile + else: + dbFile = datafile + + self.dbcon = sqlite.connect(dbFile) + self.dbcon.row_factory = sqlite.Row + self.dbcon.execute("PRAGMA temp_store = MEMORY") + if initialize: + if datasetType not in ["DNA", "RNA"]: + raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'") + else: + self.dataType = datasetType + + self.initializeTables(self.dbcon) + else: + metadata = self.getMetadata("dataType") + self.dataType = metadata["dataType"] + + try: + metadata = self.getMetadata("rdsVersion") + self.rdsVersion = metadata["rdsVersion"] + except: + try: + self.insertMetadata([("rdsVersion", float(currentRDSVersion))]) + except IOError: + print "could not add rdsVersion - read-only ?" + self.rdsVersion = "pre-1.0" + + if verbose: + if initialize: + print "INITIALIZED dataset %s" % datafile + else: + print "dataset %s" % datafile + + metadata = self.getMetadata() + print "metadata:" + pnameList = metadata.keys() + pnameList.sort() + for pname in pnameList: + print "\t" + pname + "\t" + metadata[pname] + + if reportCount: + ucount = self.getUniqsCount() + mcount = self.getMultiCount() + if self.dataType == "DNA" and not initialize: + try: + print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount)) + except ValueError: + print "\n%s unique reads and %s multireads" % (ucount, mcount) + elif self.dataType == "RNA" and not initialize: + scount = self.getSplicesCount() + try: + print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount)) + except ValueError: + print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount) + + print "default cache size is %d pages" % self.getDefaultCacheSize() + if self.hasIndex(): + print "found index" + else: + print "not indexed" + + + def __len__(self): + """ return the number of usable reads in the dataset. + """ + total = self.getUniqsCount() + total += self.getMultiCount() + + if self.dataType == "RNA": + total += self.getSplicesCount() + + total = int(total) + + return total + + + def __del__(self): + """ cleanup copy in local cache, if present. + """ + if self.cachedDBFile != "": + self.uncacheDB() + + + def cacheDB(self, filename): + """ copy geneinfoDB to a local cache. + """ + self.cachedDBFile = "%s.db" % tempfile.mktemp() + shutil.copyfile(filename, self.cachedDBFile) + + + def saveCacheDB(self, filename): + """ copy geneinfoDB to a local cache. 
+ """ + shutil.copyfile(self.cachedDBFile, filename) + + + def uncacheDB(self): + """ delete geneinfoDB from local cache. + """ + global cachedDBFile + if self.cachedDBFile != "": + try: + os.remove(self.cachedDBFile) + except: + print "could not delete %s" % self.cachedDBFile + + self.cachedDB = "" + + + def attachDB(self, filename, asname): + """ attach another database file to the readDataset. + """ + stmt = "attach '%s' as %s" % (filename, asname) + self.execute(stmt) + + + def detachDB(self, asname): + """ detach a database file to the readDataset. + """ + stmt = "detach %s" % (asname) + self.execute(stmt) + + + def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""): + """ import into current RDS the table (with columns destcolumns, + with default all columns) from the database file asname, + using the column specification of ascolumns (default all). + """ + stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table) + if flagged != "": + stmt += " where flag = '%s' " % flagged + + self.executeCommit(stmt) + + + def getTables(self, asname=""): + """ get a list of table names in a particular database file. + """ + resultList = [] + sql = self.getSqlCursor() + + if asname != "": + asname += "." + + stmt = "select name from %ssqlite_master where type='table'" % asname + sql.execute(stmt) + results = sql.fetchall() + + for row in results: + resultList.append(row["name"]) + + return resultList + + + def getSqlCursor(self): + if self.memBacked: + sql = self.getMemCursor() + else: + sql = self.getFileCursor() + + return sql + + + def hasIndex(self): + """ check whether the RDS file has at least one index. + """ + stmt = "select count(*) from sqlite_master where type='index'" + count = int(self.execute(stmt, returnResults=True)[0][0]) + if count > 0: + return True + + return False + + + def initializeTables(self, dbConnection, cache=100000): + """ creates table schema in a database connection, which is + typically a database file or an in-memory database. + """ + dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache) + dbConnection.execute("create table metadata (name varchar, value varchar)") + dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType) + positionSchema = "start int, stop int" + tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema + dbConnection.execute("create table uniqs %s" % tableSchema) + dbConnection.execute("create table multi %s" % tableSchema) + if self.dataType == "RNA": + positionSchema = "startL int, stopL int, startR int, stopR int" + tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema + dbConnection.execute("create table splices %s" % tableSchema) + + dbConnection.commit() + + + def getFileCursor(self): + """ returns a cursor to file database for low-level (SQL) + access to the data. + """ + return self.dbcon.cursor() + + + def getMemCursor(self): + """ returns a cursor to memory database for low-level (SQL) + access to the data. + """ + return self.memcon.cursor() + + + def getMetadata(self, valueName=""): + """ returns a dictionary of metadata. 
+ """ + whereClause = "" + resultsDict = {} + + if valueName != "": + whereClause = " where name='%s'" % valueName + + sql = self.getSqlCursor() + + sql.execute("select name, value from metadata %s" % whereClause) + results = sql.fetchall() + + for row in results: + parameterName = row["name"] + parameterValue = row["value"] + if parameterName not in resultsDict: + resultsDict[parameterName] = parameterValue + else: + trying = True + index = 2 + while trying: + newName = string.join([parameterName, str(index)], ":") + if newName not in resultsDict: + resultsDict[newName] = parameterValue + trying = False + + index += 1 + + return resultsDict + + + def getReadSize(self): + """ returns readsize if defined in metadata. + """ + metadata = self.getMetadata() + if "readsize" not in metadata: + raise ReadDatasetError("no readsize parameter defined") + else: + mysize = metadata["readsize"] + if "import" in mysize: + mysize = mysize.split()[0] + + return int(mysize) + + + def getDefaultCacheSize(self): + """ returns the default cache size. + """ + return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0]) + + + def getChromosomes(self, table="uniqs", fullChrom=True): + """ returns a list of distinct chromosomes in table. + """ + statement = "select distinct chrom from %s" % table + sql = self.getSqlCursor() + + sql.execute(statement) + results = [] + for row in sql: + if fullChrom: + if row["chrom"] not in results: + results.append(row["chrom"]) + else: + if len(row["chrom"][3:].strip()) < 1: + continue + + if row["chrom"][3:] not in results: + results.append(row["chrom"][3:]) + + results.sort() + + return results + + + def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True, + doMulti=False, doSplices=False): + """ returns the maximum coordinate for reads on a given chromosome. + """ + maxCoord = 0 + sql = self.getSqlCursor() + + if doUniqs: + try: + sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom) + maxCoord = int(sql.fetchall()[0][0]) + except: + print "couldn't retrieve coordMax for chromosome %s" % chrom + + if doSplices: + sql.execute("select max(startR) from splices where chrom = '%s'" % chrom) + try: + spliceMax = int(sql.fetchall()[0][0]) + if spliceMax > maxCoord: + maxCoord = spliceMax + except: + pass + + if doMulti: + sql.execute("select max(start) from multi where chrom = '%s'" % chrom) + try: + multiMax = int(sql.fetchall()[0][0]) + if multiMax > maxCoord: + maxCoord = multiMax + except: + pass + + if verbose: + print "%s maxCoord: %d" % (chrom, maxCoord) + + return maxCoord + + + def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="", + flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False, + withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False, + readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False, + flagLike=False, strand='', combine5p=False): + """ returns a dictionary of reads in a variety of formats + and which can be restricted by chromosome or custom-flag. + Returns unique reads by default, but can return multireads + with doMulti set to True. 
+ """ + whereClause = [] + resultsDict = {} + + if chrom != "" and chrom != self.memChrom: + whereClause.append("chrom = '%s'" % chrom) + + if flag != "": + if flagLike: + flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") + whereClause.append(flagLikeClause) + else: + whereClause.append("flag = '%s'" % flag) + + if start > -1: + whereClause.append("start > %d" % start) + + if stop > -1: + whereClause.append("stop < %d" % stop) + + if len(readLike) > 0: + readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") + whereClause.append(readIDClause) + + if hasMismatch: + whereClause.append("mismatch != ''") + + if strand in ["+", "-"]: + whereClause.append("sense = '%s'" % strand) + + if len(whereClause) > 0: + whereStatement = string.join(whereClause, " and ") + whereQuery = "where %s" % whereStatement + else: + whereQuery = "" + + groupBy = [] + if findallOptimize: + selectClause = ["select start, sense, sum(weight)"] + groupBy = ["GROUP BY start, sense"] + else: + selectClause = ["select ID, chrom, start, readID"] + if bothEnds: + selectClause.append("stop") + + if not noSense: + selectClause.append("sense") + + if withWeight: + selectClause.append("weight") + + if withFlag: + selectClause.append("flag") + + if withMismatch: + selectClause.append("mismatch") + + if limit > 0 and not combine5p: + groupBy.append("LIMIT %d" % limit) + + selectQuery = string.join(selectClause, ",") + groupQuery = string.join(groupBy) + if doUniqs: + stmt = [selectQuery, "from uniqs", whereQuery, groupQuery] + if doMulti: + stmt.append("UNION ALL") + stmt.append(selectQuery) + stmt.append("from multi") + stmt.append(whereQuery) + stmt.append(groupQuery) + else: + stmt = [selectQuery, "from multi", whereQuery] + + if combine5p: + if findallOptimize: + selectQuery = "select start, sense, weight, chrom" + + if doUniqs: + subSelect = [selectQuery, "from uniqs", whereQuery] + if doMulti: + subSelect.append("union all") + subSelect.append(selectQuery) + subSelect.append("from multi") + subSelect.append(whereQuery) + else: + subSelect = [selectQuery, "from multi", whereQuery] + + sqlStmt = string.join(subSelect) + if findallOptimize: + selectQuery = "select start, sense, sum(weight)" + + stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union", + selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"] + + if findallOptimize: + if self.memBacked: + self.memcon.row_factory = None + sql = self.memcon.cursor() + else: + self.dbcon.row_factory = None + sql = self.dbcon.cursor() + + stmt.append("order by start") + elif readIDDict: + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + stmt.append("order by readID, start") + else: + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + stmt.append("order by chrom, start") + + sqlQuery = string.join(stmt) + sql.execute(sqlQuery) + + if findallOptimize: + resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql] + if self.memBacked: + self.memcon.row_factory = sqlite.Row + else: + self.dbcon.row_factory = sqlite.Row + else: + currentChrom = "" + currentReadID = "" + pairID = 0 + for row in sql: + readID = row["readID"] + if fullChrom: + chrom = row["chrom"] + else: + chrom = row["chrom"][3:] + + if not readIDDict and chrom != currentChrom: + resultsDict[chrom] = [] + currentChrom = chrom + dictKey = chrom + elif readIDDict: + theReadID 
= readID + if "::" in readID: + theReadID = readID.split("::")[0] + + if "/" in theReadID and withPairID: + (theReadID, pairID) = readID.split("/") + + if theReadID != currentReadID: + resultsDict[theReadID] = [] + currentReadID = theReadID + dictKey = theReadID + + newrow = {"start": int(row["start"])} + if bothEnds: + newrow["stop"] = int(row["stop"]) + + if not noSense: + newrow["sense"] = row["sense"] + + if withWeight: + newrow["weight"] = float(row["weight"]) + + if withFlag: + newrow["flag"] = row["flag"] + + if withMismatch: + newrow["mismatch"] = row["mismatch"] + + if withID: + newrow["readID"] = readID + + if withChrom: + newrow["chrom"] = chrom + + if withPairID: + newrow["pairID"] = pairID + + resultsDict[dictKey].append(newrow) + + return resultsDict + + + def getSplicesDict(self, noSense=False, fullChrom=False, chrom="", + flag="", withWeight=False, withFlag=False, withMismatch=False, + withID=False, withChrom=False, withPairID=False, readIDDict=False, + splitRead=False, hasMismatch=False, flagLike=False, start=-1, + stop=-1, strand=""): + """ returns a dictionary of spliced reads in a variety of + formats and which can be restricted by chromosome or custom-flag. + Returns unique spliced reads for now. + """ + whereClause = [] + resultsDict = {} + + if chrom != "" and chrom != self.memChrom: + whereClause = ["chrom = '%s'" % chrom] + + if flag != "": + if flagLike: + flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") + whereClause.append(flagLikeClause) + else: + whereClause.append("flag = '%s'" % flag) + + if hasMismatch: + whereClause.append("mismatch != ''") + + if strand != "": + whereClause.append("sense = '%s'" % strand) + + if start > -1: + whereClause.append("startL > %d" % start) + + if stop > -1: + whereClause.append("stopR < %d" % stop) + + if len(whereClause) > 0: + whereStatement = string.join(whereClause, " and ") + whereQuery = "where %s" % whereStatement + else: + whereQuery = "" + + selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"] + if not noSense: + selectClause.append("sense") + + if withWeight: + selectClause.append("weight") + + if withFlag: + selectClause.append("flag") + + if withMismatch: + selectClause.append("mismatch") + + selectQuery = string.join(selectClause, " ,") + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery) + sql.execute(stmt) + currentReadID = "" + currentChrom = "" + for row in sql: + pairID = 0 + readID = row["readID"] + if fullChrom: + chrom = row["chrom"] + else: + chrom = row["chrom"][3:] + + if not readIDDict and chrom != currentChrom: + resultsDict[chrom] = [] + currentChrom = chrom + dictKey = chrom + elif readIDDict: + if "/" in readID: + (theReadID, pairID) = readID.split("/") + else: + theReadID = readID + + if theReadID != currentReadID: + resultsDict[theReadID] = [] + currentReadID = theReadID + dictKey = theReadID + + newrow = {"startL": int(row["startL"])} + newrow["stopL"] = int(row["stopL"]) + newrow["startR"] = int(row["startR"]) + newrow["stopR"] = int(row["stopR"]) + if not noSense: + newrow["sense"] = row["sense"] + + if withWeight: + newrow["weight"] = float(row["weight"]) + + if withFlag: + newrow["flag"] = row["flag"] + + if withMismatch: + newrow["mismatch"] = row["mismatch"] + + if withID: + newrow["readID"] = readID + + if withChrom: + newrow["chrom"] = chrom + + if withPairID: + newrow["pairID"] = pairID + + if splitRead: + leftDict = newrow.copy() + del 
leftDict["startR"] + del leftDict["stopR"] + rightDict = newrow + del rightDict["startL"] + del rightDict["stopL"] + resultsDict[dictKey].append(leftDict) + resultsDict[dictKey].append(rightDict) + else: + resultsDict[dictKey].append(newrow) + + return resultsDict + + + def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False, + splices=False, reportCombined=True, sense="both"): + """ return read counts for a given region. + """ + ucount = 0 + mcount = 0 + scount = 0 + restrict = "" + if sense in ["+", "-"]: + restrict = " sense ='%s' " % sense + + if uniqs: + try: + ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict)) + except: + ucount = 0 + + if multi: + try: + mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict)) + except: + mcount = 0 + + if splices: + try: + scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict)) + except: + scount = 0 + + if reportCombined: + total = ucount + mcount + scount + return total + else: + return (ucount, mcount, scount) + + + def getTotalCounts(self, chrom="", rmin="", rmax=""): + """ return read counts for a given region. + """ + return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both") + + + def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"): + """ returns the number of row in the uniqs table. + """ + whereClause = [] + count = 0 + + if chrom !="" and chrom != self.memChrom: + whereClause = ["chrom='%s'" % chrom] + + if rmin != "": + whereClause.append("%s >= %s" % (startField, str(rmin))) + + if rmax != "": + whereClause.append("%s <= %s" % (startField, str(rmax))) + + if restrict != "": + whereClause.append(restrict) + + if len(whereClause) > 0: + whereStatement = string.join(whereClause, " and ") + whereQuery = "where %s" % whereStatement + else: + whereQuery = "" + + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + if distinct: + sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery)) + else: + sql.execute("select sum(weight) from %s %s" % (table, whereQuery)) + + result = sql.fetchone() + + try: + count = int(result[0]) + except: + count = 0 + + return count + + + def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): + """ returns the number of row in the splices table. + """ + return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL") + + + def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): + """ returns the number of distinct readIDs in the uniqs table. + """ + return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct) + + + def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): + """ returns the total weight of readIDs in the multi table. + """ + return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct) + + + def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1): + """ get readID's. 
+ """ + stmt = [] + limitPart = "" + if limit > 0: + limitPart = "LIMIT %d" % limit + + if uniqs: + stmt.append("select readID from uniqs") + + if multi: + stmt.append("select readID from multi") + + if splices: + stmt.append("select readID from splices") + + if len(stmt) > 0: + selectPart = string.join(stmt, " union ") + else: + selectPart = "" + + sqlQuery = "%s group by readID %s" % (selectPart, limitPart) + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + sql.execute(sqlQuery) + result = sql.fetchall() + + if paired: + return [x[0].split("/")[0] for x in result] + else: + return [x[0] for x in result] + + + def getMismatches(self, mischrom=None, verbose=False, useSplices=True): + """ returns the uniq and spliced mismatches in a dictionary. + """ + readlen = self.getReadSize() + if mischrom: + hitChromList = [mischrom] + else: + hitChromList = self.getChromosomes() + hitChromList.sort() + + snpDict = {} + for achrom in hitChromList: + if verbose: + print "getting mismatches from chromosome %s" % (achrom) + + snpDict[achrom] = [] + hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True) + if useSplices and self.dataType == "RNA": + spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True) + spliceIDList = spliceDict.keys() + for k in spliceIDList: + spliceEntry = spliceDict[k][0] + startpos = spliceEntry["startL"] + lefthalf = spliceEntry["stopL"] + rightstart = spliceEntry["startR"] + sense = spliceEntry["sense"] + mismatches = spliceEntry["mismatch"] + spMismatchList = mismatches.split(",") + for mismatch in spMismatchList: + if "N" in mismatch: + continue + + change_len = len(mismatch) + if sense == "+": + change_from = mismatch[0] + change_base = mismatch[change_len-1] + change_pos = int(mismatch[1:change_len-1]) + elif sense == "-": + change_from = getReverseComplement([mismatch[0]]) + change_base = getReverseComplement([mismatch[change_len-1]]) + change_pos = readlen - int(mismatch[1:change_len-1]) + 1 + + firsthalf = int(lefthalf)-int(startpos)+1 + secondhalf = 0 + if int(change_pos) <= int(firsthalf): + change_at = startpos + change_pos - 1 + else: + secondhalf = change_pos - firsthalf + change_at = rightstart + secondhalf + + snpDict[achrom].append([startpos, change_at, change_base, change_from]) + + if achrom not in hitDict.keys(): + continue + + for readEntry in hitDict[achrom]: + start = readEntry["start"] + sense = readEntry["sense"] + mismatches = readEntry["mismatch"] + mismatchList = mismatches.split(",") + for mismatch in mismatchList: + if "N" in mismatch: + continue + + change_len = len(mismatch) + if sense == "+": + change_from = mismatch[0] + change_base = mismatch[change_len-1] + change_pos = int(mismatch[1:change_len-1]) + elif sense == "-": + change_from = getReverseComplement([mismatch[0]]) + change_base = getReverseComplement([mismatch[change_len-1]]) + change_pos = readlen - int(mismatch[1:change_len-1]) + 1 + + change_at = start + change_pos - 1 + snpDict[achrom].append([start, change_at, change_base, change_from]) + + return snpDict + + + def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True, + useSplices=False, normalizationFactor = 1.0, trackStrand=False, + keepStrand="both", shiftValue=0): + """return a profile of the chromosome as an array of per-base read coverage.... + keepStrand = 'both', 'plusOnly', or 'minusOnly'. 
+ Will also shift position of unique and multireads (but not splices) if shift is a natural number + """ + metadata = self.getMetadata() + try: + readlen = int(metadata["readsize"]) + except KeyError: + readlen = 0 + + dataType = metadata["dataType"] + scale = 1. / normalizationFactor + shift = {} + shift['+'] = int(shiftValue) + shift['-'] = -1 * int(shiftValue) + + if cstop > 0: + lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen + else: + lastNT = cstop - cstart + readlen + shift["+"] + + chromModel = array("f",[0.] * lastNT) + hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True) + if cstart < 0: + cstart = 0 + + for readEntry in hitDict[chromosome]: + hstart = readEntry["start"] + sense = readEntry ["sense"] + weight = readEntry["weight"] + hstart = hstart - cstart + shift[sense] + for currentpos in range(hstart,hstart+readlen): + try: + if not trackStrand or (sense == "+" and keepStrand != "minusOnly"): + chromModel[currentpos] += scale * weight + elif sense == "-" and keepStrand != "plusOnly": + chromModel[currentpos] -= scale * weight + except: + continue + + del hitDict + if useSplices and dataType == "RNA": + if cstop > 0: + spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop) + else: + spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True) + + if chromosome in spliceDict: + for spliceEntry in spliceDict[chromosome]: + Lstart = spliceEntry["startL"] + Lstop = spliceEntry["stopL"] + Rstart = spliceEntry["startR"] + Rstop = spliceEntry["stopR"] + rsense = spliceEntry["sense"] + if (Rstop - cstart) < lastNT: + for index in range(abs(Lstop - Lstart)): + currentpos = Lstart - cstart + index + # we only track unique splices + if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"): + chromModel[currentpos] += scale + elif rsense == "-" and keepStrand != "plusOnly": + chromModel[currentpos] -= scale + + for index in range(abs(Rstop - Rstart)): + currentpos = Rstart - cstart + index + # we only track unique splices + if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"): + chromModel[currentpos] += scale + elif rsense == "-" and keepStrand != "plusOnly": + chromModel[currentpos] -= scale + + del spliceDict + + return chromModel + + + def insertMetadata(self, valuesList): + """ inserts a list of (pname, pvalue) into the metadata + table. + """ + self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList) + self.dbcon.commit() + + + def updateMetadata(self, pname, newValue, originalValue=""): + """ update a metadata field given the original value and the new value. + """ + stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname) + if originalValue != "": + stmt += " and value='%s' " % str(originalValue) + + self.dbcon.execute(stmt) + self.dbcon.commit() + + + def insertUniqs(self, valuesList): + """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) + into the uniqs table. + """ + self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + + def insertMulti(self, valuesList): + """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) + into the multi table. 
+ """ + self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + + def insertSplices(self, valuesList): + """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) + into the splices table. + """ + self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + + def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"): + """ update reads on file database in a list region of regions for a chromosome to have a new flag. + regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with + sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense). + """ + restrict = "" + if sense != "both": + restrict = " and sense = ? " + + if uniqs: + self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList) + + if multi: + self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList) + + if self.dataType == "RNA" and splices: + self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList) + self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList) + + self.dbcon.commit() + + + def setFlags(self, flag, uniqs=True, multi=True, splices=True): + """ set the flag fields in the entire dataset. + """ + if uniqs: + self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag) + + if multi: + self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag) + + if self.dataType == "RNA" and splices: + self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag) + + self.dbcon.commit() + + + def resetFlags(self, uniqs=True, multi=True, splices=True): + """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch. + """ + self.setFlags("", uniqs, multi, splices) + + + def reweighMultireads(self, readList): + self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList) + + + def setSynchronousPragma(self, value="ON"): + try: + self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value) + except: + print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value + + + def setDBcache(self, cache, default=False): + self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache) + if default: + self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache) + + + def execute(self, statement, returnResults=False): + sql = self.getSqlCursor() + + sql.execute(statement) + if returnResults: + result = sql.fetchall() + return result + + + def executeCommit(self, statement): + self.execute(statement) + + if self.memBacked: + self.memcon.commit() + else: + self.dbcon.commit() + + + def buildIndex(self, cache=100000): + """ Builds the file indeces for the main tables. + Cache is the number of 1.5 kb pages to keep in memory. + 100000 pages translates into 150MB of RAM, which is our default. 
+ """ + if cache > self.getDefaultCacheSize(): + self.setDBcache(cache) + self.setSynchronousPragma("OFF") + self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") + print "built uPosIndex" + self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)") + print "built uChromIndex" + self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") + print "built mPosIndex" + self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)") + print "built mChromIndex" + + if self.dataType == "RNA": + self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)") + print "built sPosIndex" + self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)") + print "built sPosIndex2" + self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)") + print "built sChromIndex" + + self.dbcon.commit() + self.setSynchronousPragma("ON") + + + def dropIndex(self): + """ drops the file indices for the main tables. + """ + try: + self.setSynchronousPragma("OFF") + self.dbcon.execute("DROP INDEX uPosIndex") + self.dbcon.execute("DROP INDEX uChromIndex") + self.dbcon.execute("DROP INDEX mPosIndex") + self.dbcon.execute("DROP INDEX mChromIndex") + + if self.dataType == "RNA": + self.dbcon.execute("DROP INDEX sPosIndex") + try: + self.dbcon.execute("DROP INDEX sPosIndex2") + except: + pass + + self.dbcon.execute("DROP INDEX sChromIndex") + + self.dbcon.commit() + except: + print "problem dropping index" + + self.setSynchronousPragma("ON") + + + def memSync(self, chrom="", index=False): + """ makes a copy of the dataset into memory for faster access. + Can be restricted to a "full" chromosome. Can also build the + memory indices. + """ + self.memcon = "" + self.memcon = sqlite.connect(":memory:") + self.initializeTables(self.memcon) + cursor = self.dbcon.cursor() + whereclause = "" + if chrom != "": + print "memSync %s" % chrom + whereclause = " where chrom = '%s' " % chrom + self.memChrom = chrom + else: + self.memChrom = "" + + self.memcon.execute("PRAGMA temp_store = MEMORY") + self.memcon.execute("PRAGMA CACHE_SIZE = 1000000") + # copy metadata to memory + self.memcon.execute("delete from metadata") + results = cursor.execute("select name, value from metadata") + results2 = [] + for row in results: + results2.append((row["name"], row["value"])) + + self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2) + + self.copyDBEntriesToMemory("uniqs", whereclause) + self.copyDBEntriesToMemory("multi", whereclause) + if self.dataType == "RNA": + self.copySpliceDBEntriesToMemory(whereclause) + + if index: + if chrom != "": + self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)") + self.memcon.execute("CREATE INDEX mPosIndex on multi(start)") + if self.dataType == "RNA": + self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)") + self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)") + else: + self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") + self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") + if self.dataType == "RNA": + self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)") + self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)") + + self.memBacked = True + self.memcon.row_factory = sqlite.Row + self.memcon.commit() + + + def copyDBEntriesToMemory(self, dbName, whereClause=""): + cursor = self.dbcon.cursor() + sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause)) + 
destinationEntries = [] + for row in sourceEntries: + destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) + + self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries) + + + def copySpliceDBEntriesToMemory(self, whereClause=""): + cursor = self.dbcon.cursor() + sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause) + destinationEntries = [] + for row in sourceEntries: + destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], + row["weight"], row["flag"], row["mismatch"])) + + self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries) + diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/altSpliceCounts.py b/altSpliceCounts.py new file mode 100755 index 0000000..1517ef8 --- /dev/null +++ b/altSpliceCounts.py @@ -0,0 +1,152 @@ +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +print 'version 3.6' + +import sys, optparse +from commoncode import readDataset + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s rdsfile outfilename [--cache pages]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--cache", type="int", dest="numCachePages", + help="number of cache pages to use [default: 100000]") + parser.set_defaults(numCachePages=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) + + hitfile = args[0] + outfilename = args[1] + + if options.numCachePages is not None: + doCache = True + cachePages = options.numCachePages + else: + doCache = False + cachePages = 100000 + + altSpliceCounts(hitfile, outfilename, doCache, cachePages) + + +def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000): + startDict = {} + stopDict = {} + resultDict = {} + + hitRDS = readDataset(hitfile, verbose = True, cache=doCache) + if cachePages > hitRDS.getDefaultCacheSize(): + hitRDS.setDBcache(cachePages) + + readlen = hitRDS.getReadSize() + hitDict = hitRDS.getSplicesDict(noSense=True) + outfile = open(outfilename,'w') + + for chrom in hitDict: + startDict[chrom] = [] + stopDict[chrom] = [] + resultDict[chrom] = [] + + index = 0 + for chrom in hitDict: + for (tagStart, lstop, rstart, tagStop) in hitDict[chrom]: + index += 1 + length = tagStop - tagStart + if length < readlen + 5: + continue + + startDict[chrom].append((tagStart, length)) + stopDict[chrom].append((tagStop, length)) + + startDict[chrom].sort() + stopDict[chrom].sort() + + spliceEvent = 0 + altSpliceEvent = 0 + alternative = 1 + for chrom in startDict: + firstIndex = 0 + maxIndex = len(startDict[chrom]) + while firstIndex < maxIndex: + (fstart, flen) = startDict[chrom][firstIndex] + (start, length) = (fstart, flen) + secondIndex = firstIndex + secondLengths = [] + while (start - fstart) < readlen: + if secondIndex >= maxIndex: + break + + (start, length) = startDict[chrom][secondIndex] + if (start - fstart) < readlen and abs(length - flen) > readlen: + line = (chrom, fstart, fstart + flen, chrom, start, start + length) + alreadySeen = False + for slength in 
secondLengths: + if abs(slength - length) < readlen: + alreadySeen = True + + if len(resultDict[chrom]) == 0: + resultDict[chrom].append(line) + elif line != resultDict[chrom][-1] and not alreadySeen: + resultDict[chrom].append(line) + secondLengths.append(length) + altSpliceEvent += 1 + spliceEvent += 1 + + secondIndex += 1 + + firstIndex = secondIndex + spliceEvent += 1 + + firstIndex = 0 + maxIndex = len(stopDict[chrom]) + while firstIndex < maxIndex: + (fstop, flen) = stopDict[chrom][firstIndex] + (stop, length) = (fstop, flen) + secondIndex = firstIndex + secondLengths = [] + while (stop - fstop) < readlen: + if secondIndex >= maxIndex: + break + (stop, length) = stopDict[chrom][secondIndex] + if (stop - fstop) < readlen and abs(length - flen) > readlen: + line = (chrom, fstop - flen, fstop, chrom, stop - length, stop) + alreadySeen = False + for slength in secondLengths: + if abs(slength - length) < readlen: + alreadySeen = True + + if len(resultDict[chrom]) == 0: + resultDict[chrom].append(line) + + if line != resultDict[chrom][-1] and not alreadySeen: + resultDict[chrom].append(line) + secondLengths.append(length) + altSpliceEvent += 1 + spliceEvent += 1 + + secondIndex += 1 + + firstIndex = secondIndex + spliceEvent += 1 + + resultDict[chrom].sort() + for line in resultDict[chrom]: + outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n' % line) + alternative += 1 + + print chrom, maxIndex, spliceEvent, altSpliceEvent + + print spliceEvent, altSpliceEvent + outfile.close() + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/analyzego.py b/analyzego.py new file mode 100755 index 0000000..d4f9f6f --- /dev/null +++ b/analyzego.py @@ -0,0 +1,86 @@ +try: + import psyco + psyco.full() +except: + print "psyco not running" + +import sys, optparse +from cistematic.cisstat.analyzego import calculateGOStats +from cistematic.core.geneinfo import geneinfoDB + +print "version 2.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome infilename prefix [--geneName] [--field fieldID]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--geneName", action="store_true", dest="translateGene", + help="translate gene") + parser.add_option("--field", type="int", dest="fieldID", + help="column containing gene ID/Name") + parser.set_defaults(translateGene=False, fieldID=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + fieldID = 1 + if options.translateGene: + fieldID = 0 + + if options.fieldID is not None: + fieldID = options.fieldID + + genome = args[0] + infilename = args[1] + prefix = args[2] + + analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID) + + +def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1): + infile = open(infilename) + analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1) + infile.close() + + +def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1): + if translateGene: + idb = geneinfoDB(cache=True) + geneinfoDict = idb.getallGeneInfo(genome) + symbolToGidDict = {} + for gid in geneinfoDict: + symbol = geneinfoDict[gid][0][0].strip() + symbolToGidDict[symbol] = gid + + locusList = [] + for line in geneInfoList: + fields = line.split() + if translateGene: + gene = fields[fieldID] + if "LOC" in gene: + gID = gene[3:] + elif "FAR" in gene: + print "ignoring %s" % gene + continue + else: + try: + gID = symbolToGidDict[gene] + except 
KeyError: + print "ignoring %s" % gene + continue + else: + gID = fields[fieldID] + + if (genome, gID) not in locusList: + locusList.append((genome, gID)) + + if len(locusList) > 0: + calculateGOStats(locusList, prefix) + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/bedtoregion.py b/bedtoregion.py new file mode 100755 index 0000000..d6c44de --- /dev/null +++ b/bedtoregion.py @@ -0,0 +1,35 @@ +import sys, string + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s factorlabel bedinfilename regionoutfile" % sys.argv[0] + + if len(argv) < 4: + print usage + sys.exit(1) + + factor = argv[1] + infilename = argv[2] + outfilename = argv[3] + + bedToRegion(factor, infilename, outfilename) + + +def bedToRegion(factor, infilename, outfilename): + index = 1 + infile = open(infilename) + outfile = open(outfilename, 'w') + for line in infile: + if 'track' in line: + continue + fields = line.split() + line = string.join(fields, '\t') + outfile.write('%s%d\t%s\n' % (factor, index, line)) + index += 1 + infile.close() + outfile.close() + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/binstocdf.py b/binstocdf.py new file mode 100755 index 0000000..9381866 --- /dev/null +++ b/binstocdf.py @@ -0,0 +1,46 @@ +import sys + +print 'version 1.0' + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 2: + print 'usage: python %s infile outfile' % sys.argv[0] + sys.exit(1) + + infilename = argv[0] + outfilename = argv[1] + + binToCDF(infilename, outfilename) + + +def binToCDF(infilename, outfilename): + infile = open(infilename) + outfile = open(outfilename, 'w') + + for line in infile: + fields = line.strip().split() + if len(fields) < 4: + continue + + total = int(fields[2]) + if total == 0: + outfile.write(line) + continue + + outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3])) + cum = 0 + for bin in fields[4:]: + cum += int(bin) + percent = 100 * cum / total + outfile.write('\t%d' % percent) + + outfile.write('\n') + + infile.close() + outfile.close() + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/buildMatrix.py b/buildMatrix.py new file mode 100755 index 0000000..361f56e --- /dev/null +++ b/buildMatrix.py @@ -0,0 +1,120 @@ +# +# buildMatrix.py +# ENRAGE +# +# Created by Ali Mortazavi on 3/6/09. 
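+#
+# Appends one data column (e.g. RPKM values) per run to a growing matrix file, so a
+# full matrix is assembled by chaining invocations. A hypothetical two-step run
+# (file names are illustrative only) might look like:
+#   python buildMatrix.py matrix.step.0 sample1.part matrix.step.1
+#   python buildMatrix.py matrix.step.1 sample2.part matrix.step.2 --truncate 10000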
+# +import sys, string, optparse +from commoncode import writeLog + +versionString = "%prog: version 1.3" +print versionString + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog matrix.step.N-1 data.part matrix.step.N [--rescale] [--truncate maxRPKM] [--log altlogfile]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--rescale", action="store_true", dest="rescale") + parser.add_option("--truncate", type="int", dest="maxRPKM") + parser.add_option("--log", dest="logfilename") + parser.set_defaults(rescale=False, maxRPKM=None, logfilename="buildMatrix.log") + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(0) + + infile = args[0] + colfilename = args[1] + outfilename = args[2] + + if options.maxRPKM is not None: + truncateRPKM = True + maxRPKM = options.maxRPKM + else: + truncateRPKM = False + maxRPKM = 100000000 + + buildMatrix(infile, colfilename, outfilename, truncateRPKM, maxRPKM, + options.rescale, options.logfilename) + + +def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM, + maxRPKM=100000000, rescale=False, logfilename="buildMatrix.log"): + + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + + if "/" in colfilename: + colname = colfilename.split("/")[-1] + else: + colname = colfilename + + fileParts = colname.split(".") + colID = fileParts[0] + + infile = open(inFileName) + colfile = open(colfilename) + outfile = open(outfilename, "w") + header = infile.readline()[:-1] + if header.strip() == "": + header = "#\t" + + outfile.write( "%s\t%s\n" % (header, colID)) + + values = [] + min = 20000000000. + max = -1. + untruncatedMax = -1. + for line in colfile: + if doNotProcessLine(line): + continue + + fields = line.strip().split() + val = float(fields[-1]) + if truncateRPKM and val > maxRPKM: + if val > untruncatedMax: + untruncatedMax = val + + val = maxRPKM + + values.append(val) + if val < min: + min = val + + if val > max: + max = val + + range = max - min + if rescale: + finalValues = [(val - min)/range for val in values] + else: + finalValues = values + + for val in finalValues: + line = infile.readline().strip() + line += "\t%1.3f\n" % val + outfile.write(line) + + outfile.close() + + if untruncatedMax > 0: + max = untruncatedMax + + message = "max value in %s was %.2f" % (colname, max) + if untruncatedMax > 0: + message += " but was truncated to %d" % maxRPKM + + print message + writeLog(logfilename, versionString, message) + + +def doNotProcessLine(line): + return line[0] == "#" + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/buildrmaskdb.py b/buildrmaskdb.py new file mode 100755 index 0000000..d1d6b00 --- /dev/null +++ b/buildrmaskdb.py @@ -0,0 +1,61 @@ +try: + import psyco + psyco.full() +except: + pass + +import sys +import sqlite3 as sqlite +import os + +def main(argv=None): + if not argv: + argv = sys.argv + + print "version 2.0" + if len(argv) < 3: + print "usage: python %s rmaskdir rmaskdbfile" % argv[0] + exit(1) + + rmaskdir = argv[1] + rmaskdb = argv[2] + + buildrmaskdb(rmaskdir, rmaskdb) + + +def buildrmaskdb(rmaskdir, rmaskdb): + files = os.listdir(rmaskdir) + db = sqlite.connect(rmaskdb) + sql = db.cursor() + sql.execute("create table repeats (chrom varchar, start int, stop int, name varchar, family varchar)") + sql.execute("PRAGMA temp_store = MEMORY") + sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000") + db.commit() + + for filename in files: + if "rmsk" not in filename: + continue + + 
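+        # each annotation line is expected to be tab-delimited, with chrom/start/stop in
+        # columns 5-7 (leading "chr" stripped) and repeat name and family in columns 10 and 12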
print filename + infile = open(rmaskdir + "/" + filename) + for entry in infile: + fields = entry.strip().split("\t") + chrom = fields[5][3:] + start = int(fields[6]) + stop = int(fields[7]) + name = fields[10] + family = fields[12] + stmt = "insert into repeats values('%s', %d, %d, '%s', '%s')" % (chrom, start, stop, name, family) + sql.execute(stmt) + + db.commit() + + print "building index..." + sql.execute("PRAGMA SYNCHRONOUS = OFF") + sql.execute("create index chromIndex on repeats(chrom)") + sql.execute("create index mainIndex on repeats(chrom, start, stop)") + db.commit() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/buildsnpdb.py b/buildsnpdb.py new file mode 100755 index 0000000..2510443 --- /dev/null +++ b/buildsnpdb.py @@ -0,0 +1,95 @@ +""" +creates table snp {chrom varchar, + start int, + stop int, + name varchar, + observed varchar, + strand varchar, + ucscref varchar, + ncbiref varchar, + func varchar, + moltype varchar, + valid varchar, + class varchar +} + +sample line in dbsnp file +608 chr1 3093453 3093454 rs52602943 0 + G G C/G genomic single unknown 0 0 unknown exact 1 +""" + +try: + import psyco + psyco.full() +except: + pass +import sys +import sqlite3 as sqlite + +def main(argv=None): + if not argv: + argv = sys.argv + + print "version 2.0" + if len(argv) < 3: + print "usage: python %s snpfile snpdbname" % argv[0] + sys.exit(1) + + snpfilename = argv[1] + snpdb = argv[2] + + buildsnpdb(snpfilename, snpdb) + + +def buildsnpdb(snpfilename, snpdb): + db = sqlite.connect(snpdb) + sql = db.cursor() + sql.execute("create table snp (chrom varchar, start long, stop long, name varchar, observed varchar, strand varchar, ucscref varchar, ncbiref varchar, func varchar, moltype varchar, valid varchar, class varchar)") + sql.execute("PRAGMA temp_store = MEMORY") + sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000") + db.commit() + + insertSize = 100000 + insertCounter = 0 + valuesList = [] + print snpfilename + infile = open(snpfilename) + for entry in infile: + try: + fields = entry.strip().split("\t") + chrom = fields[1][3:] + start = int(fields[2]) + stop = int(fields[3]) + name = fields[4] + strand = fields[6] + refNcbi = fields[7] + refUcsc = fields[8] + observed = fields[9] + molType = fields[10] + classes = fields[11] + valid = fields[12] + func = fields[15] + + valuesList.append((chrom, start, stop, name, observed, strand, refUcsc, refNcbi, func, molType, valid, classes)) + insertCounter += 1 + except: + continue + + if insertCounter % insertSize == 0: + print insertCounter + db.executemany("insert into snp values (?,?,?,?,?,?,?,?,?,?,?,?)", valuesList) + valuesList = [] + + if len(valuesList) > 0: + db.executemany("insert into snp values (?,?,?,?,?,?,?,?,?,?,?,?)", valuesList) + + db.commit() + + print "building index" + sql.execute("PRAGMA SYNCHRONOUS = OFF") + sql.execute("create index chromIndex on snp(chrom)") + sql.execute("create index mainIndex on snp(chrom,start,stop)") + db.commit() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/cdfdist.py b/cdfdist.py new file mode 100755 index 0000000..7166244 --- /dev/null +++ b/cdfdist.py @@ -0,0 +1,37 @@ +import sys + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 4: + print "usage: python %s bins percent infile" % sys.argv[0] + sys.exit(1) + + bins = int(argv[0]) + percent = int(argv[1]) + infilename = argv[2] + + cdfDist(bins, percent, infilename) + + +def cdfDist(bins, percent, infilename): + infile = 
open(infilename) + binsList = [0] * bins + + for line in infile: + fields = line.strip().split() + index = 0 + for binCdf in fields[-1 * bins:]: + if int(binCdf) > percent: + binsList[index] += 1 + break + + index += 1 + + infile.close() + print binsList + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/checkrmask.py b/checkrmask.py new file mode 100755 index 0000000..9f58983 --- /dev/null +++ b/checkrmask.py @@ -0,0 +1,189 @@ +try: + import psyco + psyco.full() +except: + pass + +import sqlite3 as sqlite +import sys, string, optparse +import os.path +from commoncode import writeLog + +versionString = "%prog: version 3.5" +print versionString + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--startField", type="int", dest="startField") + parser.add_option("--log", dest="logfilename") + parser.set_defaults(cachePages=500000, startField=0, logfilename=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + dbfile = args[0] + filename = args[1] + outfile = args[2] + goodfile = args[3] + + checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename) + + +def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None): + + outfile = open(outFileName, "w") + goodfile = open(goodFileName, "w") + if startField < 0: + startField = 0 + + if cachePages < 250000: + cachePages = 250000 + + doLog = False + if logfilename is not None: + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + doLog = True + + infile = open(filename) + if os.path.isfile(dbfile): + db = sqlite.connect(dbfile) + sql = db.cursor() + sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages) + sql.execute("PRAGMA temp_store = MEMORY") + else: + print "No database - passing through" + if doLog: + writeLog(logfilename, versionString, "No database - passing through") + + for line in infile: + outfile.write("%s\tNR\tNR\t0.00\n" % line) + goodfile.write(line) + + outfile.close() + goodfile.close() + sys.exit(0) + + featureList = [] + featureDict = {} + + for line in infile: + if line[0] == "#": + continue + + fields = line.strip().split("\t") + chrom = fields[startField][3:] + start = int(fields[startField + 1]) + stop = int(fields[startField + 2]) + featureList.append((chrom,start, stop)) + featureDict[(chrom, start, stop)] = line.strip() + + infile.close() + + featureList.sort() + currentChrom = "" + currentMax = 0 + increment = 20000000 + for (chrom, start, stop) in featureList: + if chrom != currentChrom: + currentMax = 0 + + if start > currentMax: + currentChrom = chrom + currentMin = currentMax + currentMax += increment + print "caching %s from %d to %d" % (chrom, currentMin, currentMax) + try: + del con + except: + pass + + con = sqlite.connect(":memory:") + sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000)) + results = sql.fetchall() + results2 = [] + con.execute("create table repeats(name, family, start, stop)") + con.execute("PRAGMA CACHE_SIZE = %d" % cachePages) + con.execute("PRAGMA temp_store = MEMORY") + for (rstart, rstop, name, family) in results: + 
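+                # load this chromosome window's repeats into the in-memory cache table created above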
results2.append((name, family, int(rstart), int(rstop))) + + con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2) + con.execute("CREATE INDEX posIndex on repeats(start, stop)") + print chrom, len(results2) + sql2 = con.cursor() + + featureLength = abs(stop - start) + results = [] + finalresults = [] + sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start)) + results = sql2.fetchall() + for (rstart, rstop, name, family) in results: + overlapLength = float(abs(rstop - start)) + if overlapLength > featureLength: + overlapLength = featureLength + + ratio = overlapLength / featureLength + if (name, family, ratio) not in finalresults: + finalresults.append((name, family, ratio)) + + sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop)) + results = sql2.fetchall() + for (rstart, rstop, name, family) in results: + overlapLength = float(abs(rstart - stop)) + if overlapLength > featureLength: + overlapLength = featureLength + + ratio = overlapLength / featureLength + if (name, family, ratio) not in finalresults: + finalresults.append((name, family, ratio)) + + sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop)) + results = sql2.fetchall() + for (rstart, rstop, name, family) in results: + overlapLength = float(abs(rstop - rstart)) + if overlapLength > featureLength: + overlapLength = featureLength + + ratio = overlapLength / featureLength + if (name, family, ratio) not in finalresults: + finalresults.append((name, family, ratio)) + + sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop)) + results = sql2.fetchall() + for (rstart, rstop, name, family) in results: + overlapLength = float(abs(rstop - rstart)) + if overlapLength > featureLength: + overlapLength = featureLength + + ratio = overlapLength / featureLength + if (name, family, ratio) not in finalresults: + finalresults.append((name, family, ratio)) + + line = featureDict[(chrom, start, stop)] + total = 0. 
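+        # total accumulates the repeat-overlap fractions for this feature; features with
+        # less than 20% total repeat coverage are echoed to the "good" file below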
+ for (name, family, fraction) in finalresults: + outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction) + total += fraction + print outline + outfile.write(outline + "\n") + + if len(finalresults) == 0: + outline = "%s\tNR\tNR\t%0.00" % line + print outline + outfile.write(outline + "\n") + + if total < 0.2: + goodfile.write(line + "\n") + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/chiapet/.svn/entries b/chiapet/.svn/entries new file mode 100644 index 0000000..595b06e --- /dev/null +++ b/chiapet/.svn/entries @@ -0,0 +1,142 @@ +10 + +dir +23 +file:///Users/sau/svn/repos/erange/source/Erange/chiapet +file:///Users/sau/svn/repos + + + +2010-10-01T18:32:26.347691Z +22 +sau + + + + + + + + + + + + + + +d8abe7b5-3c2c-4fba-ae09-e6a8aa828af9 + +segregateLinkers.py +file + + + + +2010-09-15T19:01:49.000000Z +a847d39676e6a4fb9501811ab9a4c0b9 +2010-09-15T19:02:48.670738Z +21 +sau + + + + + + + + + + + + + + + + + + + + + +2380 + +__init__.py +file + + + + +2010-09-15T19:01:49.000000Z +d41d8cd98f00b204e9800998ecf8427e +2010-09-15T19:02:48.670738Z +21 +sau + + + + + + + + + + + + + + + + + + + + + +0 + +markLinkers.py +file + + + + + +10c527dc803a21ba14dfd8efc4f1e3d3 +2010-10-01T18:32:26.347691Z +22 +sau + +linkers.fa +file + + + + +2010-09-15T19:01:49.000000Z +2b64087c826083f04e0ff968312e019a +2010-09-15T19:02:48.670738Z +21 +sau + + + + + + + + + + + + + + + + + + + + + +63 + diff --git a/chiapet/.svn/text-base/__init__.py.svn-base b/chiapet/.svn/text-base/__init__.py.svn-base new file mode 100644 index 0000000..e69de29 diff --git a/chiapet/.svn/text-base/linkers.fa.svn-base b/chiapet/.svn/text-base/linkers.fa.svn-base new file mode 100644 index 0000000..290c98b --- /dev/null +++ b/chiapet/.svn/text-base/linkers.fa.svn-base @@ -0,0 +1,4 @@ +>linker_b.1 +GTTGGATAAGATATCGCGG +>linker_b.2 +GTTGGAATGTATATCGCGG \ No newline at end of file diff --git a/chiapet/.svn/text-base/markLinkers.py.svn-base b/chiapet/.svn/text-base/markLinkers.py.svn-base new file mode 100644 index 0000000..a2a97e6 --- /dev/null +++ b/chiapet/.svn/text-base/markLinkers.py.svn-base @@ -0,0 +1,68 @@ +import sys + +def main(argv=None): + if not argv: + argv = sys.argv + + linkerfile = argv[1] + infile = argv[2] + outfile = argv[3] + + markLinkers(linkerfile, infile, outfile) + + +def markLinkers(linkerFileName, inFileName, outFileName): + infile = open(inFileName) + outfile = open(outFileName, "w") + linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName) + + for line in infile: + if len(line) < 2: + continue + + if "@" in line: + readID = line.strip() + readID = readID.replace("@", "") + else: + found = False + for linkerID in linkerList: + position = line.find(linkerDict[linkerID]) + if position >= 19: + found = True + outfile.write(">L%s_%s\n" % (linkerID[-1:], readID)) + outfile.write("%s\n" % line[:20]) + + if not found: + outfile.write(">NA_%s\n" % readID) + outfile.write("%s\n" % line[:20]) + + +def getLinkerInformationFromFile(linkerFileName): + linkerDict = {} + linkerList = [] + try: + linkerfile = open(linkerFileName) + return getLinkerInformation(linkerfile) + except IOError: + return linkerDict, linkerList + + +def getLinkerInformation(linkerInformationList): + linkerDict = {} + linkerList = [] + + for entry in linkerInformationList: + if ">" in entry: + linkerID = entry.strip() + linkerID = linkerID[1:] + linkerList.append(linkerID) + else: + sequence = entry.strip() + linkerDict[linkerID] = sequence[:10] + + return linkerDict, linkerList + + + +if 
__name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/chiapet/.svn/text-base/segregateLinkers.py.svn-base b/chiapet/.svn/text-base/segregateLinkers.py.svn-base new file mode 100644 index 0000000..3d213da --- /dev/null +++ b/chiapet/.svn/text-base/segregateLinkers.py.svn-base @@ -0,0 +1,88 @@ +import sys + + +def main(argv=None): + if not argv: + argv = sys.argv + + infile1 = argv[1] + infile2 = argv[2] + outprefix = argv[3] + + segregateLinkers(infile1, infile2, outprefix) + + +def segregateLinkers(infile1name, infile2name, outprefix): + infile1 = open(infile1name) + infile2 = open(infile2name) + same1 = 0 + same2 = 0 + mixed = 0 + hasNA = 0 + + outsame1 = open("%s.same1.fa" % outprefix, "w") + outsame2 = open("%s.same2.fa" % outprefix, "w") + outNA = open("%s.NA.fa" % outprefix, "w") + outmixed = open("%s.mixed.fa" % outprefix, "w") + + lines1 = infile1.readlines() + + failed = False + for line1 in lines1: + line2 = infile2.readline() + if failed: + line2 = infile2.readline() + print line1.strip() + print line2.strip() + sys.exit(1) + continue + + if ">" in line1: + try: + (linker1, readid1) = line1.split("_") + (linker2, readid2) = line2.split("_") + shortid1 = readid1.split("/")[0] + shortid2 = readid2.split("/")[0] + if shortid1 != shortid2: + print shortid1, shortid2 + sys.exit(1) + + failed = False + except: + print line1.strip() + print line2.strip() + failed = True + + continue + + if "NA" in linker1 or "NA" in linker2: + hasNA += 1 + outNA.write("%s_%s%s" % (linker1, readid1, line1)) + outNA.write("%s_%s%s" % (linker2, readid2, line2)) + elif linker1 == linker2: + if "L1" in linker1: + same1 += 1 + outsame1.write("%s_%s%s" % (linker1, readid1, line1)) + outsame1.write("%s_%s%s" % (linker2, readid2, line2)) + else: + same2 += 1 + outsame2.write("%s_%s%s" % (linker1, readid1, line1)) + outsame2.write("%s_%s%s" % (linker2, readid2, line2)) + else: + mixed += 1 + outmixed.write("%s_%s%s" % (linker1, readid1, line1)) + outmixed.write("%s_%s%s" % (linker2, readid2, line2)) + + print same1 + print same2 + print mixed + print hasNA + + outmixed.close() + outNA.close() + outsame1.close() + outsame2.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/chiapet/.svn/tmp/markLinkers.py.tmp b/chiapet/.svn/tmp/markLinkers.py.tmp new file mode 100644 index 0000000..a2a97e6 --- /dev/null +++ b/chiapet/.svn/tmp/markLinkers.py.tmp @@ -0,0 +1,68 @@ +import sys + +def main(argv=None): + if not argv: + argv = sys.argv + + linkerfile = argv[1] + infile = argv[2] + outfile = argv[3] + + markLinkers(linkerfile, infile, outfile) + + +def markLinkers(linkerFileName, inFileName, outFileName): + infile = open(inFileName) + outfile = open(outFileName, "w") + linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName) + + for line in infile: + if len(line) < 2: + continue + + if "@" in line: + readID = line.strip() + readID = readID.replace("@", "") + else: + found = False + for linkerID in linkerList: + position = line.find(linkerDict[linkerID]) + if position >= 19: + found = True + outfile.write(">L%s_%s\n" % (linkerID[-1:], readID)) + outfile.write("%s\n" % line[:20]) + + if not found: + outfile.write(">NA_%s\n" % readID) + outfile.write("%s\n" % line[:20]) + + +def getLinkerInformationFromFile(linkerFileName): + linkerDict = {} + linkerList = [] + try: + linkerfile = open(linkerFileName) + return getLinkerInformation(linkerfile) + except IOError: + return linkerDict, linkerList + + +def 
getLinkerInformation(linkerInformationList): + linkerDict = {} + linkerList = [] + + for entry in linkerInformationList: + if ">" in entry: + linkerID = entry.strip() + linkerID = linkerID[1:] + linkerList.append(linkerID) + else: + sequence = entry.strip() + linkerDict[linkerID] = sequence[:10] + + return linkerDict, linkerList + + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/chiapet/__init__.py b/chiapet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chiapet/linkers.fa b/chiapet/linkers.fa new file mode 100644 index 0000000..290c98b --- /dev/null +++ b/chiapet/linkers.fa @@ -0,0 +1,4 @@ +>linker_b.1 +GTTGGATAAGATATCGCGG +>linker_b.2 +GTTGGAATGTATATCGCGG \ No newline at end of file diff --git a/chiapet/markLinkers.py b/chiapet/markLinkers.py new file mode 100644 index 0000000..1f7c675 --- /dev/null +++ b/chiapet/markLinkers.py @@ -0,0 +1,67 @@ +import sys + +def main(argv=None): + if not argv: + argv = sys.argv + + linkerfile = argv[1] + infile = argv[2] + outfile = argv[3] + + markLinkers(linkerfile, infile, outfile) + + +def markLinkers(linkerFileName, inFileName, outFileName): + infile = open(inFileName) + outfile = open(outFileName, "w") + linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName) + + for line in infile: + if len(line) < 2: + continue + + if "@" in line: + readID = line.strip() + readID = readID.replace("@", "") + else: + found = False + for linkerID in linkerList: + position = line.find(linkerDict[linkerID]) + if position >= 19: + found = True + outfile.write(">L%s_%s\n" % (linkerID[-1:], readID)) + outfile.write("%s\n" % line[:20]) + + if not found: + outfile.write(">NA_%s\n" % readID) + outfile.write("%s\n" % line[:20]) + + +def getLinkerInformationFromFile(linkerFileName): + linkerDict = {} + linkerList = [] + try: + linkerfile = open(linkerFileName) + return getLinkerInformation(linkerfile) + except IOError: + return linkerDict, linkerList + + +def getLinkerInformation(linkerInformationList): + linkerDict = {} + linkerList = [] + + for entry in linkerInformationList: + if ">" in entry: + linkerID = entry.strip() + linkerID = linkerID[1:] + linkerList.append(linkerID) + else: + sequence = entry.strip() + linkerDict[linkerID] = sequence[:10] + + return linkerDict, linkerList + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/chiapet/segregateLinkers.py b/chiapet/segregateLinkers.py new file mode 100644 index 0000000..3d213da --- /dev/null +++ b/chiapet/segregateLinkers.py @@ -0,0 +1,88 @@ +import sys + + +def main(argv=None): + if not argv: + argv = sys.argv + + infile1 = argv[1] + infile2 = argv[2] + outprefix = argv[3] + + segregateLinkers(infile1, infile2, outprefix) + + +def segregateLinkers(infile1name, infile2name, outprefix): + infile1 = open(infile1name) + infile2 = open(infile2name) + same1 = 0 + same2 = 0 + mixed = 0 + hasNA = 0 + + outsame1 = open("%s.same1.fa" % outprefix, "w") + outsame2 = open("%s.same2.fa" % outprefix, "w") + outNA = open("%s.NA.fa" % outprefix, "w") + outmixed = open("%s.mixed.fa" % outprefix, "w") + + lines1 = infile1.readlines() + + failed = False + for line1 in lines1: + line2 = infile2.readline() + if failed: + line2 = infile2.readline() + print line1.strip() + print line2.strip() + sys.exit(1) + continue + + if ">" in line1: + try: + (linker1, readid1) = line1.split("_") + (linker2, readid2) = line2.split("_") + shortid1 = readid1.split("/")[0] + shortid2 = readid2.split("/")[0] + if shortid1 != shortid2: + print 
shortid1, shortid2 + sys.exit(1) + + failed = False + except: + print line1.strip() + print line2.strip() + failed = True + + continue + + if "NA" in linker1 or "NA" in linker2: + hasNA += 1 + outNA.write("%s_%s%s" % (linker1, readid1, line1)) + outNA.write("%s_%s%s" % (linker2, readid2, line2)) + elif linker1 == linker2: + if "L1" in linker1: + same1 += 1 + outsame1.write("%s_%s%s" % (linker1, readid1, line1)) + outsame1.write("%s_%s%s" % (linker2, readid2, line2)) + else: + same2 += 1 + outsame2.write("%s_%s%s" % (linker1, readid1, line1)) + outsame2.write("%s_%s%s" % (linker2, readid2, line2)) + else: + mixed += 1 + outmixed.write("%s_%s%s" % (linker1, readid1, line1)) + outmixed.write("%s_%s%s" % (linker2, readid2, line2)) + + print same1 + print same2 + print mixed + print hasNA + + outmixed.close() + outNA.close() + outsame1.close() + outsame2.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/chkSNPrmask.py b/chkSNPrmask.py new file mode 100755 index 0000000..498ef49 --- /dev/null +++ b/chkSNPrmask.py @@ -0,0 +1,131 @@ +try: + import psyco + psyco.full() +except: + pass + +import sqlite3 as sqlite +import sys +import tempfile, shutil, os, optparse +from os import environ + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" +tempfile.tempdir = cisTemp + +print "version 3.3: %prog" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s dbfile snpsfile nr_snps_outfile [--cache numPages] [--repeats]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--repeats", action="store_true", dest="repeats") + parser.add_option("--cache", type="int", dest="cachePages") + parser.set_defaults(repeats=False, cachePages=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + dbfile = args[0] + filename = args[1] + outfile = args[2] + + chkSNPrmask(dbfile, filename, outfile, options.repeats, options.cachePages) + + +def chkSNPrmask(dbfile, filename, outfile, repeats=False, cachePages=None): + print dbfile + + if cachePages is not None: + if cachePages < 250000: + cachePages = 250000 + + print "caching locally..." + cachefile = tempfile.mktemp() + ".db" + shutil.copyfile(dbfile, cachefile) + db = sqlite.connect(cachefile) + doCache = True + print "cached..." 
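+        # queries below run against this temporary local copy, which is removed
+        # once the annotation loop finishes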
+ else: + cachePages = 500000 + doCache = False + db = sqlite.connect(dbfile) + + sql = db.cursor() + sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages) + sql.execute("PRAGMA temp_store = MEMORY") + sql.execute("ANALYZE") + + infile = open(filename) + featureList = [] + featureDict = {} + + for line in infile: + if doNotProcessLine(line): + continue + + fields = line.strip().split("\t") + chrom = fields[2][3:] + pos = int(fields[3]) + featureList.append((chrom,pos)) + featureDict[(chrom, pos)] = line.strip() + + featureList.sort() + + index = 0 + currentChrom=None + for (chrom, pos) in featureList: + index += 1 + if chrom != currentChrom: + print "\n%s" % chrom + currentChrom = chrom + + results = [] + try: + sql.execute("select family from repeats where chrom = '%s' and %d between start and stop" % (chrom, pos)) + results = sql.fetchall() + except: + pass + + if repeats: # if user wants to keep track of the SNPs in repeats + featureDict[(chrom,pos)] += "\tN\A" + for x in results: + featureDict[(chrom,pos)] += "\t" + str(x) + else: + for x in results: + try: + del featureDict[(chrom,pos)] + except KeyError: + pass + + if index % 100 == 0: + print ".", + sys.stdout.flush() + + if doCache: + print "removing cache" + del db + os.remove(cachefile) + + outFile = open(outfile, "w") + for key, value in featureDict.iteritems(): + outStr = str(value) + "\n" + outFile.write(outStr) + + outFile.close() + + +def doNotProcessLine(line): + return line[0] == "#" + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/chksnp.py b/chksnp.py new file mode 100755 index 0000000..daf6b0a --- /dev/null +++ b/chksnp.py @@ -0,0 +1,169 @@ +try: + import psyco + psyco.full() +except: + pass + +import sys +import optparse +import tempfile +import shutil +import os +import string +import sqlite3 as sqlite + +print "version 3.6: %s" % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog dbfile snpsfile dbsnp_outfile [--cache numPages] [--snpDB dbfile]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--snpDB", action="append", dest="snpDBList", + help="additional snp db files to check will be searched in order given") + parser.set_defaults(cachePages=None, snpDBList=[]) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + dbfile = args[0] + infile = args[1] + outfile = args[2] + + chkSNPFile(dbfile, infile, outfile, options.cachePages, options.snpDBList) + + +def chkSNPFile(dbfile, inputFileName, outputFileName, cachePages=None, snpDBList=[]): + + snpInputFile = open(inputFileName) + snpLocationList, snpDict = getSNPLocationInfo(snpInputFile) + + dbList = [dbfile] + for dbFileName in snpDBList: + dbList.append(dbFileName) + + annotatedSnpDict = annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages) + + outputFile = open(outputFileName, "w") + outputLine = "" + outputFile.write(outputLine) + for key,value in annotatedSnpDict.iteritems(): + outputLine = "%s\n" % str(value) + outputFile.write(outputLine) + + outputFile.close() + + +def chkSNP(dbList, snpPropertiesList, cachePages=None): + + snpLocationList, snpDict = getSNPLocationInfo(snpPropertiesList) + return annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages) + + +def getSNPLocationInfo(snpPropertiesList): + snpLocationList = [] + snpDict = {} + + for line in snpPropertiesList: + if doNotProcessLine(line): + continue + + 
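+        # SNP calls are expected as tab-delimited lines with the chromosome (chrN form)
+        # in field 2 and the position in field 3 (0-based field indices)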
fields = line.strip().split("\t") + chromosome = fields[2][3:] + position = int(fields[3]) + snpLocation = (chromosome, position) + snpLocationList.append(snpLocation) + snpDict[snpLocation] = line.strip() + + snpLocationList.sort() + + return snpLocationList, snpDict + + +def doNotProcessLine(line): + return line[0] == "#" + + +def annotateSNPFromDB(snpLocationList, snpDict, dbFileName, cachePages=None): + return annotateSNPFromDBList(snpLocationList, snpDict, [dbFileName], cachePages) + + +def annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=None): + if os.environ.get("CISTEMATIC_TEMP"): + cisTemp = os.environ.get("CISTEMATIC_TEMP") + else: + cisTemp = "/tmp" + + tempfile.tempdir = cisTemp + + for dbFileName in dbList: + if cachePages is not None: + print "caching locally..." + cachefile = "%s.db" % tempfile.mktemp() + shutil.copyfile(dbFileName, cachefile) + db = sqlite.connect(cachefile) + doCache = True + print "cached..." + else: + db = sqlite.connect(dbFileName) + doCache = False + + cacheSize = max(cachePages, 500000) + sql = db.cursor() + sql.execute("PRAGMA CACHE_SIZE = %d" % cacheSize) + sql.execute("PRAGMA temp_store = MEMORY") + + index = 0 + foundEntries = [] + for chromosomePosition in snpLocationList: + (chromosome, position) = chromosomePosition + found = False + results = [] + index += 1 + startPosition = position - 1 + sql.execute("select func, name from snp where chrom = '%s' and start = %d and stop = %d" % (chromosome, startPosition, position)) + results = sql.fetchall() + try: + (func, name) = results[0] + found = True + except IndexError: + sql.execute("select func, name from snp where chrom = '%s' and start <= %d and stop >= %d" % (chromosome, startPosition, position)) + results = sql.fetchall() + try: + (func, name) = results[0] + found = True + except IndexError: + pass + + if found: + snpEntry = snpDict[chromosomePosition] + snpDict[chromosomePosition] = string.join([snpEntry, str(name), str(func)], "\t") + foundEntries.append(chromosomePosition) + + if index % 100 == 0: + print ".", + sys.stdout.flush() + + for chromosomePosition in foundEntries: + del snpLocationList[snpLocationList.index(chromosomePosition)] + + if doCache: + print "\nremoving cache" + del db + os.remove(cachefile) + + for chromosomePosition in snpLocationList: + snpEntry = snpDict[chromosomePosition] + snpDict[chromosomePosition] = string.join([snpEntry, "N\A", "N\A"], "\t") + + return snpDict + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/colsum.py b/colsum.py new file mode 100755 index 0000000..703bd5c --- /dev/null +++ b/colsum.py @@ -0,0 +1,39 @@ +import sys + +def main(argv=None): + if not argv: + argv = sys.argv + + print "version 1.2" + if len(argv) < 3: + print "usage: python %s field filename" % argv[0] + print "\n\tfields are counted starting at zero.\n" + sys.exit(1) + + fieldID = int(argv[1]) + filename = argv[2] + + count = colsum(fieldID, filename) + print count + + +def colsum(fieldID, filename): + infile = open(filename) + count = 0 + + for line in infile: + fields = line.strip().split() + try: + if "." 
in fields[fieldID]: + count += float(fields[fieldID]) + else: + count += int(fields[fieldID]) + except ValueError: + pass + + infile.close() + return count + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/combineRPKMs.py b/combineRPKMs.py new file mode 100755 index 0000000..8fd8f9f --- /dev/null +++ b/combineRPKMs.py @@ -0,0 +1,87 @@ +# +# combineRPKMS.py +# ENRAGE +# + +print 'version 1.0' +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]" + parser = optparse.OptionParser(usage=usage) + parser.add_option("--withmultifraction", action="store_true", dest="doFraction") + parser.set_defaults(doFraction=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + firstfile = args[0] + expandedfile = args[1] + finalfile = args[2] + outfile = args[3] + + combineRPKMs(firstfile, expandedfile, finalfile, outfile, options.doFraction) + + +def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False): + firstfile = open(firstfileName) + expandedfile = open(expandedfileName) + finalfile = open(finalfileName) + outfile = open(outfileName, "w") + + firstDict = {} + gidDict = {} + expandedDict = {} + + for line in firstfile: + fields = line.strip().split() + firstDict[fields[1]] = fields[-1] + + firstfile.close() + + for line in expandedfile: + fields = line.strip().split() + expandedDict[fields[1]] = fields[-1] + gidDict[fields[1]] = fields[0] + + expandedfile.close() + + if doFraction: + header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n" + else: + header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\n" + + outfile.write(header) + + for line in finalfile: + fields = line.strip().split() + gene = fields[0] + rnakb = fields[1] + finalRPKM = fields[2] + firstRPKM = firstDict.get(gene, "") + outline = "%s\t%s\t%s\t%s\t%s\t%s" % (gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM) + + if doFraction: + fraction = fields[3] + outline += "\t%s" % fraction + + outfile.write(outline + '\n') + + finalfile.close() + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/combinerds.py b/combinerds.py new file mode 100755 index 0000000..7eac48a --- /dev/null +++ b/combinerds.py @@ -0,0 +1,121 @@ +# +# combinerds.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys +from commoncode import readDataset + +print '%s: version 1.1' % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 2: + print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] 
[-table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0] + #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n' + sys.exit(1) + + doCache = False + cachePages = -1 + if '--cache' in argv: + doCache = True + try: + cachePages = int(argv[sys.argv.index('-cache') + 1]) + except: + pass + + datafile = argv[1] + infileList = [] + for index in range(2, len(argv)): + if argv[index][0] == '-': + break + infileList.append(sys.argv[index]) + + print "destination RDS: %s" % datafile + + if '--initrna' in argv: + rds = readDataset(datafile, initialize=True, datasetType='RNA') + elif '--init' in argv: + rds = readDataset(datafile, initialize=True) + + withFlag = '' + if '--flag' in argv: + withFlag = argv[sys.argv.index('-flag') + 1] + print "restrict to flag = %s" % withFlag + + rds = readDataset(datafile, verbose=True, cache=doCache) + + if cachePages > rds.getDefaultCacheSize(): + rds.setDBcache(cachePages) + cacheVal = cachePages + else: + cacheVal = rds.getDefaultCacheSize() + + doIndex = False + if '--index' in argv: + doIndex = True + + tableList = [] + if '--table' in argv: + tableList.append(argv[argv.index('-table') + 1]) + else: + tableList = rds.getTables() + + combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache) + + +def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False): + metaDict = rds.getMetadata() + if "numberImports" not in metaDict: + origIndex = 0 + rds.insertMetadata([("numberImports", str(0))]) + else: + origIndex = int(metaDict["numberImports"]) + + index = origIndex + for inputfile in infileList: + asname = "input" + str(index) + rds.attachDB(inputfile,asname) + for table in tableList: + print "importing table %s from file %s" % (table, inputfile) + ascols = "*" + if table == "uniqs": + ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname + elif table == "multi": + ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname + elif table == "splices": + ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname + elif table == "metadata": + ascols = "name, value || ' (import_%d)'" % index + rds.importFromDB(asname, table, ascols) + + if table != "metadata": + rds.importFromDB(asname, table, ascols, withFlag) + + rds.detachDB(asname) + rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))]) + index += 1 + + rds.updateMetadata("numberImports", index, origIndex) + if doIndex: + print "building index...." 
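+        # rebuild the table indices, passing along the effective cache size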
+ if cacheVal > 0: + rds.buildIndex(cacheVal) + else: + rds.buildIndex() + + if doCache: + rds.saveCacheDB(datafile) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/commoncode.py b/commoncode.py new file mode 100755 index 0000000..9d86473 --- /dev/null +++ b/commoncode.py @@ -0,0 +1,2068 @@ +# +# commoncode.py +# ENRAGE +# + +import tempfile +import shutil +import os +from os import environ +import string +import sqlite3 as sqlite +from time import strftime +from array import array +from collections import defaultdict + +commoncodeVersion = 5.5 +currentRDSversion = 1.1 + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + +tempfile.tempdir = cisTemp + + +def getReverseComplement(base): + revComp = {"A": "T", + "T": "A", + "G": "C", + "C": "G", + "N": "N" + } + + return revComp[base] + + +def countDuplicatesInList(listToCheck): + tally = defaultdict(int) + for item in listToCheck: + tally[item] += 1 + + return tally.items() + + +def writeLog(logFile, messenger, message): + """ create a log file to write a message from a messenger or append to an existing file. + """ + try: + logfile = open(logFile) + except IOError: + logfile = open(logFile, "w") + else: + logfile = open(logFile, "a") + + logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message)) + logfile.close() + + +def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False, + fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False, + doMerge=True, keepPeak=False, returnTop=0): + + """ returns a list of merged overlapping regions; + can optionally filter regions that have a scoreField fewer than minHits. + Can also optionally return the label of each region, as well as the + peak, if supplied (peakPos and peakHeight should be the last 2 fields). + Can return the top regions based on score if higher than minHits. + """ + infile = open(regionfilename) + lines = infile.readlines() + regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel, + fullChrom, chromField, scoreField, pad, compact, + doMerge, keepPeak, returnTop) + + infile.close() + + return regions + + +def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False, + fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False, + doMerge=True, keepPeak=False, returnTop=0): + """ returns a list of merged overlapping regions; + can optionally filter regions that have a scoreField fewer than minHits. + Can also optionally return the label of each region, as well as the + peak, if supplied (peakPos and peakHeight should be the last 2 fields). + Can return the top regions based on score if higher than minHits. + """ + regions = {} + hasPvalue = 0 + hasShift = 0 + if 0 < returnTop < len(regionList): + scores = [] + for regionEntry in regionList: + if regionEntry[0] == "#": + if "pvalue" in regionEntry: + hasPvalue = 1 + + if "readShift" in regionEntry: + hasShift = 1 + + continue + + fields = regionEntry.strip().split("\t") + hits = float(fields[scoreField].strip()) + scores.append(hits) + + scores.sort() + returnTop = -1 * returnTop + minScore = scores[returnTop] + if minScore > minHits: + minHits = minScore + + mergeCount = 0 + chromField = int(chromField) + count = 0 + #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it + # exits at the first merge. 
This is not a problem when the input is sorted by start position, but in + # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there + # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will + # be merged with A but that will exit the loop and the merge with C will be missed. + for regionEntry in regionList: + if regionEntry[0] == "#": + if "pvalue" in regionEntry: + hasPvalue = 1 + + if "readShift" in regionEntry: + hasShift = 1 + + continue + + fields = regionEntry.strip().split("\t") + if minHits >= 0: + try: + hits = float(fields[scoreField].strip()) + except (IndexError, ValueError): + continue + + if hits < minHits: + continue + + if compact: + (chrom, pos) = fields[chromField].split(":") + (front, back) = pos.split("-") + start = int(front) + stop = int(back) + elif chromField > 1: + label = string.join(fields[:chromField],"\t") + chrom = fields[chromField] + start = int(fields[chromField + 1]) - pad + stop = int(fields[chromField + 2]) + pad + else: + label = fields[0] + chrom = fields[1] + start = int(fields[2]) - pad + stop = int(fields[3]) + pad + + if not fullChrom: + chrom = chrom[3:] + + length = abs(stop - start) + if keepPeak: + peakPos = int(fields[-2 - hasPvalue - hasShift]) + peakHeight = float(fields[-1 - hasPvalue - hasShift]) + + if chrom not in regions: + regions[chrom] = [] + + merged = False + + if doMerge and len(regions[chrom]) > 0: + for index in range(len(regions[chrom])): + if keepLabel and keepPeak: + (rlabel, rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index] + elif keepLabel: + (rlabel, rstart, rstop, rlen) = regions[chrom][index] + elif keepPeak: + (rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index] + else: + (rstart, rstop, rlen) = regions[chrom][index] + + if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist): + if start < rstart: + rstart = start + + if rstop < stop: + rstop = stop + + rlen = abs(rstop - rstart) + if keepPeak: + if peakHeight > rpeakHeight: + rpeakHeight = peakHeight + rpeakPos = peakPos + + if keepLabel and keepPeak: + regions[chrom][index] = (label, rstart, rstop, rlen, rpeakPos, rpeakHeight) + elif keepLabel: + regions[chrom][index] = (label, rstart, rstop, rlen) + elif keepPeak: + regions[chrom][index] = (rstart, rstop, rlen, rpeakPos, rpeakHeight) + else: + regions[chrom][index] = (rstart, rstop, rlen) + + mergeCount += 1 + merged = True + break + + if not merged: + if keepLabel and keepPeak: + regions[chrom].append((label, start, stop, length, peakPos, peakHeight)) + elif keepLabel: + regions[chrom].append((label, start, stop, length)) + elif keepPeak: + regions[chrom].append((start, stop, length, peakPos, peakHeight)) + else: + regions[chrom].append((start, stop, length)) + + count += 1 + + if verbose and (count % 100000 == 0): + print count + + regionCount = 0 + for chrom in regions: + regionCount += len(regions[chrom]) + if keepLabel: + regions[chrom].sort(cmp=lambda x,y:cmp(x[1], y[1])) + else: + regions[chrom].sort() + + if verbose: + print "merged %d times" % mergeCount + print "returning %d regions" % regionCount + + return regions + + +def regionsOverlap(start, stop, rstart, rstop): + if start > stop: + (start, stop) = (stop, start) + + if rstart > rstop: + (rstart, rstop) = (rstop, rstart) + + return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop) + + +def 
regionsAreWithinDistance(start, stop, rstart, rstop, maxDist): + if start > stop: + (start, stop) = (stop, start) + + if rstart > rstop: + (rstart, rstop) = (rstop, rstart) + + return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist) + + +def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, + shift=0, returnShift=False, maxshift=75): + """ find the peak in a list of reads (hitlist) in a region + of a given length and absolute start point. returns a + list of peaks, the number of hits, a triangular-smoothed + version of hitlist, and the number of reads that are + forward (plus) sense. + If doWeight is True, weight the reads accordingly. + If leftPlus is True, return the number of plus reads left of + the peak, taken to be the first TopPos position. + """ + + seqArray = array("f", [0.] * length) + smoothArray = array("f", [0.] * length) + numHits = 0. + numPlus = 0. + regionArray = [] + if shift == "auto": + shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift) + + # once we have the best shift, compute seqArray + for read in hitList: + currentpos = read[0] - start + if read[1] == "+": + currentpos += shift + else: + currentpos -= shift + + if (currentpos < 1 - readlen) or (currentpos >= length): + continue + + hitIndex = 0 + if doWeight: + weight = read[2] + else: + weight = 1.0 + + numHits += weight + if leftPlus: + regionArray.append(read) + + while currentpos < 0: + hitIndex += 1 + currentpos += 1 + + while hitIndex < readlen and currentpos < length: + seqArray[currentpos] += weight + hitIndex += 1 + currentpos += 1 + + if read[1] == "+": + numPlus += weight + + # implementing a triangular smooth + for pos in range(2,length -2): + smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0 + + topNucleotide = 0 + topPos = [] + for currentpos in xrange(length): + if topNucleotide < smoothArray[currentpos]: + topNucleotide = smoothArray[currentpos] + topPos = [currentpos] + elif topNucleotide == smoothArray[currentpos]: + topPos.append(currentpos) + + if leftPlus: + numLeftPlus = 0 + maxPos = topPos[0] + for read in regionArray: + if doWeight: + weight = read[2] + else: + weight = 1.0 + + currentPos = read[0] - start + if currentPos <= maxPos and read[1] == "+": + numLeftPlus += weight + + if returnShift: + return (topPos, numHits, smoothArray, numPlus, numLeftPlus, shift) + else: + return (topPos, numHits, smoothArray, numPlus, numLeftPlus) + else: + if returnShift: + return (topPos, numHits, smoothArray, numPlus, shift) + else: + return (topPos, numHits, smoothArray, numPlus) + + +def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): + bestShift = 0 + lowestScore = 20000000000 + for testShift in xrange(maxShift + 1): + shiftArray = array("f", [0.] 
* length) + for read in hitList: + currentpos = read[0] - start + if read[1] == "+": + currentpos += testShift + else: + currentpos -= testShift + + if (currentpos < 1) or (currentpos >= length): + continue + + if doWeight: + weight = read[2] + else: + weight = 1.0 + + if read[1] == "+": + shiftArray[currentpos] += weight + else: + shiftArray[currentpos] -= weight + + currentScore = 0 + for score in shiftArray: + currentScore += abs(score) + + print currentScore + if currentScore < lowestScore: + bestShift = testShift + lowestScore = currentScore + + return bestShift + + +def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False, + restrictList=[], regionComplement=False, maxStop=250000000): + """ return a dictionary of cistematic gene features. Requires + cistematic, obviously. Can filter-out pseudogenes. Will use + additional regions dict to supplement gene models, if available. + Can restrict output to a list of GIDs. + If regionComplement is set to true, returns the regions *outside* of the + calculated boundaries, which is useful for retrieving intronic and + intergenic regions. maxStop is simply used to define the uppermost + boundary of the complement region. + """ + featuresDict = genomeObject.getallGeneFeatures() + restrictGID = False + if len(restrictList) > 0: + restrictGID = True + + if len(additionalRegionsDict) > 0: + sortList = [] + for chrom in additionalRegionsDict: + for (label, start, stop, length) in additionalRegionsDict[chrom]: + if label not in sortList: + sortList.append(label) + + if label not in featuresDict: + featuresDict[label] = [] + sense = "+" + else: + sense = featuresDict[label][0][-1] + + featuresDict[label].append(("custom", chrom, start, stop, sense)) + + for gid in sortList: + featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) + + featuresByChromDict = {} + for gid in featuresDict: + if restrictGID and gid not in restrictList: + continue + + featureList = featuresDict[gid] + newFeatureList = [] + isPseudo = False + for (ftype, chrom, start, stop, sense) in featureList: + if ftype == "PSEUDO": + isPseudo = True + + if (start, stop, ftype) not in newFeatureList: + notContained = True + containedList = [] + for (fstart, fstop, ftype2) in newFeatureList: + if start >= fstart and stop <= fstop: + notContained = False + + if start < fstart and stop > fstop: + containedList.append((fstart, fstop)) + + if len(containedList) > 0: + newFList = [] + notContained = True + for (fstart, fstop, ftype2) in newFeatureList: + if (fstart, fstop) not in containedList: + newFList.append((fstart, fstop, ftype2)) + if start >= fstart and stop <= fstop: + notContained = False + + newFeatureList = newFList + if notContained: + newFeatureList.append((start, stop, ftype)) + + if ignorePseudo and isPseudo: + continue + + if chrom not in featuresByChromDict: + featuresByChromDict[chrom] = [] + + for (start, stop, ftype) in newFeatureList: + featuresByChromDict[chrom].append((start, stop, gid, sense, ftype)) + + for chrom in featuresByChromDict: + featuresByChromDict[chrom].sort() + + if regionComplement: + complementByChromDict = {} + complementIndex = 0 + for chrom in featuresByChromDict: + complementByChromDict[chrom] = [] + listLength = len(featuresByChromDict[chrom]) + if listLength > 0: + currentStart = 0 + for index in range(listLength): + currentStop = featuresByChromDict[chrom][index][0] + complementIndex += 1 + if currentStart < currentStop: + complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, 
"F", "nonExon")) + + currentStart = featuresByChromDict[chrom][index][1] + + currentStop = maxStop + complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon")) + + return (featuresByChromDict, complementByChromDict) + else: + return featuresByChromDict + + +def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, + additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False, + lengthCDS=0, keepSense=False, adjustToNeighbor=True): + """ return a dictionary of gene loci. Can be used to retrieve additional + sequence upstream or downstream of gene, up to the next gene. Requires + cistematic, obviously. + Can filter-out pseudogenes and use additional regions outside of existing + gene models. Use upstreamSpanTSS to overlap half of the upstream region + over the TSS. + If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If + lengthCDS < 0bp, return only the last X bp from CDS. + """ + locusByChromDict = {} + if upstream == 0 and downstream == 0 and not useCDS: + print "getLocusByChromDict: asked for no sequence - returning empty dict" + return locusByChromDict + elif upstream > 0 and downstream > 0 and not useCDS: + print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" + return locusByChromDict + elif lengthCDS != 0 and not useCDS: + print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" + return locusByChromDict + elif upstreamSpanTSS and lengthCDS != 0: + print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" + return locusByChromDict + elif lengthCDS > 0 and downstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" + return locusByChromDict + elif lengthCDS < 0 and upstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + return locusByChromDict + + genome = genomeObject.genome + featuresDict = genomeObject.getallGeneFeatures() + if len(additionalRegionsDict) > 0: + sortList = [] + for chrom in additionalRegionsDict: + for (label, start, stop, length) in additionalRegionsDict[chrom]: + if label not in sortList: + sortList.append(label) + + if label not in featuresDict: + featuresDict[label] = [] + sense = "+" + else: + sense = featuresDict[label][0][-1] + + featuresDict[label].append(("custom", chrom, start, stop, sense)) + + for gid in sortList: + featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) + + for gid in featuresDict: + featureList = featuresDict[gid] + newFeatureList = [] + for (ftype, chrom, start, stop, sense) in featureList: + newFeatureList.append((start, stop)) + + if ignorePseudo and ftype == "PSEUDO": + continue + + newFeatureList.sort() + + sense = featureList[0][-1] + gstart = newFeatureList[0][0] + gstop = newFeatureList[-1][1] + glen = abs(gstart - gstop) + if sense == "F": + if not useCDS and upstream > 0: + if upstreamSpanTSS: + if gstop > (gstart + upstream / 2): + gstop = gstart + upstream / 2 + else: + gstop = gstart + elif not useCDS and downstream > 0: + gstart = gstop + + if upstream > 0: + if upstreamSpanTSS: + distance = upstream / 2 + else: + distance = upstream + + if adjustToNeighbor: + nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2) + if nextGene < distance * 2: + distance = nextGene / 2 + + if distance < 1: + distance = 1 + + gstart -= distance + + if downstream > 0: + distance = 
downstream + if adjustToNeighbor: + nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2) + if nextGene < downstream * 2: + distance = nextGene / 2 + + if distance < 1: + distance = 1 + + gstop += distance + + if lengthCDS > 0: + if lengthCDS < glen: + gstop = newFeatureList[0][0] + lengthCDS + + if lengthCDS < 0: + if abs(lengthCDS) < glen: + gstart = newFeatureList[-1][1] + lengthCDS + else: + if not useCDS and upstream > 0: + if upstreamSpanTSS: + if gstart < (gstop - upstream / 2): + gstart = gstop - upstream / 2 + else: + gstart = gstop + elif not useCDS and downstream > 0: + gstop = gstart + + if upstream > 0: + if upstreamSpanTSS: + distance = upstream /2 + else: + distance = upstream + + if adjustToNeighbor: + nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2) + if nextGene < distance * 2: + distance = nextGene / 2 + + if distance < 1: + distance = 1 + + gstop += distance + + if downstream > 0: + distance = downstream + if adjustToNeighbor: + nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2) + if nextGene < downstream * 2: + distance = nextGene / 2 + + if distance < 1: + distance = 1 + + gstart -= distance + + if lengthCDS > 0: + if lengthCDS < glen: + gstart = newFeatureList[-1][-1] - lengthCDS + + if lengthCDS < 0: + if abs(lengthCDS) < glen: + gstop = newFeatureList[0][0] - lengthCDS + + glen = abs(gstop - gstart) + if chrom not in locusByChromDict: + locusByChromDict[chrom] = [] + + if keepSense: + locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense)) + else: + locusByChromDict[chrom].append((gstart, gstop, gid, glen)) + + for chrom in locusByChromDict: + locusByChromDict[chrom].sort() + + return locusByChromDict + + +def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], + normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1, + binLength=-1): + """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions, + a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions + or a different weight / tag. + """ + index = 0 + regionsBins = {} + regionsLen = {} + + if defaultRegionFormat: + regionIDField = 0 + startField = 1 + stopField = 2 + lengthField = 3 + else: + startField = 0 + stopField = 1 + regionIDField = 2 + lengthField = 3 + + senseField = 4 + + print "entering computeRegionBins" + if len(regionList) > 0: + for readID in regionList: + regionsBins[readID] = [0.] * bins + else: + for chrom in regionsByChromDict: + for regionTuple in regionsByChromDict[chrom]: + regionID = regionTuple[regionIDField] + regionsBins[regionID] = [0.] 
* bins + + for chrom in hitDict: + if chrom not in regionsByChromDict: + continue + + for regionTuple in regionsByChromDict[chrom]: + regionID = regionTuple[regionIDField] + regionsLen[regionID] = regionTuple[lengthField] + + print "%s\n" % chrom + startRegion = 0 + for (tagStart, sense, weight) in hitDict[chrom]: + index += 1 + if index % 100000 == 0: + print "read %d " % index, + + stopPoint = tagStart + readlen + if startRegion < 0: + startRegion = 0 + + for regionTuple in regionsByChromDict[chrom][startRegion:]: + start = regionTuple[startField] + stop = regionTuple[stopField] + regionID = regionTuple[regionIDField] + rlen = regionTuple[lengthField] + try: + rsense = regionTuple[senseField] + except: + rsense = "F" + + if tagStart > stop: + startRegion += 1 + continue + + if start > stopPoint: + startRegion -= 10 + break + + if start <= tagStart <= stop: + if binLength < 1: + regionBinLength = rlen / bins + else: + regionBinLength = binLength + + startdist = tagStart - start + if rsense == "F": + # we are relying on python's integer division quirk + binID = startdist / regionBinLength + if (fixedFirstBin > 0) and (startdist < fixedFirstBin): + binID = 0 + elif fixedFirstBin > 0: + binID = 1 + + if binID >= bins: + binID = bins - 1 + + try: + regionsBins[regionID][binID] += normalizedTag * weight + except KeyError: + print "%s %s" % (regionID, str(binID)) + else: + rdist = rlen - startdist + binID = rdist / regionBinLength + if (fixedFirstBin > 0) and (rdist < fixedFirstBin): + binID = 0 + elif fixedFirstBin > 0: + binID = 1 + + if binID >= bins: + binID = bins - 1 + + try: + regionsBins[regionID][binID] += normalizedTag * weight + except KeyError: + print "%s %s" % (regionID, str(binID)) + + stopPoint = stop + + return (regionsBins, regionsLen) + + +# TODO: The readDataset class is going to be replaced by Erange.ReadDataset but this will +# require going through all the code to make the changes needed. Major project for another +# day, but it really needs to be done +class readDataset: + """ Class for storing reads from experiments. Assumes that custom scripts + will translate incoming data into a format that can be inserted into the + class using the insert* methods. Default class subtype ('DNA') includes + tables for unique and multireads, whereas 'RNA' subtype also includes a + splices table. + """ + + def __init__(self, datafile, initialize=False, datasetType='', verbose=False, + cache=False, reportCount=True): + """ creates an rds datafile if initialize is set to true, otherwise + will append to existing tables. datasetType can be either 'DNA' or 'RNA'. + """ + self.dbcon = "" + self.memcon = "" + self.dataType = "" + self.rdsVersion = "1.1" + self.memBacked = False + self.memChrom = "" + self.memCursor = "" + self.cachedDBFile = "" + + if cache: + if verbose: + print "caching ...." + + self.cacheDB(datafile) + dbfile = self.cachedDBFile + else: + dbfile = datafile + + self.dbcon = sqlite.connect(dbfile) + self.dbcon.row_factory = sqlite.Row + self.dbcon.execute("PRAGMA temp_store = MEMORY") + if initialize: + if datasetType == "": + self.dataType = "DNA" + else: + self.dataType = datasetType + + self.initializeTables(self.dbcon) + else: + metadata = self.getMetadata("dataType") + self.dataType = metadata["dataType"] + + try: + metadata = self.getMetadata("rdsVersion") + self.rdsVersion = metadata["rdsVersion"] + except: + try: + self.insertMetadata([("rdsVersion", currentRDSversion)]) + except: + print "could not add rdsVersion - read-only ?" 
+ self.rdsVersion = "pre-1.0" + + if verbose: + if initialize: + print "INITIALIZED dataset %s" % datafile + else: + print "dataset %s" % datafile + + metadata = self.getMetadata() + print "metadata:" + pnameList = metadata.keys() + pnameList.sort() + for pname in pnameList: + print "\t" + pname + "\t" + metadata[pname] + + if reportCount: + ucount = self.getUniqsCount() + mcount = self.getMultiCount() + if self.dataType == "DNA" and not initialize: + try: + print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount)) + except: + print "\n%s unique reads and %s multireads" % (ucount, mcount) + elif self.dataType == 'RNA' and not initialize: + scount = self.getSplicesCount() + try: + print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount)) + except: + print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount) + + print "default cache size is %d pages" % self.getDefaultCacheSize() + if self.hasIndex(): + print "found index" + else: + print "not indexed" + + + def __len__(self): + """ return the number of usable reads in the dataset. + """ + try: + total = self.getUniqsCount() + except: + total = 0 + + try: + total += self.getMultiCount() + except: + pass + + if self.dataType == "RNA": + try: + total += self.getSplicesCount() + except: + pass + + try: + total = int(total) + except: + total = 0 + + return total + + + def __del__(self): + """ cleanup copy in local cache, if present. + """ + if self.cachedDBFile != "": + self.uncacheDB() + + + def cacheDB(self, filename): + """ copy geneinfoDB to a local cache. + """ + self.cachedDBFile = tempfile.mktemp() + ".db" + shutil.copyfile(filename, self.cachedDBFile) + + + def saveCacheDB(self, filename): + """ copy geneinfoDB to a local cache. + """ + shutil.copyfile(self.cachedDBFile, filename) + + + def uncacheDB(self): + """ delete geneinfoDB from local cache. + """ + global cachedDBFile + if self.cachedDBFile != "": + try: + os.remove(self.cachedDBFile) + except: + print "could not delete %s" % self.cachedDBFile + + self.cachedDB = "" + + + def attachDB(self, filename, asname): + """ attach another database file to the readDataset. + """ + stmt = "attach '%s' as %s" % (filename, asname) + self.execute(stmt) + + + def detachDB(self, asname): + """ detach a database file to the readDataset. + """ + stmt = "detach %s" % (asname) + self.execute(stmt) + + + def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""): + """ import into current RDS the table (with columns destcolumns, + with default all columns) from the database file asname, + using the column specification of ascolumns (default all). + """ + stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table) + if flagged != "": + stmt += " where flag = '%s' " % flagged + + self.execute(stmt, forceCommit=True) + + + def getTables(self, asname=""): + """ get a list of table names in a particular database file. + """ + resultList = [] + + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + if asname != "": + asname += "." + + stmt = "select name from %ssqlite_master where type='table'" % asname + sql.execute(stmt) + results = sql.fetchall() + + for row in results: + resultList.append(row["name"]) + + return resultList + + + def hasIndex(self): + """ check whether the RDS file has at least one index. 
+ """ + stmt = "select count(*) from sqlite_master where type='index'" + count = int(self.execute(stmt, returnResults=True)[0][0]) + if count > 0: + return True + + return False + + + def initializeTables(self, acon, cache=100000): + """ creates table schema in database connection acon, which is + typically a database file or an in-memory database. + """ + acon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache) + acon.execute("create table metadata (name varchar, value varchar)") + acon.execute("insert into metadata values('dataType','%s')" % self.dataType) + acon.execute("create table uniqs (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)") + acon.execute("create table multi (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)") + if self.dataType == "RNA": + acon.execute("create table splices (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, startL int, stopL int, startR int, stopR int, sense varchar, weight real, flag varchar, mismatch varchar)") + + acon.commit() + + + def getFileCursor(self): + """ returns a cursor to file database for low-level (SQL) + access to the data. + """ + return self.dbcon.cursor() + + + def getMemCursor(self): + """ returns a cursor to memory database for low-level (SQL) + access to the data. + """ + return self.memcon.cursor() + + + def getMetadata(self, valueName=""): + """ returns a dictionary of metadata. + """ + whereClause = "" + resultsDict = {} + + if valueName != "": + whereClause = " where name = '%s' " % valueName + + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + sql.execute("select name, value from metadata" + whereClause) + results = sql.fetchall() + + for row in results: + pname = row["name"] + pvalue = row["value"] + if pname not in resultsDict: + resultsDict[pname] = pvalue + else: + trying = True + index = 2 + while trying: + newName = pname + ":" + str(index) + if newName not in resultsDict: + resultsDict[newName] = pvalue + trying = False + + index += 1 + + return resultsDict + + + def getReadSize(self): + """ returns readsize if defined in metadata. + """ + metadata = self.getMetadata() + if "readsize" not in metadata: + print "no readsize parameter defined - returning 0" + return 0 + else: + mysize = metadata["readsize"] + if "import" in mysize: + mysize = mysize.split()[0] + + return int(mysize) + + + def getDefaultCacheSize(self): + """ returns the default cache size. + """ + return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0]) + + + def getChromosomes(self, table="uniqs", fullChrom=True): + """ returns a list of distinct chromosomes in table. + """ + statement = "select distinct chrom from %s" % table + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + sql.execute(statement) + results = [] + for row in sql: + if fullChrom: + if row["chrom"] not in results: + results.append(row["chrom"]) + else: + if len(row["chrom"][3:].strip()) < 1: + continue + + if row["chrom"][3:] not in results: + results.append(row["chrom"][3:]) + + results.sort() + + return results + + + def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True, + doMulti=False, doSplices=False): + """ returns the maximum coordinate for reads on a given chromosome. 
+ """ + maxCoord = 0 + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + if doUniqs: + try: + sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom) + maxCoord = int(sql.fetchall()[0][0]) + except: + print "couldn't retrieve coordMax for chromosome %s" % chrom + + if doSplices: + sql.execute("select max(startR) from splices where chrom = '%s'" % chrom) + try: + spliceMax = int(sql.fetchall()[0][0]) + if spliceMax > maxCoord: + maxCoord = spliceMax + except: + pass + + if doMulti: + sql.execute("select max(start) from multi where chrom = '%s'" % chrom) + try: + multiMax = int(sql.fetchall()[0][0]) + if multiMax > maxCoord: + maxCoord = multiMax + except: + pass + + if verbose: + print "%s maxCoord: %d" % (chrom, maxCoord) + + return maxCoord + + + def getReadsDict(self, verbose=False, bothEnds=False, noSense=False, fullChrom=False, chrom="", + flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False, + withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False, + readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False, + flagLike=False, strand="", entryDict=False, combine5p=False): + """ returns a dictionary of reads in a variety of formats + and which can be restricted by chromosome or custom-flag. + Returns unique reads by default, but can return multireads + with doMulti set to True. + """ + whereClause = [] + resultsDict = {} + + if chrom != "" and chrom != self.memChrom: + whereClause.append("chrom = '%s'" % chrom) + + if flag != "": + if flagLike: + flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") + whereClause.append(flagLikeClause) + else: + whereClause.append("flag = '%s'" % flag) + + if start > -1: + whereClause.append("start > %d" % start) + + if stop > -1: + whereClause.append("stop < %d" % stop) + + if len(readLike) > 0: + readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") + whereClause.append(readIDClause) + + if hasMismatch: + whereClause.append("mismatch != ''") + + if strand in ["+", "-"]: + whereClause.append("sense = '%s'" % strand) + + if len(whereClause) > 0: + whereStatement = string.join(whereClause, " and ") + whereQuery = "where %s" % whereStatement + else: + whereQuery = "" + + groupBy = [] + if findallOptimize: + selectClause = ["select start, sense, sum(weight)"] + groupBy = ["GROUP BY start, sense"] + else: + selectClause = ["select ID, chrom, start, readID"] + if bothEnds: + selectClause.append("stop") + + if not noSense: + selectClause.append("sense") + + if withWeight: + selectClause.append("weight") + + if withFlag: + selectClause.append("flag") + + if withMismatch: + selectClause.append("mismatch") + + if limit > 0 and not combine5p: + groupBy.append("LIMIT %d" % limit) + + selectQuery = string.join(selectClause, ",") + groupQuery = string.join(groupBy) + if doUniqs: + stmt = [selectQuery, "from uniqs", whereQuery, groupQuery] + if doMulti: + stmt.append("UNION ALL") + stmt.append(selectQuery) + stmt.append("from multi") + stmt.append(whereQuery) + stmt.append(groupQuery) + else: + stmt = [selectQuery, "from multi", whereQuery] + + if combine5p: + if findallOptimize: + selectQuery = "select start, sense, weight, chrom" + + if doUniqs: + subSelect = [selectQuery, "from uniqs", whereQuery] + if doMulti: + subSelect.append("union all") + subSelect.append(selectQuery) + subSelect.append("from multi") + subSelect.append(whereQuery) + else: + subSelect = [selectQuery, "from multi", whereQuery] + + sqlStmt = 
string.join(subSelect) + if findallOptimize: + selectQuery = "select start, sense, sum(weight)" + + stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union", + selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"] + + if findallOptimize: + if self.memBacked: + self.memcon.row_factory = None + sql = self.memcon.cursor() + else: + self.dbcon.row_factory = None + sql = self.dbcon.cursor() + + stmt.append("order by start") + elif readIDDict: + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + stmt.append("order by readID, start") + else: + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + stmt.append("order by chrom, start") + + sqlQuery = string.join(stmt) + sql.execute(sqlQuery) + + if findallOptimize: + resultsDict[chrom] = [[int(row[0]), row[1], float(row[2])] for row in sql] + if self.memBacked: + self.memcon.row_factory = sqlite.Row + else: + self.dbcon.row_factory = sqlite.Row + else: + currentChrom = "" + currentReadID = "" + pairID = 0 + for row in sql: + readID = row["readID"] + if fullChrom: + chrom = row["chrom"] + else: + chrom = row["chrom"][3:] + + if not readIDDict and chrom != currentChrom: + resultsDict[chrom] = [] + currentChrom = chrom + dictKey = chrom + elif readIDDict: + theReadID = readID + if "::" in readID: + (theReadID, multiplicity) = readID.split("::") + + if "/" in theReadID and withPairID: + (theReadID, pairID) = readID.split("/") + + if theReadID != currentReadID: + resultsDict[theReadID] = [] + currentReadID = theReadID + dictKey = theReadID + + if entryDict: + newrow = {"start": int(row["start"])} + if bothEnds: + newrow["stop"] = int(row["stop"]) + + if not noSense: + newrow["sense"] = row["sense"] + + if withWeight: + newrow["weight"] = float(row["weight"]) + + if withFlag: + newrow["flag"] = row["flag"] + + if withMismatch: + newrow["mismatch"] = row["mismatch"] + + if withID: + newrow["readID"] = readID + + if withChrom: + newrow["chrom"] = chrom + + if withPairID: + newrow["pairID"] = pairID + else: + newrow = [int(row["start"])] + if bothEnds: + newrow.append(int(row["stop"])) + + if not noSense: + newrow.append(row["sense"]) + + if withWeight: + newrow.append(float(row["weight"])) + + if withFlag: + newrow.append(row["flag"]) + + if withMismatch: + newrow.append(row["mismatch"]) + + if withID: + newrow.append(readID) + + if withChrom: + newrow.append(chrom) + + if withPairID: + newrow.append(pairID) + + resultsDict[dictKey].append(newrow) + + return resultsDict + + + def getSplicesDict(self, verbose=False, noSense=False, fullChrom=False, chrom="", + flag="", withWeight=False, withFlag=False, withMismatch=False, + withID=False, withChrom=False, withPairID=False, readIDDict=False, + splitRead=False, hasMismatch=False, flagLike=False, start=-1, + stop=-1, strand="", entryDict=False): + """ returns a dictionary of spliced reads in a variety of + formats and which can be restricted by chromosome or custom-flag. + Returns unique spliced reads for now. 
+ """ + whereClause = [] + resultsDict = {} + + if chrom != "" and chrom != self.memChrom: + whereClause = ["chrom = '%s'" % chrom] + + if flag != "": + if flagLike: + flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") + whereClause.append(flagLikeClause) + else: + whereClause.append("flag = '%s'" % flag) + + if hasMismatch: + whereClause.append("mismatch != ''") + + if strand != "": + whereClause.append("sense = '%s'" % strand) + + if start > -1: + whereClause.append("startL > %d" % start) + + if stop > -1: + whereClause.append("stopR < %d" % stop) + + if len(whereClause) > 0: + whereStatement = string.join(whereClause, " and ") + whereQuery = "where %s" % whereStatement + else: + whereQuery = "" + + selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"] + if not noSense: + selectClause.append("sense") + + if withWeight: + selectClause.append("weight") + + if withFlag: + selectClause.append("flag") + + if withMismatch: + selectClause.append("mismatch") + + selectQuery = string.join(selectClause, " ,") + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + if chrom == "" and not readIDDict: + stmt = "select distinct chrom from splices %s" % whereQuery + sql.execute(stmt) + for row in sql: + if fullChrom: + chrom = row["chrom"] + else: + chrom = row["chrom"][3:] + + resultsDict[chrom] = [] + elif chrom != "" and not readIDDict: + resultsDict[chrom] = [] + + stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery) + sql.execute(stmt) + currentReadID = "" + for row in sql: + pairID = 0 + readID = row["readID"] + if fullChrom: + chrom = row["chrom"] + else: + chrom = row["chrom"][3:] + + if readIDDict: + if "/" in readID: + (theReadID, pairID) = readID.split("/") + else: + theReadID = readID + + if theReadID != currentReadID: + resultsDict[theReadID] = [] + currentReadID = theReadID + dictKey = theReadID + else: + dictKey = chrom + + if entryDict: + newrow = {"startL": int(row["startL"])} + newrow["stopL"] = int(row["stopL"]) + newrow["startR"] = int(row["startR"]) + newrow["stopR"] = int(row["stopR"]) + if not noSense: + newrow["sense"] = row["sense"] + + if withWeight: + newrow["weight"] = float(row["weight"]) + + if withFlag: + newrow["flag"] = row["flag"] + + if withMismatch: + newrow["mismatch"] = row["mismatch"] + + if withID: + newrow["readID"] = readID + + if withChrom: + newrow["chrom"] = chrom + + if withPairID: + newrow["pairID"] = pairID + + if splitRead: + leftDict = newrow + del leftDict["startR"] + del leftDict["stopR"] + rightDict = newrow + del rightDict["start"] + del rightDict["stopL"] + resultsDict[dictKey].append(leftDict) + resultsDict[dictKey].append(rightDict) + else: + resultsDict[dictKey].append(newrow) + else: + newrow = [int(row["startL"])] + newrow.append(int(row["stopL"])) + newrow.append(int(row["startR"])) + newrow.append(int(row["stopR"])) + if not noSense: + newrow.append(row["sense"]) + + if withWeight: + newrow.append(float(row["weight"])) + + if withFlag: + newrow.append(row["flag"]) + + if withMismatch: + newrow.append(row["mismatch"]) + + if withID: + newrow.append(readID) + + if withChrom: + newrow.append(chrom) + + if withPairID: + newrow.append(pairID) + + if splitRead: + resultsDict[dictKey].append(newrow[:2] + newrow[4:]) + resultsDict[dictKey].append(newrow[2:]) + else: + resultsDict[dictKey].append(newrow) + + return resultsDict + + + def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False, + splices=False, reportCombined=True, sense="both"): + 
""" return read counts for a given region. + """ + ucount = 0 + mcount = 0 + scount = 0 + restrict = "" + if sense in ["+", "-"]: + restrict = " sense ='%s' " % sense + + if uniqs: + try: + ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict)) + except: + ucount = 0 + + if multi: + try: + mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict)) + except: + mcount = 0 + + if splices: + try: + scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict)) + except: + scount = 0 + + if reportCombined: + total = ucount + mcount + scount + return total + else: + return (ucount, mcount, scount) + + + def getTotalCounts(self, chrom="", rmin="", rmax=""): + return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both") + + + def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"): + """ returns the number of row in the uniqs table. + """ + whereClause = [] + count = 0 + + if chrom !="" and chrom != self.memChrom: + whereClause = ["chrom='%s'" % chrom] + + if rmin != "": + whereClause.append("%s >= %s" % (startField, str(rmin))) + + if rmax != "": + whereClause.append("%s <= %s" % (startField, str(rmax))) + + if restrict != "": + whereClause.append(restrict) + + if len(whereClause) > 0: + whereStatement = string.join(whereClause, " and ") + whereQuery = "where %s" % whereStatement + else: + whereQuery = "" + + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + if distinct: + sql.execute("select count(distinct chrom+start+sense) from %s %s" % (table, whereQuery)) + else: + sql.execute("select sum(weight) from %s %s" % (table, whereQuery)) + + result = sql.fetchone() + + try: + count = int(result[0]) + except: + count = 0 + + return count + + + def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): + """ returns the number of row in the splices table. + """ + return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL") + + + def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): + """ returns the number of distinct readIDs in the uniqs table. + """ + return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct) + + + def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): + """ returns the total weight of readIDs in the multi table. + """ + return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct) + + + def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1): + """ get readID's. + """ + stmt = [] + limitPart = "" + if limit > 0: + limitPart = "LIMIT %d" % limit + + if uniqs: + stmt.append("select readID from uniqs") + + if multi: + stmt.append("select readID from multi") + + if splices: + stmt.append("select readID from splices") + + if len(stmt) > 0: + selectPart = string.join(stmt, " union ") + else: + selectPart = "" + + sqlQuery = "%s group by readID %s" (selectPart, limitPart) + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + sql.execute(sqlQuery) + result = sql.fetchall() + + if paired: + return [x.split("/")[0][0] for x in result] + else: + return [x[0] for x in result] + + + def getMismatches(self, mischrom = None, verbose=False, useSplices=True): + """ returns the uniq and spliced mismatches in a dictionary. 
+ """ + revcomp = {"A": "T", + "T": "A", + "G": "C", + "C": "G", + "N": "N" + } + + readlen = self.getReadSize() + if mischrom: + hitChromList = [mischrom] + else: + hitChromList = self.getChromosomes() + hitChromList.sort() + + snpDict = {} + for achrom in hitChromList: + if verbose: + print "getting mismatches from chromosome %s" % (achrom) + + snpDict[achrom] = [] + hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, findallOptimize=False, hasMismatch=True) + if useSplices and self.dataType == "RNA": + spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True) + spliceIDList = spliceDict.keys() + for k in spliceIDList: + (startpos, lefthalf, rightstart, endspos, sense, mismatches) = spliceDict[k][0] + spMismatchList = mismatches.split(",") + for mismatch in spMismatchList: + if "N" in mismatch: + continue + + change_len = len(mismatch) + if sense == "+": + change_from = mismatch[0] + change_base = mismatch[change_len-1] + change_pos = int(mismatch[1:change_len-1]) + elif sense == "-": + change_from = revcomp[mismatch[0]] + change_base = revcomp[mismatch[change_len-1]] + change_pos = readlen - int(mismatch[1:change_len-1]) + 1 + + firsthalf = int(lefthalf)-int(startpos)+1 + secondhalf = 0 + if int(change_pos) <= int(firsthalf): + change_at = startpos + change_pos - 1 + else: + secondhalf = change_pos - firsthalf + change_at = rightstart + secondhalf + + snpDict[achrom].append([startpos, change_at, change_base, change_from]) + + if achrom not in hitDict: + continue + + for (start, sense, mismatches) in hitDict[achrom]: + mismatchList = mismatches.split(",") + for mismatch in mismatchList: + if "N" in mismatch: + continue + + change_len = len(mismatch) + if sense == "+": + change_from = mismatch[0] + change_base = mismatch[change_len-1] + change_pos = int(mismatch[1:change_len-1]) + elif sense == "-": + change_from = revcomp[mismatch[0]] + change_base = revcomp[mismatch[change_len-1]] + change_pos = readlen - int(mismatch[1:change_len-1]) + 1 + + change_at = start + change_pos - 1 + snpDict[achrom].append([start, change_at, change_base, change_from]) + + return snpDict + + + def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True, + useSplices=False, normalizationFactor = 1.0, trackStrand=False, + keepStrand="both", shiftValue=0): + """return a profile of the chromosome as an array of per-base read coverage.... + keepStrand = 'both', 'plusOnly', or 'minusOnly'. + Will also shift position of unique and multireads (but not splices) if shift is a natural number + """ + metadata = self.getMetadata() + readlen = int(metadata["readsize"]) + dataType = metadata["dataType"] + scale = 1. / normalizationFactor + shift = {} + shift["+"] = int(shiftValue) + shift["-"] = -1 * int(shiftValue) + + if cstop > 0: + lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen + else: + lastNT = cstop - cstart + readlen + shift["+"] + + chromModel = array("f", [0.] 
* lastNT) + hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True) + if cstart < 0: + cstart = 0 + + for (hstart, sense, weight) in hitDict[chromosome]: + hstart = hstart - cstart + shift[sense] + for currentpos in range(hstart,hstart+readlen): + try: + if not trackStrand or (sense == "+" and keepStrand != "minusOnly"): + chromModel[currentpos] += scale * weight + elif sense == '-' and keepStrand != "plusOnly": + chromModel[currentpos] -= scale * weight + except: + continue + + del hitDict + if useSplices and dataType == "RNA": + if cstop > 0: + spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop) + else: + spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True) + + if chromosome in spliceDict: + for (Lstart, Lstop, Rstart, Rstop, rsense, readName) in spliceDict[chromosome]: + if (Rstop - cstart) < lastNT: + for index in range(abs(Lstop - Lstart)): + currentpos = Lstart - cstart + index + # we only track unique splices + if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"): + chromModel[currentpos] += scale + elif rsense == "-" and keepStrand != "plusOnly": + chromModel[currentpos] -= scale + + for index in range(abs(Rstop - Rstart)): + currentpos = Rstart - cstart + index + # we only track unique splices + if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"): + chromModel[currentpos] += scale + elif rsense == "-" and keepStrand != "plusOnly": + chromModel[currentpos] -= scale + + del spliceDict + + return chromModel + + + def insertMetadata(self, valuesList): + """ inserts a list of (pname, pvalue) into the metadata + table. + """ + self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList) + self.dbcon.commit() + + + def updateMetadata(self, pname, newValue, originalValue=""): + """ update a metadata field given the original value and the new value. + """ + stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname) + if originalValue != "": + stmt += " and value='%s' " % str(originalValue) + + self.dbcon.execute(stmt) + self.dbcon.commit() + + + def insertUniqs(self, valuesList): + """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) + into the uniqs table. + """ + self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + + def insertMulti(self, valuesList): + """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) + into the multi table. + """ + self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + + def insertSplices(self, valuesList): + """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) + into the splices table. + """ + self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + + def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"): + """ update reads on file database in a list region of regions for a chromosome to have a new flag. 
+ regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with + sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense). + """ + restrict = "" + if sense != "both": + restrict = " and sense = ? " + + if uniqs: + self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList) + + if multi: + self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList) + + if self.dataType == "RNA" and splices: + self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList) + self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList) + + self.dbcon.commit() + + + def setFlags(self, flag, uniqs=True, multi=True, splices=True): + """ set the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch. + """ + if uniqs: + self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag) + + if multi: + self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag) + + if self.dataType == 'RNA' and splices: + self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag) + + self.dbcon.commit() + + + def resetFlags(self, uniqs=True, multi=True, splices=True): + """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch. + """ + if uniqs: + self.dbcon.execute("UPDATE uniqs SET flag = ''") + + if multi: + self.dbcon.execute("UPDATE multi SET flag = ''") + + if self.dataType == "RNA" and splices: + self.dbcon.execute("UPDATE splices SET flag = ''") + + self.dbcon.commit() + + + def reweighMultireads(self, readList): + self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList) + + + def setSynchronousPragma(self, value="ON"): + try: + self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value) + except: + print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value + + + def setDBcache(self, cache, default=False): + self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache) + if default: + self.dbcon.execute('PRAGMA DEFAULT_CACHE_SIZE = %d' % cache) + + + def execute(self, statement, returnResults=False, forceCommit=False): + if self.memBacked: + sql = self.memcon.cursor() + else: + sql = self.dbcon.cursor() + + sql.execute(statement) + if returnResults: + result = sql.fetchall() + return result + + if forceCommit: + if self.memBacked: + self.memcon.commit() + else: + self.dbcon.commit() + + + def buildIndex(self, cache=100000): + """ Builds the file indeces for the main tables. + Cache is the number of 1.5 kb pages to keep in memory. + 100000 pages translates into 150MB of RAM, which is our default. 
+ """ + if cache > self.getDefaultCacheSize(): + self.setDBcache(cache) + self.setSynchronousPragma("OFF") + self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") + print "built uPosIndex" + self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)") + print "built uChromIndex" + self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") + print "built mPosIndex" + self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)") + print "built mChromIndex" + + if self.dataType == "RNA": + self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)") + print "built sPosIndex" + self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)") + print "built sPosIndex2" + self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)") + print "built sChromIndex" + + self.dbcon.commit() + self.setSynchronousPragma("ON") + + + def dropIndex(self): + """ drops the file indices for the main tables. + """ + try: + self.setSynchronousPragma("OFF") + self.dbcon.execute("DROP INDEX uPosIndex") + self.dbcon.execute("DROP INDEX uChromIndex") + self.dbcon.execute("DROP INDEX mPosIndex") + self.dbcon.execute("DROP INDEX mChromIndex") + + if self.dataType == "RNA": + self.dbcon.execute("DROP INDEX sPosIndex") + try: + self.dbcon.execute("DROP INDEX sPosIndex2") + except: + pass + + self.dbcon.execute("DROP INDEX sChromIndex") + + self.dbcon.commit() + except: + print "problem dropping index" + + self.setSynchronousPragma("ON") + + + def memSync(self, chrom="", index=False): + """ makes a copy of the dataset into memory for faster access. + Can be restricted to a "full" chromosome. Can also build the + memory indices. + """ + self.memcon = "" + self.memcon = sqlite.connect(":memory:") + self.initializeTables(self.memcon) + cursor = self.dbcon.cursor() + whereclause = "" + if chrom != "": + print "memSync %s" % chrom + whereclause = " where chrom = '%s' " % chrom + self.memChrom = chrom + else: + self.memChrom = "" + + self.memcon.execute("PRAGMA temp_store = MEMORY") + self.memcon.execute("PRAGMA CACHE_SIZE = 1000000") + # copy metadata to memory + self.memcon.execute("delete from metadata") + results = cursor.execute("select name, value from metadata") + results2 = [] + for row in results: + results2.append((row["name"], row["value"])) + + self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2) + # copy uniqs to memory + results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from uniqs" + whereclause) + results2 = [] + for row in results: + results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) + + self.memcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2) + # copy multi to memory + results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from multi" + whereclause) + results2 = [] + for row in results: + results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) + + self.memcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2) + # copy splices to memory + if self.dataType == "RNA": + results = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices" + whereclause) + results2 
= [] + for row in results: + results2.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) + + self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, weight, sense, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", results2) + if index: + if chrom != "": + self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)") + self.memcon.execute("CREATE INDEX mPosIndex on multi(start)") + if self.dataType == "RNA": + self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)") + self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)") + else: + self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") + self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") + if self.dataType == "RNA": + self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)") + self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)") + + self.memBacked = True + self.memcon.row_factory = sqlite.Row + self.memcon.commit() diff --git a/crossmatch.py b/crossmatch.py new file mode 100755 index 0000000..6a36758 --- /dev/null +++ b/crossmatch.py @@ -0,0 +1,38 @@ +try: + import psyco + psyco.full() +except: + pass + +import sys +from cistematic.core.orthomatcher import orthoMatcher + + +def main(argv=None): + if not argv: + argv = sys.argv + + print "version 1.1" + if len(argv) < 7: + print "usage: python %s prefix directory genome1 genefile1 genome2 genefile2 [genome3 genefile3 .....]" % argv[0] + sys.exit(1) + + prefix = argv[1] + directory = argv[2] + matchFiles = {} + + genomesToMatch = (len(argv) - 3) / 2 + for index in range(genomesToMatch): + genome = argv[3 + index * 2] + print genome + if genome not in matchFiles: + matchFiles[genome] = [] + + matchFiles[genome].append(argv[4 + index * 2]) + + print matchFiles + orthoMatcher(matchFiles, prefix, directory, fileList=True) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/distalPairs.py b/distalPairs.py new file mode 100755 index 0000000..d24781a --- /dev/null +++ b/distalPairs.py @@ -0,0 +1,133 @@ +# +# distalPairs.py +# ENRAGE +# +# Created by Ali Mortazavi on 10/14/08. 
+# + + +try: + import psyco + psyco.full() +except: + pass + +from commoncode import readDataset +import sys, time, optparse + + +def main(argv=None): + if not argv: + argv = sys.argv + + print "%prog: version 3.3" + print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM" + usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly") + parser.add_option("--splices", action="store_true", dest="doSplices") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.add_option("--maxDist", type="int", dest="maxDist") + parser.add_option("--cache", type="int", dest="cachePages") + parser.set_defaults(sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + minDist = int(args[0]) + rdsfile = args[1] + outfilename = args[2] + + distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages) + + +def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None): + if cachePages is not None: + doCache = True + else: + doCache = False + cachePages = -1 + + RDS = readDataset(rdsfile, verbose = True, cache=doCache) + if not RDS.hasIndex(): + print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun" + sys.exit(1) + + if cachePages > RDS.getDefaultCacheSize(): + RDS.setDBcache(cachePages) + + print time.ctime() + + if doSplices: + print "getting splices" + splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True) + print "got splices" + + print "getting uniq reads" + uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True) + print "got uniqs" + + if doSplices: + for readID in splicesDict: + theRead = splicesDict[readID] + read0 = theRead[0] + del read0[1] + try: + uniqDict[readID].append(read0) + except: + if len(theRead) == 4: + read2 = theRead[2] + del read2[1] + uniqDict[readID] = [read0,read2] + + if doVerbose: + print len(uniqDict), time.ctime() + + outfile = open(outfilename,"w") + + diffChrom = 0 + distal = 0 + total = 0 + for readID in uniqDict: + readList = uniqDict[readID] + if len(readList) == 2: + total += 1 + (start1, sense1, chrom1, pair1) = readList[0] + (start2, sense2, chrom2, pair2) = readList[1] + + if chrom1 != chrom2: + diffChrom += 1 + if sameChromOnly: + continue + else: + outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2) + outfile.write(outline + "\n") + if doVerbose: + print diffChrom, outline + else: + dist = abs(start1 - start2) + + if minDist < dist < maxDist: + distal += 1 + outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist) + outfile.write(outline + "\n") + if doVerbose: + print distal, outline + + outfile.write("#distal: %d\tdiffChrom: %d\tpossible: %d\n" % (distal, diffChrom, total)) + total = float(total) + if total < 1: + total = 1. + + outfile.write("#distal %2.2f pct\tdiffChrom %2.2f pct\n" % ((100. * distal/total), (100. 
* diffChrom/total))) + outfile.close() + print "distal: %d\tdiffChrom: %d\tpossible: %d" % (distal, diffChrom, int(total)) + print "distal: %2.2f pct\tdiffChrom: %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total)) + print time.ctime() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/docs/ERANGE.copyright b/docs/ERANGE.copyright new file mode 100644 index 0000000..ac0d4fd --- /dev/null +++ b/docs/ERANGE.copyright @@ -0,0 +1,29 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2007-09 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# diff --git a/docs/README.build-rds b/docs/README.build-rds new file mode 100644 index 0000000..ef668d2 --- /dev/null +++ b/docs/README.build-rds @@ -0,0 +1,328 @@ +This is a description of the sqlite-based read storage +files and of the scripts designed to import read +mappings from supported short read mappers. The code +should run on any Unix-like system supporting python 2.5 +or better. The code is developed on Linux and MacOS X on +python 2.5. + +This code is made available as open-source, as described +in the copyright file ERANGE.COPYRIGHT. + +1. REQUIREMENTS +2. COMMAND LINE OPTIONS +3. CREATING THE NECESSARY INPUT (RDS) FILES +4. BUILDING EXPANDED GENOMES +5. MAPPING READS WITH ELAND +6. MAPPING READS WITH BOWTIE +7. MAPPING READS WITH BLAT +8. IMPORTING BED FILES +9. COMBINING RDS FILES +10. MANIPULATING RDS METADATA AND CACHING +11. VISUALIZING THE DATA IN RDS FILES + + +1. REQUIREMENTS + +See README.chip-seq or README.rna-seq to see the requirements +for installing and running ERANGE specific to each +application. + + +2. COMMAND LINE OPTIONS + +You can find out more about the settings for each script +by typing: + +python $ERANGEPATH/ + +to see the command line options, where ERANGEPATH is the +environmental variable set to the path to the directory +holding the ERANGE scripts. Note that the command line +options are case sensitive and that they could well +fail silently. + + +3. 
CREATING THE NECESSARY INPUT (RDS) FILES + +Before you can use the rest of the ERANGE scripts to do +CHiP-seq or RNA-seq analyses, you will need to first +convert your read mappings to the native ERANGE read +storage format, which is sqlite-based, and which is +called RDS (Read DataSet). RDS files consist of four +tables: +- metadata (tracks required and optional metadata) +- uniqs (stores uniquely mappable-reads) +- multi (stores reads that map equally well to multiple +locations in the genome) +- splices (stores split reads) + +a readDataset python object (in commoncode.py) provides +the encapsulation of the read database which is accessed +through specific methods. Since an RDS file is a sqlite3 +database, you can additionally use any of the sqlite-based +tools to look at the reads in the tables, if you wish to +do so. + +You will need to first map your reads with one of the +supported read mappers (see next paragraph) against a copy +of the appropriate genome. For ChIP-seq, it will be your +genome of interest, whereas for RNA-seq reads should be +mapped against an expanded genome, which consists of +chromosomes + splice junctions which depend on the read +length used. Note that several parts of the code assume +that your genomic sequences are labelled with the "chr" +chromosomes prefix. For more information on creating +expanded genomes, see BUILDING EXPANDED GENOMES. + +The currently supported read mappers are: +- Eland (part of the Illumina GA pipeline) +- Bowtie (bowtie-bio.sourceforge.net) +- Blat (from UCSC) + +These are described in the sections on MAPPING READS WITH +ELAND, MAPPING READS WITH BOWTIE, MAPPING READS WITH BLAT. + +For ChIP-seq, you can also import bed files of unique reads +only using makerdsfrombed.py . + +Also see MANIPULATING RDS METADATA AND CACHING to learn about +some important aspects of working with RDS files. + + +4. BUILDING EXPANDED GENOMES + +For RNA-seq using ELAND or BOWTIE mappings, you will need to build +an expanded genome consisting of genomic sequences, spike sequences, +and splice-spanning sequences in order to run ERANGE on your own +datasets. This expanded genome is specific to the read size used, +i.e. there will be a different expanded genome for mouse when using +25bp reads or 32bp reads. For reads longer than 32 bp, we recommend +using BOWTIE. If your reads are longer than 50bp, consider using +BLAT instead. + +Download the chromosomes from UCSC, as well as the knownGene.txt (or +equivalent table) and a directory of repeatmask annotations for each +chromosome (also from UCSC) for your genome of interest. + +You will need to build a splice fasta file using the script +getsplicefa.py, which needs Cistematic, the knownGene table, and a +paremeter for splice radius, which is 4 bp shorter than the length +of the reads. + +Once you have the splice fasta file, drop it into the same directory +as well as a fasta file for your spikes. Then use squashGenome +(part of Eland) or bowtie-build (part of Bowtie), to build the +expanded genome. Please refer to the documentation for each +package to run the genome squasher/builder. + +You will also build a repeat database using buildrmaskdb.py for use +in the candidate exon analysis from UCSC repeatmasker annotations. + + +5. MAPPING READS WITH ELAND + +Please refer to the Illumina documentation for the details on +running squashGenome and Eland. If you do not have access to the +Illumina pipeline, use bowtie as described in the next section. 
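+
+Whichever mapper you use, the import scripts described below all
+produce an RDS file, which is an ordinary sqlite3 database (see
+CREATING THE NECESSARY INPUT (RDS) FILES above). You can therefore
+peek at an imported dataset directly with the sqlite3 module bundled
+with python 2.5. The snippet below is only an illustrative sketch,
+not one of the ERANGE scripts, and the file name test.rds is made up:
+
+import sqlite3
+
+conn = sqlite3.connect("test.rds")
+conn.row_factory = sqlite3.Row
+
+# the RDS schema: metadata, uniqs, multi and (for RNA datasets) splices
+for row in conn.execute("select name from sqlite_master where type='table'"):
+    print row["name"]
+
+# the key::value annotations stored with the dataset
+for row in conn.execute("select name, value from metadata"):
+    print "%s\t%s" % (row["name"], row["value"])
+
+# number of uniquely mappable reads on one chromosome
+print conn.execute("select count(*) from uniqs where chrom='chr1'").fetchone()[0]
+
+conn.close()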
+ +For ChIP-seq, you could take the output of the Illumina pipeline, +e.g. eland_multi.txt or eland_extended.txt and use them as inputs +for makerdsfromeland2.py . + +Once you have run Eland with the --multi option (which we +colloquially call "eland2") for each RNA-seq lane against the +expanded genome, combine all of the outputs for one sample into a +single file e.g. test.comb.eland2 + +The makerdsfromeland2.py script is used to import the reads +into RDS: + +python makerdsfromeland2.py label infilename outrdsfile [-append] [-RNA ucscGeneModels] +[propertyName::propertyValue] [-index] [-paired 1 or 2] [-extended] [-verbose] +[-olddelimiter] [-maxlines num] [-cache numPages] + +The first 3 arguments are required: +- label is any label that you wish (a combination flowcell+lane# +is a good choice) +- infilename is the output of eland in eland_multi format +(default) or eland_extended format (with the -extended flag) +- outdbname is the name of the rds file, e.g. test.rds + +If the reads are from paired-end runs, enter each eland_multi +(or extended) file separately with the "-paired 1" or "-paired 2" +flag, as appropriate. + +If entering more than one lane, use -append for all subsequent +lanes. Upon entering the last lane, use -index to build a read +index. Refer to MANIPULATING RDS METADATA AND CACHING for +information on the optional property::value pairs and caching. + +For RNA-seq, you must in addition specify the path to knownGene.txt +using the -RNA flag, e.g. + +python $ERANGEPATH/makerdsfromeland2.py myRNAlabel myRNA.eland_multi.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options] + + +6. MAPPING READS WITH BOWTIE + +Bowtie (bowtie-bio.sourceforge.net) is a new read-mapper that +is very fast and friendly. ERANGE supports version 0.10.X +and higher that allow you to control how many multireads +are reported. We recommend the following settings: + +$BOWTIEDIR/bowtie zzz -v 2 -k 11 -m 10 -t --strata --best -f s1.query32.txt --un s1.unm.fa --max s1.max.fa s1.zzz.bowtie.txt + +where zzz is the genome prefix that you gave when building the +genome. In particular, we ask bowtie to map all multireads up +to 11 ("-k") with up to 2 mismatches ("-v" and "--best"), however +we will only import all multireads up to 10x multiplicity ("-m"). +Note that bowtie is multithreaded and can use multiple cpu based +on the -p flag (e.g. use "-p 4" to use 4 CPUs). Unmapped reads +are saved in unmapped.fa for later analysis. + +Once reads are mapped, they can be imported using: + +python $ERANGEPATH/makerdsfrombowtie.py testLabel s1.mm9.bowtie.txt bowtietest.rds + +The options for the script are: + +python makerdsfrombowtie.py label infilename outrdsfile +[-RNA ucscGeneModels] [-append] [-index] [propertyName::propertyValue] +[-rawreadID] [-verbose] [-cache numPages] + +Refer to "MAPPING READS WITH ELAND" for a description of label, +infilename, outdbname, '-append', '-index', and '-cache'. + +****REMEMBER TO USE -index WHEN LOADING THE LAST LANE OF YOUR +DATASET.**** + +The script assumes that the read ID are from Illumina, i.e. that +they have multiple fields separated by ':' and that paired-end +reads have an additional '/1' or '/2' depending on the end. +It will by default strip the first part of the readID (up to the +first ':') and replace it with the label. If you want raw readIDs +because you mapped raw reads that do not have an associated ID or +an ID that doesn't follow Illumina's conventions, use -rawreadID. 
+ +If not using Illumina readIDs, use any identifier of the format + +throw_away:uniqueid if unpaired +throw_away:uniqueid/1 and throw_away:uniqueid/2 for paired-ends. + +For RNA-seq, you must in addition specify the path to knownGene.txt +using the -RNA flag, e.g. + +python $ERANGEPATH/makerdsfrombowtie.py myRNAlabel myRNA.bowtie.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options] + + +7. MAPPING READS WITH BLAT + +BLAT SUPPORT IN ERANGE IS STILL UNDER DEVELOPMENT AND THE +SCRIPTS AND SETTINGS BELOW MAY BE OPTIMIZED FURTHER IN +FUTURE RELEASES OF ERANGE. + +Reads longer than 40-50bp can be fruitfully mapped with BLAT +against the reference genome without needing to provide the +exon junctions. While BLAT is much slower than BOWTIE, it +has the great advantage of seeing novel splices (i.e. +splices not present in knownGene models). + +We use the following settings to map 75bp reads with BLAT and +filter them with pslReps: + +$BLATPATH/blat /tmp/hg18.fa s3_1.query75.txt -out=pslx s3_1.hg18.blat +$BLATPATH/pslReps -minNearTopSize=70 s3_1.hg18.blat s3_1.hg18.blatbetter s3_1.blatpsr + +where the binaries are in $BLATPATH anywhere on your system. + +Once the reads have been filtered, the makerdsfromblat.py +script is used to import the mapped reads (in the example +above s3_1.hg18.blatbetter) into RDS: + +python makerdsfromblat.py label infilename outrdsfile [-append] [-index] [propertyName::propertyValue] +[-rawreadID] [-forceRNA] [-flag] [-strict minSpliceLen] [-spliceonly] [-verbose] [-cache numPages] + +If you are using BLAT for RNA-seq, please be sure to use +-forceRNA in order to import spliced reads and consider +using -strict to require a minimum length of bases on +each side of the splice. + +You can combine BOWTIE and BLAT by mapping reads with BOWTIE +first, and then using BLAT to map the unmapped reads. In +that case, you may want to only load the spliced reads +using the -spliceonly flag. To track those reads in the RDS +file, use -flag ; you can then retrieve those reads using +the options "-flag blat -flagLike" with the makebedfromrds.py +script. + + +8. IMPORTING BED FILES + +If you do not have the raw read data, you can import unique +reads only using the script makerdsfrombed.py . Note that +this is not particularly useful for RNA-seq since you will +have neither the multireads nor the spliced reads. + +The command line options are similar to those for other +scripts described in part 5-7: + +python makerdsfrombed.py label bedfile outrdsfile [-append] [-index] [propertyName::propertyValue] [-cache numPages] + + +9. COMBINING RDS FILES + +Previously created RDS files can be combined into a new RDS +dataset using the combinerds.py command with the granularity +of importing all tables or specific ones (e.g. uniqs, splices). + +The combinerds.py command options are: + +python combinerds.py destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [-init] [-initrna] [-index] [-cache pages] + + +10. MANIPULATING RDS METADATA AND CACHING + +One of the advantages of RDS over bed, is the possibility of +attaching arbitrary sets of annotations with the data, which +are then carried along. Both the makerds* scripts and +rdsmetadata.py allows you to both enter key::value +combinations. Entering a key multiple times will cause the +same instance to be recorded multiple times, which is +appropriate in some settings (e.g. to enter flowcell info). 
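For example, descriptive metadata can be attached when the reads are
first imported (the name::value pairs below are arbitrary
illustrations patterned on RNA-seq.analysisSteps.txt; any names and
values of your choosing will work):

python $ERANGEPATH/makerdsfrombowtie.py testLabel s1.mm9.bowtie.txt bowtietest.rds -index library::10213 cellLine::C2C12 flowcell::200GFAAXX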
In addition, rdsmetadata.py allows you to inspect various
attributes of your RDS files, such as the number of reads
and the default cache size.

Sqlite files have a certain amount of RAM set aside as cache
for lookups, indexes, etc., where the amount is measured in
1.5 kb pages. Each RDS instance comes with a default of 100000
pages (150 MB) of cache, which is too small in most
situations. Whenever appropriate, try using more cache (e.g.
750000 pages on a 2 GB RAM machine, much more if more RAM is
available) for a significant speed increase in indexing and
lookups. You can change the default value for each RDS file
by using the -defaultcache option of rdsmetadata.py.

Note that sqlite can be very slow over NFS. Wherever
possible, copy your RDS file locally before running an I/O
intensive script.


11. VISUALIZING THE DATA IN RDS FILES

You can output bed-files of the raw reads using
makebedfromrds.py. A more practical way to look at the data
might be to output it as a bedGraph file using makewiggle.py .

Note that UCSC has a hard limit on the size of their files
and you will likely need to break the wiggles on a per-chromosome
basis for mammalian genomes.

RELEASE HISTORY

version 3.2 October 2009 - added combinerds.py
version 3.01 February 2009 - bug fixes
version 3.0 January 2009 - added logging to buildrdsfrom*
version 3.0rc1 December 2008 - added blat support


diff --git a/docs/README.chip-seq b/docs/README.chip-seq
new file mode 100644
index 0000000..6529a6f
--- /dev/null
+++ b/docs/README.chip-seq
@@ -0,0 +1,232 @@
This is an updated version of the core of the ChIP-seq
analysis code described in Johnson et al (2007). It
should run on any Unix-like system supporting python 2.5
or better. The code is developed on Linux and MacOS X on
python 2.5.

These scripts in the ChIPSeqMini package are now part of
the ERANGE package, but are still available as a
standalone package for now.

This code is made available as open-source, as described
in the copyright file ERANGE.COPYRIGHT.


1. REQUIREMENTS
2. COMMAND LINE OPTIONS
3. MAKING THE NECESSARY INPUT (RDS) FILES
4. WEIGHING MULTIREADS
5. RUNNING THE PEAK FINDER
6. DISPLAYING DATA ONTO THE UCSC GENOME BROWSER
7. DOWNSTREAM ANALYSES


1. REQUIREMENTS

1) Python 2.5 is required because some of the scripts and
Cistematic (see below) need pysqlite, which is now bundled in
Python.

2) You will also need to use Cistematic 2.3 (available at
cistematic.caltech.edu) for all of the scripts that are
part of the downstream analyses.

(optional) Use of the psyco module (psyco.sf.net) on 32-bit
Linux or Mac Intel machines is highly recommended.

(optional) Three visualization scripts also depend on the
additional package pylab (matplotlib). These scripts are:
- getgosig.py
- plotbardist.py
- scatterfields.py
You do not need to install pylab if you will be
visualizing some of your analysis results differently.


2. COMMAND LINE OPTIONS

You can find out more about the settings for each script
by typing:

python $ERANGEPATH/<scriptname>

to see the command line options, where ERANGEPATH is the
environmental variable set to the path to the directory
holding the ERANGE scripts. Note that the command line
options are case sensitive and that unrecognized options
may fail silently.


3. MAKING THE NECESSARY INPUT (RDS) FILES

You will want to first convert your read mappings to the
native ERANGE read store. Please see the file
README.build-rds for instructions on how to do this.
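As a minimal sketch of the two imports discussed below (assuming
Bowtie mappings and hypothetical file names; see README.build-rds
for the full set of options):

python $ERANGEPATH/makerdsfrombowtie.py chipLabel chip.mm9.bowtie.txt chip.rds -index -cache 750000
python $ERANGEPATH/makerdsfrombowtie.py controlLabel control.mm9.bowtie.txt control.rds -index -cache 750000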
Build an RDS file for the ChIP and, if available and
appropriate, for the control. Note that we *HIGHLY* recommend
the use of a matched control sample to account for some
of the general background artifacts that can be present
in ChIP-seq samples (e.g. DNase hypersensitivity,
assembly collapse of some satellite repeats, etc.).


4. WEIGHING MULTIREADS

Version 3.0 of the peak finder can use multireads, i.e.
reads that map equally well to more than one location
in the genome, to find binding sites that are in low
copy-number non-unique regions (typically fewer than 10
copies).

ERANGE offers 3 ways to analyze these regions:
(a) default weighing of 1/multiplicity
(b) ignoring multireads
(c) weighing of multireads based on unique reads in a
given radius

(a) is the default in the current release of ERANGE.
Simply proceed to RUNNING THE PEAK FINDER for (a) and
(b). You can ignore multireads (b) by using the -nomulti
flag with findall.py. For (c), use weighMultireads.py
to weigh multireads based on the unique reads in the
respective radius of each potential location. Once run,
proceed to the section below.


5. RUNNING THE PEAK FINDER

To run the peak finder without read shifting, use the
following command:

python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -listPeak -revbackground

which will run the peak finder on chip.rds / control.rds,
store the enriched region coordinates in chip.regions.txt,
also store the actual local maximum in each region in the
same file, and also calculate an FDR by running the
finder on control.rds / chip.rds.

A log file (findall.log by default, change with -log)
tracks the settings used to run the program as well as
some of the summary statistics, which are also stored
at the bottom of the regions.txt output file.

findall.py is tuned to conservative settings for 10-12M
mappable read IPs of static, sequence-specific
transcription factors in mammals with very short
fragment sizes, on the order of 40-60 bp.

You will *NEED* to change some of the default parameters
if working in smaller genomes (e.g. use smaller -spacing),
if working with certain types of IPs such as histones and
polymerases (test with and without -notrim and
-nodirectionality), if working with rather weak IPs
(e.g. -minimum and -ratio), or if working with larger
fragment sizes (see the paragraph below discussing read
shifting).

findall.py returns a per-peak p-value. By default, this
is calculated using a Poisson distribution of peak RPMs
(or counts, if using -raw) for each chromosome in the IP.
P-value calculations can be turned off using
'-pvalue none '. Alternatively, the p-value can be
calculated from the background using the option
'-pvalue back ', which must be combined with the option
-revbackground.

By default, findall.py does not try to adjust the location
of the reads based on half the size of the expected fragment
length (the "shift"). If you believe that you need to shift
your peaks, findall.py can try to pick the best shift based
on the best shift for strong sites using the parameter
'-shift learn '. You can also either manually specify a
shift value using '-shift #bp ' or calculate a
"best shift" for each region using '-autoshift'. If you
need to use the shift options, the recommended usage is:
(i) first run findall.py with '-shift learn ', which will
pick a shift if there are at least 30 regions that meet
its training criteria.
+(ii) if (i) couldn't pick a shift, run findall.py with +-autoshift and -reportshift +(iii) look at the mode (most common #) for the shift +(iv) rerun findall.py with -shift #bp where #bp is the mode + +If you are storing the RDS files on an network-mounted +directory, make sure to use '-cache XXXXX' to enable +local caching, where is as large as appropriate as +described in section 9 of README.build-rds . + +Note that ERANGE will cache by default to /tmp, but this +can be redirected to any directory pointed to by the +environmental variable CISTEMATIC_TEMP. + +To find out the current default settings and options, +simply type: + +python $ERANGEPATH/findall.py + +for more information. + + +6. DISPLAYING DATA ONTO THE UCSC GENOME BROWSER + +You can output bed-files of the raw reads using +makebedfromrds.py and BEDGRAPH file using +makewiggle.py as described in README.build-rds . + +You can create bed files of regions and sites (see +below) using regiontobed.py and makesitetrack.py . + + +7. DOWNSTREAM ANALYSES + +Recall that Cistematic 2.3 is a required to do motif +and gene-level analyses of the output of findall.py. + +Use getallgenes.py to find the nearest gene within a +radius of each binding site. + +Use analyzego.py to do a Gene Ontology enrichment +analysis of a gene list (such as from getallgenes.py). +You can look at a heatmap of your GO enrichments using +getgosig.py. You can also use getGOgenes.py to look at +the genes with particular GO annotations. + +To do motif-finding, use getfasta.py to get the sequences +centered on the peaks of your regions of interest. For +the sake of a pleasant experience, try limiting yourself +to less than 100kb of combined sequence (the easiest being +by picking your regions with the strongest signals). + +Once you have a fasta file of the regions of interest, you +can use findMotifs.py to find motifs using either +cisGreedy (bundled with Cistematic 2.2) which is good for +shorter motifs or Meme (must be installed separately - +refer to the instructions on cistematic.caltech.edu for +more information), which is better for longer motifs. +findmotifs.py will return a set motifs in Cistematic format +with a .mot extension. These motifs can then be used with +getallsites.py to get the coordinates and instances of each +motif in all of the regions found by the peak finder. + +The sites can be checked against repeat-masker annotations +(preloaded from UCSC with buildrmaskdb.py) using +checkrmask.py. The sites for each motif can also be fed +back into getallgenes.py to get genes, redo the GO analyses, +etc.... + +You can use the intersect scripts (intersects.py, +gointersects.py, and siteintersects.py) to compare different +sets of genes/GO/site results across multiple experiments, +for example. + + +RELEASE HISTORY + +version 3.1 February 2009 - support for read shifting +version 3.0 February 2009 - support for UCSC narrowPeak format in regiontobed.py +version 3.0rc1 December 2008 - added parameter to control peak-trimming +version 3.0b2 December 2008 - added per-peak p-value +version 3.0b November 2008 - initial release of RDS-based code +with support for eland and bowtie. + diff --git a/docs/README.rna-esnp b/docs/README.rna-esnp new file mode 100644 index 0000000..fbb2b96 --- /dev/null +++ b/docs/README.rna-esnp @@ -0,0 +1,75 @@ +This is a description of the pipeline designed to analyze single +nucleotide changes found in the mapped reads. The code should run +on any Unix-like system supporting python 2.5 or better. 
The code +is developed on MacOS X on python 2.5. + +1. COMMAND LINE OPTIONS +2. BUILDING THE SNP DATABASE +3. RUNNING THE SNP PIPELINE + + +1. COMMAND LINE OPTIONS + +To find out more about the settings for each script, type: + +python $ERANGEPATH/ + +to see the command line options. Note that all ERANGE command-line +options are case-sensitive & that the scripts typically ignore +command-line arguments that they do not recognize! + + +2. BUILDING THE SNP DATABASE + +In order to check the candidate SNPs versus known SNPs, you will need +to first download the corresponding dbSNP database file from UCSC and +then build a sqlite version of it using: + +python $ERANGEPATH/buildsnpdb.py ucscSNPfile outdb + +e.g. + +python buildsnpdb.py snp128.txt dbSNP128 + + +3. RUNNING THE SNP PIPELINE + +The runSNPAnalysis.sh shell script is designed to retrieve SNPs, filter +them against repeat annotations, cross-check them against known SNPs and +annotate the novel SNPs. It will automatically run a set of python scripts +that are required for the SNPs analysis using the RDS (Read DataSet) file. +This script assumes the existence of a known SNP database as described in +the previous section as well as of a repeatmask database + +Usage: $ERANGEPATH/runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages + +where ERANGEPATH is the environmental variable set to the path to the directory holding the ERANGE scripts. + +Parameters: +- genome: the name of the organism in the analysis. +- rdsfile: read DataSet file. See README.build-rds for +more information. +- label: the file name of your choice for the analysis. +- rmaskdbfile: repeat mask database, a sqlite database file. See +README.rna-seq for more information on creating the database. +- dbsnpfile: dbsnp database, a sqlite database file, built from the +dbSNP database text file from UCSC. Please see command line option +for building dbsnp sqlite database using buildsnpdb.py . +- uniqStartMin: the ratio of the number of unique reads supporting a +SNP at base s and the maximum number of unique read coverage at base s . +5 is a good number to start with. +- totalRatio: the ratio of the number of reads supporting an +expressed SNP at s and the total read coverage at s . 0.75 should allow +you to get the homozygous SNPs. +- rpkmfile: rpkm file can be generated using the RNA-seq pipeline as +described in README.rna-seq. If you do not have that file, you can +set it to NONE. +- cachepages: cache pages. Make sure to use as much caching as your +system will accomodate. See README.build-rds for more information. + +Example: $ERANGEPATH/runSNPAnalysis.sh mouse 24T4spike.rds 24Tspike rmask.db dbSNP128.db 5 0.75 c2c12rna.24R.final.rpkm 5000000 + +version 3.0 January 2009 - logging +version 3.0rc1 December 2008 - major rewrite and speed-up of getSNPs.py and chksnp.py +version 3.0b2 December 2008 - bug fixes & ERANGEPATH variable + diff --git a/docs/README.rna-seq b/docs/README.rna-seq new file mode 100644 index 0000000..5a866f3 --- /dev/null +++ b/docs/README.rna-seq @@ -0,0 +1,267 @@ +The latest version of this software is available at + +http://woldlab.caltech.edu/rnaseq + +please check the website for updates. + +This is the core of the RNA-seq analysis code described in Mortazavi +et al (2008). Please make sure that you have read Figure 3 and the +methods / supplemental methods of that paper before attempting to +use this package for RNA-Seq data analysis. 
+ +ERANGE should run on any Unix-like system supporting python 2.5 or +better. The code is developed on Linux and MacOS X on python 2.5. + +Historically, the code for ERANGE grew out of the ChIPSeqMini +package from Johnson et al (2007), and some of the key scripts +(findallnocontrol.py and getallgenes.py) are shared between the two. +This is why ERANGE is "dual-use" and is also why the code for both +analyses were kept in common as much as possible. This should be +helpful when someone tries to combine ChIP-seq and RNA-seq +analyses ! + +This code is made available as open-source, as described in the +copyright file ERANGE.COPYRIGHT. + +1. SETTING EXPECTATIONS +2. REQUIREMENTS +3. COMMAND LINE OPTIONS +4. DISPLAYING DATA +5. ANALYSIS +6. PIPELINE +7. CUSTOM CISTEMATIC GENOME ANNOTATIONS +8. PAIRED-END RNA-SEQ ANALYSIS +9. EXPRESSED SNP ANALYSIS + +1. SETTING EXPECTATIONS + +ERANGE is not a point-and-click, turn-key package. + +It is a set of python scripts that, when run in order as a pipeline +on the "right" input, will take read data in RDS format and +calculate gene expression levels in RPKM (Reads Per kb per Million +reads). This pipeline for unpaired reads is embodied in a shell +script called runStandardAnalysis.sh, which only takes a few inputs, +described in the ANALYSIS and PIPELINE section below. + +You should be able to download the data from our website and run the +analysis through the pipeline. You will need to map the reads and +import them into an RDS dataset as described in README.build-rds. + +Because you will likely want to run this package on other genomes +(or builds) than the one described in our original paper, you will +need to do several additional steps, such as: + +- build expanded genomes with splices and spikes +- check overlap of RNAFAR predictions with repeats + +This will require some comfort with running and, if necessary, +editing scripts. While the code is sparsely documented, we are +making it available so that you can *read it*. We'll be happy to +help modifying and updating the code within a reasonable extent +and will try to provide more in depth documentation and tutorials +on our web site. + +While the scripts produce several forms of RPKM, we suggest that +the "final" RPKM are the values that most people will be interested +in. + +*WARNING* A couple of these scripts are pretty memory hungry. If +you are going to analyze datasets with > 20M reads or reads with +high error rates, you will easily need > 8 GB RAM. We'll rewrite +these scripts before releasing 3.0 final to lower the memory +footprint. + +2. REQUIREMENTS + +1) Python 2.5+ is required because some of the scripts and +Cistematic (see below) need pysqlite, which is now bundled in +Python. + +2) You will also need to use Cistematic 3.0 for some of the scripts +marked below that use genes and genomic sequence; in particular, you +will also likely need the Cistematic version of the genomes, unless +providing your own custom genome and annotations. + +Cistematic is available at http://cistematic.caltech.edu + +3) You will need genomic sequences to build the expanded genome, as +well as gene models from UCSC. + +(Optional) Python is very slow on large datasets. Use of the psyco +module (psyco.sf.net) on 32-bit Linux or all Mac Intel machines to +significantly speed up runtime is highly recommended. + +(Optional) Several of the ploting scripts also rely on Matplotlib, +which is available at matplotlib.sf.net. + + +3. 
COMMAND LINE OPTIONS + +You can find out more about the settings for each python script by +typing: + +python $ERANGEPATH/ + +to see the command line options, where ERANGEPATH is the +environmental variable set to the path to the directory +holding the ERANGE scripts. + + +For example, if you wanted to know the command line options of the +script used to generate supplementary datasets 2-4, combineRPKMs.py , +you would type: + +python $ERANGEPATH/combineRPKMs.py + +and get back a version number and all possible command line options: + +version 1.0 +usage: python $ERANGEPATH/combineRPKMs.py firstRPKM expandedRPKM finalRPKM combinedOutfile [-withmultifraction] + +where fields in brackets are optional. + + +4. DISPLAYING DATA + +You can output bed-files of the raw reads in the RDS file +using makebedfromrds.py and WIG file using makewiggle.py as +described in README.build-rds . + + +5. ANALYSIS + +The main steps of a typical, unpaired analysis using ERANGE +is shown in RNA-seq.analysisSteps.txt, where each script +would be run in order, with the caveat that there are two +ways to do the candidate exon analysis (RNAFAR), creatively +called "alternative 1" and "alternative 2". + +In alternative 1, we use reads that did not match an existing gene +model to identify candidate regions: + +# Alternative 1: find new regions outside of gene models with reads piled up +python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1 + +# Alternative 1: filter out new regions that overlap repeats more than a certain fraction +# use "none" if you don't have a repeatmask database +python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1 + +In alternative 2, we pool multiple RNA-seq datasets into a single +RDS database, run it through the two scripts of alternative 1 above, +and then use these precomputed candidates to count reads falling in +these regions: + +# Alternative 2: use a precomputed list of "new" regions (outside of gene models) +python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good + +Alternative 1 is the one used by the pipeline script described below. + +The scripts will generate a set of intermediate files, the most +interesting of which are the final RPKM values. These will be in the +following files for the test example: + +test.firstpass.rpkm (the unique reads only) +test.expanded.rpkm (the unique reads + spliced reads + RNAFAR) +test.final.rpkm (uniques + spliced + RNAFAR + multireads) + + +6. PIPELINE + +IF YOU ARE STORING THE RDS FILE ON A NETWORK-MOUNTED DIRECTORY, +PLEASE ALSO READ SECTION 7. + +Most of the analysis steps described in the section above are +automated in a pipeline shell script called runStandardAnalysis.sh . +Note that the pipeline assumes that it will call its own RNAFAR +regions, which is called "alternative 1" in the ANALYSIS section, +which is a good starting point. You can modify the pipeline script +to use alternative 2, if appropriate. + +The pipeline assumes that one RDS database containing the appropriate +uniq, multi, and spliced reads exists as desribed in README.build-rds. + +We assume that Cistematic 2.3 is installed, including a version of +the appropriate Cistematic genome. You will need to build your own +Cistematic genome for any unsupported genome. + +We will also need a radius (e.g. 
20000 bp) within which a candidate +exon will be consolidated with an existing gene. + +For example, for the test.rds dataset from the ANALYSIS section, we +would run the pipeline as: + +. $ERANGEPATH/runStandardAnalysis.sh mouse test ../mm9repeats/rmask.db 20001 + +where ERANGEPATH is the environmental variable set to the path to +the directory holding the ERANGE scripts. Remember that you can +replace '../mm9repeats/rmask.db' with 'none' if you don't have a +repeatmask database. + +This could run from an hour to a whole day depending on how many +reads are involved (1M vs 80M) and how big a consolidation radius +is used. + + +7. CUSTOM CISTEMATIC GENOME ANNOTATIONS + +Cistematic 3.0 added support for generic genomes and loadable +(or alternative) annotations. While this support is still +experimental, the general idea is to take a GTF/GFF3 file, +convert it into the format that cistematic expects using + +$ERANGEPATH/gfftocis.py infile.gff outfile.cis + +NOTE THAT YOU WILL MOST LIKELY HAVE TO EDIT THIS FILE TO +ACCOMODATE YOUR SPECIFIC GFF FORMAT TO THE CISTEMATIC +FORMAT, WHICH IS + +geneIDuniqRefchromstartstopsensetype + +where type is one of 'CDS','5UTR','3UTR'. + +You can then run the standard analysis script with the additional +flag " -models outfile.cis ", e.g. + +. runStandardAnalysis.sh generic asteph none 1000 -models agambiae.base.cis + +Custom annotation support will be extended to other PIPELINE +scripts as part of 3.2 final. + + +8. PAIRED-END RNA-SEQ ANALYSIS + +We are now experimentally supporting paired-end RNA-seq, as +implemented in the pipeline script runRNAPairedAnalysis.sh and +is only provided as a "work-in-progress" snapshot. + +This is done primarily by marking all of the reads that map in a +known exon or a novel RNAFAR region in the RDS database, which +is a slow and time-consuming step (and is off by default for +single-ended RNA-seq). This mapping step is done without +accounting for paired-end information. + +The paired-end information is then used to connect RNAFAR +regions to known genes or to other RNAFAR regions using +reads with one end in a given region and the other end +in different (known or novel) region, as implemented in +rnafarPairs.py ; note that there is currently a default +limit of 500000 bp maximum distance between the two pairs. + + +9. EXPRESSED SNP ANALYSIS + +ERANGE3 now supports SNP analysis in RNA-seq data as described +in README.rna-esnp . + +RELEASE HISTORY + +version 3.2 December 2009 - support for custom genome annotations with Cistematic 3.0 +version 3.1 April 2009 - modified normalizeFinalExonic.py to remove genome +version 3.0 January 2009 - added logging to shell pipelines +version 3.0rc1 December 2008 - added blat support +version 3.0b2 December 2008 - bug fixes & ERANGEPATH variable +version 3.0b November 2008 - Support for paired end analysis +version 3.0a October 2008 - Preview release of ERANGE3.0 +version 2.0 May 2008 - First public release of ERANGE + diff --git a/docs/README.rnapath b/docs/README.rnapath new file mode 100644 index 0000000..c64579b --- /dev/null +++ b/docs/README.rnapath @@ -0,0 +1,49 @@ +This is a description of the pipeline designed to do scaffolding +of fragmented genomes using RNA-seq. The code should run +on any Unix-like system supporting python 2.6 or better. The code +is developed on MacOS X on python 2.6. + +Note that RNAPATH is not currently optimized for running on machines with +small or medium amounts of RAM. 32 Gb minimum is recommended for the current +version. + +1. 
COMMAND LINE OPTIONS
2. MAPPING THE READS AND BUILDING THE RDS FILES
3. GETTING THE SCAFFOLDING READS
4. RUNNING RNAPATH.py


1. COMMAND LINE OPTIONS

To find out more about the settings for each script, type:

python $ERANGEPATH/<scriptname>

to see the command line options. Note that all ERANGE command-line
options are case-sensitive and that the scripts typically ignore
command-line arguments that they do not recognize!


2. MAPPING THE READS AND BUILDING THE RDS FILES

Before running the RNAPATH script on a genome (assumed to be in fasta format),
you will first need to map the RNA-seq reads using BLAT and import those reads
into an RDS file, as described in README.build-rds .

3. GETTING THE SCAFFOLDING READS

Once you have an indexed RDS file, use the script distalPairs.py to output
the list of paired reads that do not map to the same contig. This involves
specifying a distance to distalPairs.py that is greater than the length of the
largest existing genomic contig. For example:

python ../commoncode/distalPairs.py 20000 rna_on_genomic.rds rna_on_genomic.crosspairs -splices -cache 20000000

4. RUNNING RNAPATH.py

You can now run RNAPATH.py. We suggest optionally using the included script
processvelvet.py to rename the contigs before running blat and generating
the crosspair data.

Example: $ERANGEPATH/rnapath/RNAPATH.py genomic_contigs.fa rna_on_genomic.crosspairs RNAPATH.log genome.RNAPATH.fa

version 3.2 May 2010 - first release

diff --git a/docs/RNA-seq.analysisSteps.txt b/docs/RNA-seq.analysisSteps.txt
new file mode 100644
index 0000000..e9a5213
--- /dev/null
+++ b/docs/RNA-seq.analysisSteps.txt
@@ -0,0 +1,87 @@
# analysis steps for an ERANGE analysis of RNA-seq data
# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh

# preliminary: set PYTHONPATH to point to the parent directory of Cistematic, e.g.
# export PYTHONPATH=/my/path/to/cistematic
#
# preliminary: set CISTEMATIC_ROOT to the directory that contains the genome directories (such as H_sapiens or M_musculus), e.g.
# export CISTEMATIC_ROOT=/my/path/to/cistematic_genomes
#
# preliminary: set ERANGEPATH, e.g.
# export ERANGEPATH=/proj/genome/experiments/commoncode
#
# preliminary: set CISTEMATIC_TEMP to a local directory with ample space (default is /tmp), e.g.
# export CISTEMATIC_TEMP=/any/local/dir
#
# preliminary: create splice file using getsplicefa.py with maxBorder set to 4 bp shorter than the read length, e.g.
# python $ERANGEPATH/getsplicefa.py hsapiens /my/path/to/human/knownGene.txt hg18splice32.fa 28
#
# preliminary: build expanded genome using Eland's squashGenome or Bowtie's bowtie-build (see README.build-rds)
# a slower alternative is to use blat just on the genome.
#
# preliminary: build repeatmask database using buildrmaskdb.py, e.g.
+# python $ERANGEPATH/buildrmaskdb.py /path/to/hg19repeats /path/to/hg18repeats/rmask.db +# if you don't have an repeatmask database, just use "none" for the rmask database below + +# run bowtie on expanded genome or just blat on the regular genome +# as described in README.build-rds +# + +# create rds file with one lane's worth of data (add -index if using only one lane) +# The example below sets the default cache to 1000000 +# The name::value pairs are optional documentart metadata, and can be set to any desired name or value +python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX + +# can change a database cache size using rdsmetadata.py to speed up indexing and index-based lookups +# rule of thumb for RNA-seq: set the cache size to half of the RAM on the computer +#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds -defaultcache 2000000 -nocount + +# append more data (only add -index when adding last lane) +python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 -append -index + +# count the unique reads falling on the gene models ; the nomatch files are +# mappable reads that fell outside of the Cistematic gene models and not the +# unmappable of Eland (i.e, the "NM" reads) +python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count -markGID -cache 1 + +# count splice reads +python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count -splices -noUniqs -cache 1 + +# calculate a first-pass RPKM to re-weigh the unique reads, +# using 'none' for the splice count +python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm -cache + +# recount the unique reads with weights calculated during the first pass +python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount -uniq -cache 1 + +# There is a choice of either identifying new regions from the data alone +# (Alternative 1), or using a pre-computed list of new regions (presumably +# pooled from multiple nomatch.bed files, or literature) against the nomatch.bed +# file (Alternative 2) + +# Alternative 1: find new regions outside of gene models with reads piled up +python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1 + +# Alternative 1: filter out new regions that overlap repeats more than a certain fraction +# use "none" if you don't have a repeatmask database +python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1 + +# Alternative 2: use a precomputed list of "new" regions (outside of gene models) +#python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good + +# map all candidate regions that are within a 20kb radius of a gene in bp +# take out -cache if running locally +python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 -radius 20001 -trackfar -cache + +# calculate expanded exonic read density +python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm -cache + +# create bed file of accepted 
candidate regions +python2.5 $ERANGEPATH/regiontobed.py RNAFAR LHCN10213.accepted.rpkm RNAFAR.bed 255,0,0 + +# weigh multi-reads +python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count -accept LHCN10213.accepted.rpkm -multi -cache 1 + +# calculate final exonic read density +python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm -multifraction -withGID -cache + diff --git a/docs/buildMatrix.sh b/docs/buildMatrix.sh new file mode 100644 index 0000000..9575071 --- /dev/null +++ b/docs/buildMatrix.sh @@ -0,0 +1,44 @@ +#!/bin/bash +echo 'buildMatrix.sh: version 1.1' + +indexPrev=0 +indexCur=0 + +truncateRPKM="" +if [ $# -eq 3 ]; then + truncateRPKM="-truncate "$3 +fi + +if [ $# -eq 4 ]; then + truncateRPKM="-rescale -truncate "$3 +fi + +if [ $# -lt 2 ]; then + echo 'usage: buildMatrix.sh name datalist.file [truncateRPKM] [-rescale]' + echo + echo 'where the datalist file is a comma-delimited list of prefix and rds-files' + echo +else + python $ERANGEPATH/recordLog.py buildMatrix.log buildMatrix.sh "with parameters: $1 $2 $truncateRPKM" + while read line + do + prefix=`echo $line | cut -f 1 -d ','` + filename=$prefix.partcount + if [ -e $filename ]; then + if [ $indexCur -lt 1 ]; then + echo "building $1.step0" + echo -e '\t' > $1.step0 + cut -f 1 $filename >> $1.step0 + indexCur=1 + fi + python $ERANGEPATH/buildMatrix.py $1.step$indexPrev $filename $1.step$indexCur $truncateRPKM + rm $1.step$indexPrev + let indexPrev=indexPrev+1 + let indexCur=indexCur+1 + else + echo "could not find $filename - skipping" + python $ERANGEPATH/recordLog.py buildMatrix.log buildMatrix.sh "could not find $rds - skipping" + fi + done < $2 + mv $1.step$indexPrev $1.matrix.tab +fi diff --git a/docs/partition.sh b/docs/partition.sh new file mode 100644 index 0000000..1955e99 --- /dev/null +++ b/docs/partition.sh @@ -0,0 +1,34 @@ +# an example shell script to combine multiple region calls into one partition +# + +if [ -z "$1" ]; then + PARTNAME=comb +else + PARTNAME=$1 +fi + +if [ -z "$2" ]; then + MINSIZE=400 +else + MINSIZE=$2 +fi + +N=0 +if [ $# -lt 2 ]; then + echo 'usage: partition.sh name minSize datalist.file' + echo + echo 'where the datalist file is a list of region files' + echo +else + while read line + do + if [ $N -lt 1 ]; then + FILELIST='' + else + FILELIST=$FILELIST, + fi + FILELIST=$FILELIST$line + let N=N+1 + done < $3 + python $ERANGEPATH/partition.py $N.way $FILELIST $PARTNAME$N.part -minFeature $MINSIZE -nomerge -locid -norandom +fi diff --git a/docs/regionCounts.sh b/docs/regionCounts.sh new file mode 100644 index 0000000..13c60ad --- /dev/null +++ b/docs/regionCounts.sh @@ -0,0 +1,28 @@ +#!/bin/bash +echo 'regionCounts.sh: version 1.0' + +cachepages="" +if [ $# -eq 3 ]; then + cachepages="-cache "$3 +fi + +if [ $# -lt 2 ]; then + echo 'usage: regionCounts.sh partitionfile datalist.file [cachevalue]' + echo + echo 'where the datalist file is a comma-delimited list of prefix and rds-files' + echo +else + arguments=$1' '$2' '$cachepages + python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "with parameters: $arguments" + while read line + do + prefix=`echo $line | cut -f 1 -d ','` + rds=`echo $line | cut -f 2 -d ','` + if [ -e $rds ]; then + python $ERANGEPATH/regionCounts.py $1 $rds $prefix.partcount -force -nomerge -rpkm $cachepages + else + echo "could not find $rds - skipping" + python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "could not 
find $rds - skipping" + fi + done < $2 +fi diff --git a/docs/runRNAPairedAnalysis.sh b/docs/runRNAPairedAnalysis.sh new file mode 100755 index 0000000..baf7f04 --- /dev/null +++ b/docs/runRNAPairedAnalysis.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# +# runRNAPairedAnalysis.sh +# ENRAGE +# +# example: . ../commoncode/runRNAPairedAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db +# +# assuming that we have rds database with the prefix c2c12rna.24R and that an RNAFAR analysis has already been run. + +# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment + +if [ -z "$ERANGEPATH" ] +then + ERANGEPATH='../commoncode' +fi + +echo 'runRNAPairedAnalysis.sh: version 3.7' + +models="" +if [ $# -eq 5 ]; then + models=" -models "$5 +fi + +replacemodels="" +if [ $# -eq 6 ]; then + replacemodels=" -models $5 -replacemodels " +fi + +if [ -z "$1" ] +then + echo + echo 'usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [-replacemodels]' + echo + echo 'where rdsprefix is the name of the rds file without the .rds extension' + echo 'use "none" for the repeatmaskdb if you do not have one' + echo +else + +# log the parameters +arguments=$1' '$2' '$3' '$models' '$5 +echo 'running with settings: ' $arguments +python $ERANGEPATH/recordLog.py rna.log runRNAPairedAnalysis.sh "with parameters: $arguments" + +# count the unique reads falling on the gene models ; the nomatch files are +# mappable reads that fell outside of the Cistematic gene models and not the +# unmappable of Eland (i.e, the "NM" reads) +echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels" +python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels + +# calculate a first-pass RPKM to re-weigh the unique reads, +# using 'none' for the splice count +python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels + +# recount the unique reads with weights calculated during the first pass +python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels + +# count splice reads +python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -markGID -cache 1 $models $replacemodels + +# find new regions outside of gene models with reads piled up +python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1 + +# filter out new regions that overlap repeats more than a certain fraction +python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.checked -startField 1 -log rna.log -cache 1 + +# calculate the read densities +python $ERANGEPATH/regionCounts.py $2.newregions.checked $2.rds $2.newregions.good -markRDS -cache -log rna.log + +# map all candidate regions that have paired ends overlapping with known genes +python $ERANGEPATH/rnafarPairs.py $1 $2.newregions.good $2.rds $2.candidates.txt -cache $models $replacemodels + +# calculate expanded exonic read density +python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels + +# weigh multi-reads +python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels + +# calculate final exonic read density +python 
$ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache + +fi diff --git a/docs/runSNPAnalysis.sh b/docs/runSNPAnalysis.sh new file mode 100755 index 0000000..0e4ff92 --- /dev/null +++ b/docs/runSNPAnalysis.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# +# runSNPAnalysis.sh +# +# Usages: $ERANGEPATH/runSNPAnalysis.sh mouse rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages +# Example: /getSNPs.sh mouse /woldlab/trog/sdc/alim/24T4spike_10212/24T4spike.rds 24Tspike /woldlab/trog/data1/wlee/db/rmask.db /woldlab/trog/data1/wlee/db/dbSNP128.db 5 0.75 ~/proj/c2c12rna24R/c2c12rna.24R.final.rpkm 5000000 + +# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment + +if [ -z "$ERANGEPATH" ] +then + ERANGEPATH='../commoncode' +fi + +echo 'runSNPAnalysis.sh: version 3.1' + +cachepages="" +if [ $# -eq 9 ]; then + cachepages="-cache "$9 +fi + +nosplices="" +if [ $# -eq 10 ]; then + nosplices=" -nosplices " +fi + +if [ $# -lt 8 ]; then + echo 'runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]' + echo 'where for each position S:' + echo ' uniqStartMin = # independent reads supporting base change at S' + echo ' totalRatio = total # reads supporting base change at S / total # reads that pass through S' +else +# log the parameters +arguments=$1' '$2' '$3' '$4' '$5' '$6' '$7' '$8' '$cachepages$nosplices +echo 'running with settings: ' $arguments +python $ERANGEPATH/recordLog.py snp.log runSNPAnalysis.sh "with parameters: $arguments" + +# get all SNPs by extracting it from the RDS +python $ERANGEPATH/getSNPs.py $2 $6 $7 $3.snps.txt -enforceChr $cachepages $nosplices + +# get SNPs in non-repeat regions only +python $ERANGEPATH/chkSNPrmask.py $4 $3.snps.txt $3.nr_snps.txt $cachepages + +# Check to see if SNPs are found in dbSNP +# if dbSNP128.db is not built yet, build it by running buildsnpdb.py - build snp database using the dbSNP database file downloaded from UCSC +# usage: python2.5 buildsnpdb.py snpdbdir snpdbname +# the database flat file must be in the snpdbdir directory +# To build dbSNP database file, run the following command +# python2.5 buildsnpdb.py snp128.txt dbSNP128 + +# get dbSNP info for SNPs that are found in the dbSNP database +python $ERANGEPATH/chksnp.py $5 $3.nr_snps.txt $3.nr_dbsnp.txt $cachepages + +# get gene info for the snps found in dbSNP +python $ERANGEPATH/getSNPGeneInfo.py $1 $3.nr_dbsnp.txt $8 $3.nr_dbsnp_geneinfo.txt $cachepages + +# get gene info for snps that are not found in dbSNP +python $ERANGEPATH/getNovelSNPs.py $1 $3.nr_dbsnp_geneinfo.txt $3.nr.final.txt + +# make bed file for displaying the snps on UCSC genome browser +python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed +fi \ No newline at end of file diff --git a/docs/runStandardAnalysis.sh b/docs/runStandardAnalysis.sh new file mode 100755 index 0000000..6d83297 --- /dev/null +++ b/docs/runStandardAnalysis.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# +# runStandardAnalysis.sh +# ENRAGE +# +# example: . $ERANGEPATH/runStandardAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db 20000 +# +# assuming that we have rds database with the prefix c2c12rna.24R and that an RNAFAR analysis has already been run. 
+ +# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment + +if [ -z "$ERANGEPATH" ] +then + ERANGEPATH='../commoncode' +fi + +echo 'runStandardAnalysis.sh: version 4.2' + +models="" +if [ $# -eq 5 ]; then + models=" -models "$5 +fi + +replacemodels="" +if [ $# -eq 6 ]; then + replacemodels=" -models $5 -replacemodels " +fi + +if [ -z "$1" ] +then + echo + echo 'usage:runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [-replacemodels]' + echo + echo 'where rdsprefix is the name of the rds file without the .rds extension' + echo 'use "none" for the repeatmaskdb if you do not have one' + echo +else + +# log the parameters +arguments=$1' '$2' '$3' '$4' '$models' '$6 +echo 'running with settings: ' $arguments +python $ERANGEPATH/recordLog.py rna.log runStandardAnalysis.sh "with parameters: $arguments" + +# count the unique reads falling on the gene models ; the nomatch files are +# mappable reads that fell outside of the Cistematic gene models and not the +# unmappable of Eland (i.e, the "NM" reads) +echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels" +python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels + +# calculate a first-pass RPKM to re-weigh the unique reads, +# using 'none' for the splice count +echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels" +python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels + +# recount the unique reads with weights calculated during the first pass +echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels" +python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels + +# count splice reads +echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1 $models $replacemodels" +python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1 $models $replacemodels + +# Alternative 1: find new regions outside of gene models with reads piled up +echo "python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1" +python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1 + +# Alternative 1: filter out new regions that overlap repeats more than a certain fraction +echo "python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -cache 1" +python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -log rna.log -startField 1 -cache 1 + +# map all candidate regions that are within a given radius of a gene in bp +echo "python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache $models $replacemodels" +python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache $models $replacemodels + +# make sure candidates.txt file exists +echo "touch $2.candidates.txt" +touch $2.candidates.txt + +# calculate expanded exonic read density +echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm 
-cache $models $replacemodels" +python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels + +# weigh multi-reads +echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels" +python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels + +# calculate final exonic read density +echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache" +python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache + +fi \ No newline at end of file diff --git a/docs/runStrandedAnalysis.sh b/docs/runStrandedAnalysis.sh new file mode 100755 index 0000000..2626ed0 --- /dev/null +++ b/docs/runStrandedAnalysis.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# +# runStrandedAnalysis.sh +# ENRAGE +# +# example: . ../commoncode/runStrandedAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db 20000 +# +# assuming that we have rds database with the prefix c2c12rna.24R. + +# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment + +if [ -z "$ERANGEPATH" ] +then + ERANGEPATH='../commoncode' +fi + +echo 'runStrandedAnalysis.sh: version 4.1' + +if [ -z "$1" ] +then + echo + echo 'usage:runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius' + echo + echo 'where rdsprefix is the name of the rds file without the .rds extension' + echo 'use "none" for the repeatmaskdb if you do not have one' + echo +else + +# log the parameters +arguments=$1' '$2' '$3' '$4 +echo 'running with settings: ' $arguments +python $ERANGEPATH/recordLog.py rna.log runStrandedAnalysis.sh "with parameters: $arguments" + +# count the unique reads falling on the gene models ; the nomatch files are +# mappable reads that fell outside of the Cistematic gene models and not the +# unmappable of Eland (i.e, the "NM" reads) +python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -stranded -markGID -cache 1 + +# calculate a first-pass RPKM to re-weigh the unique reads, +# using 'none' for the splice count +python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache + +# recount the unique reads with weights calculated during the first pass +python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -stranded -uniq -cache 1 + +# count splice reads +python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -stranded -splices -noUniqs -cache 1 + +# find new regions outside of gene models with reads piled up +python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter plus -log rna.log -cache 1 +python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter minus -log rna.log -cache 1 -append + +# filter out new regions that overlap repeats more than a certain fraction +python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -log rna.log -cache 1 + +# Alternative 2: use a precomputed list of "new" regions (outside of gene models) +#python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed +#python $ERANGEPATH/regionCounts.py $3 $2.rds 
$2.newregions.good + +# map all candidate regions that are within a given radius of a gene in bp +python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -stranded -cache + +# calculate expanded exonic read density +python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache + +# weigh multi-reads +python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -stranded -multi -cache 1 + +# calculate final exonic read density +python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache + +fi diff --git a/farPairs.py b/farPairs.py new file mode 100644 index 0000000..73dd3ca --- /dev/null +++ b/farPairs.py @@ -0,0 +1,162 @@ +# +# farPairs.py +# ENRAGE +# +# Created by Ali Mortazavi on 7/13/10. +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, time +import optparse +from commoncode import readDataset + +print "%prog: version 1.3" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.add_option("--minDist", type="int", dest="minDist") + parser.add_option("--maxDist", type="int", dest="maxDist") + parser.add_option("--minCount", type="int", dest="minCount") + parser.add_option("--label", dest="label") + parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None, + minDist=1000, maxDist=500000, minCount=2, label=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + print "\tIs both slow and takes up large amount of RAM" + sys.exit(1) + + rdsfile = args[0] + outfilename = args[1] + outbedname = args[2] + + farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose, + options.cachePages, options.minDist, options.maxDist, options.minCount, + options.label) + + +def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False, + cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None): + + doCache = False + if cachePages is not None: + doCache = True + else: + cachePages = 0 + + if label is None: + label = rdsfile + + RDS = readDataset(rdsfile, verbose=True, cache=doCache) + rdsChromList = RDS.getChromosomes() + + if doVerbose: + print time.ctime() + + total = 0 + outfile = open(outfilename, "w") + outbed = open(outbedname, "w") + outbed.write('track name="%s distal pairs" color=0,255,0\n' % label) + + readlen = RDS.getReadSize() + flagDict = {} + for chromosome in rdsChromList: + if doNotProcessChromosome(chromosome): + continue + + print chromosome + uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True) + if doVerbose: + print len(uniqDict), time.ctime() + + for readID in uniqDict: + readList = uniqDict[readID] + if len(readList) == 2: + total += 1 + (start1, flag1, pair1) = readList[0] + (start2, flag2, pair2) = readList[1] + + if flag1 != flag2: + dist = abs(start1 - start2) + startList = [start1, start2] + stopList = 
[start1 + readlen, start2 + readlen] + startList.sort() + stopList.sort() + if flag1 != "" and flag2 != "" and minDist < dist < maxDist: + outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0") + outbed.write(outputLine) + if doVerbose: + print flag1, flag2, dist + + try: + flagDict[flag1].append((flag2, start1, start2)) + except KeyError: + flagDict[flag1] = [(flag2, start1, start2)] + + try: + flagDict[flag2].append((flag1, start1, start2)) + except KeyError: + flagDict[flag2] = [(flag2, start1, start2)] + + print "%d connected regions" % len(flagDict) + + for region in flagDict: + flagDict[region].sort() + regionConnections = {} + for (region2, start1, start2) in flagDict[region]: + try: + regionConnections[region2] += 1 + except KeyError: + regionConnections[region2] = 1 + + for region2 in regionConnections: + if regionConnections[region2] >= minCount: + outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2])) + if doVerbose: + print "%s\t%s\t%d" % (region, region2, regionConnections[region2]) + + outfile.close() + outbed.close() + if doVerbose: + print "finished: ", time.ctime() + + +def doNotProcessChromosome(chrom): + return chrom == "chrM" + + +def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense): + readSizes = "%d" % (stopList[0] - startList[0]) + readCoords = "0" + leftStart = startList[0] - 1 + rightStop = stopList[-1] + for index in range(1, numPieces): + readSizes += ",%d" % (stopList[index] - startList[index] + 1) + readCoords += ",%d" % (startList[index] - startList[0]) + + if rsense == "+": + senseCode = plusSense + else: + senseCode = minusSense + + outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords) + return outline + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/featureIntersects.py b/featureIntersects.py new file mode 100755 index 0000000..e0b7726 --- /dev/null +++ b/featureIntersects.py @@ -0,0 +1,62 @@ +# +# featureIntersects.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from cistematic.core import featuresIntersecting + +print "%prog: version 1.0" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s tabfile [--cistype type] [--radius radius]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--cistype", action="store_false", dest="cistype") + parser.add_option("--radius", type="int", dest="radius") + parser.set_defaults(cistype="TFBSCONSSITES", radius=100) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 1: + print usage + sys.exit(1) + + tabfile = args[0] + + featureIntersects(tabfile, options.cistype, options.radius) + + +def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100): + tabfile = open(tabFileName) + previous = "" + + posList = [] + for line in tabfile: + fields = line.split("\t") + current = fields[0] + if previous == current: + continue + + previous = current + chrom = fields[1][3:] + posList.append((chrom, (int(fields[2]) + int(fields[3]))/2)) + + feats = featuresIntersecting("human", posList, radius, cistype) + featkeys = feats.keys() + featkeys.sort() + for (chrom, pos) in featkeys: + print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)])) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/findMotifs.py 
b/findMotifs.py new file mode 100755 index 0000000..e79401b --- /dev/null +++ b/findMotifs.py @@ -0,0 +1,112 @@ +# +# findMotifs.py +# ENRAGE +# +try: + import psyco + psyco.full() +except: + pass + +import sys, os, optparse +from cistematic.experiments.fasta import Fasta +from cistematic.programs.meme import Meme +from cistematic.programs.cisGreedy import CisGreedy +#TODO: cisSampler is not supported yet! +#from cistematic.programs.cisSampler import CisSampler + +print "%prog: version 3.4" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog explabel regions.fsa [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--meme", action="store_true", dest="doMeme") + parser.add_option("--cisGreedy", action="store_true", dest="doCisGreedy") + parser.add_option("--logo", action="store_true", dest="saveLogo") + parser.add_option("--threshold", type="float", dest="threshold") + parser.add_option("--prefix", dest="motifPrefix") + parser.add_option("--numMotifs", dest="numMotifs") + parser.add_option("--maxWidth", type="int", dest="maxWidth") + parser.add_option("--maskLower", action="store_true", dest="maskLower") + parser.set_defaults(doMeme=False, doCisGreedy=False, saveLogo=False, + threshold=75., numMotifs="10", maxWidth=28, maskLower=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + print "\n\twhere at least one of the motif finders (meme or cisGreedy) must be specified\n" + sys.exit(1) + + expbase = args[0] + fsafile = args[1] + + doCisSampler = False + if "--cisSampler" in sys.argv: + print "cisSampler is not supported yet! avoid using it for now" + doCisSampler = True + + findMotifs(expbase, fsafile, options.doMeme, options.doCisGreedy, options.saveLogo, + options.threshold, options.numMotifs, options.maxWidth, options.maskLower, + doCisSampler) + + +def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75., + numMotifs="10", maxWidth=28, maskLower=False, doCisSampler=False): + + motifPrefix = expbase + + #TODO: cisSampler is not supported yet! + #if doMeme or doCisGreedy or doCisSampler: + if not (doMeme or doCisGreedy): + print "error: must specify at least one motif finder - exiting" + sys.exit(1) + + exp = Fasta(expbase, "%s.db" % expbase) + + exp.initialize() + if maskLower: + exp.setMaskLowerCase(True) + + if doMeme: + prog4 = Meme() + prog4.setMaxWidth(maxWidth) + prog4.setNumMotifs(numMotifs) + prog4.setModel("zoops") + exp.appendProgram(prog4) + + if doCisGreedy: + prog5 = CisGreedy() + prog5.setGenExpOptions([]) + prog5.setMaxWidth(maxWidth) + prog5.setNumMotifs(numMotifs) + exp.appendProgram(prog5) + + #TODO: cisSampler is not supported yet! + #if doCisSampler: + # prog6 = CisSampler() + # prog6.setGenExpOptions([]) + # prog6.setMaxWidth(maxWidth) + # prog6.setNumMotifs(numMotifs) + # exp.appendProgram(prog6) + + exp.run(fsafile) + exp.createAnalysis() + exp.loadAnalysis() + exp.mapMotifs(threshold, verbose=False) + exp.exportMotifs(prefix = motifPrefix) + if saveLogo: + exp.exportLogos(prefix = motifPrefix) + + exp.draw("%s.png" % expbase, maxOccurences=4000) + print "deleting database..." 
+    del exp
+    os.remove("%s.db" % expbase)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/findall.py b/findall.py
new file mode 100755
index 0000000..10f007b
--- /dev/null
+++ b/findall.py
@@ -0,0 +1,699 @@
+"""
+    usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
+           [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+           [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
+           [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
+           [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
+           [--revbackground] [--pvalue self|back|none] [--nodirectionality]
+           [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
+           [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
+
+           where values in brackets are optional and label is an arbitrary string.
+
+           Use --ratio (default 4-fold) to set the minimum fold enrichment over the
+           control, --minimum (default 4) to set the minimum number of reads (RPM)
+           within a region, and --spacing (default 50; set to the read length with
+           --RNA) to set the maximum distance between reads in a region. --listPeak
+           reports the peak of each region; peaks must be higher than --minPeak
+           (default 0.5 RPM). P-values are calculated from the sample (change with
+           --pvalue), unless the --revbackground flag and a control RDS file are
+           provided.
+
+           By default, all numbers and parameters are on a reads per million (RPM)
+           basis. --raw will treat all settings, ratios, and reported numbers as raw
+           counts rather than RPM. Use --notrim to turn off region trimming and
+           --trimvalue to control trimming (default 10% of peak signal).
+
+           The peak finder uses minimal directionality information, which can be
+           turned off with --nodirectionality; the fraction of plus-strand reads
+           required to be to the left of the peak (default 0.3) can be set with
+           --leftPlus; --minPlus and --maxPlus change the minimum and maximum
+           fraction of plus reads in a region (defaults 0.25 and 0.75, respectively).
+
+           Use --shift to shift reads by a fixed number of bp, typically half the
+           expected fragment length (default 0 bp), or '--shift learn' to learn the
+           shift from the first chromosome. Alternatively, use --autoshift to
+           calculate a per-region shift value, which can be reported with
+           --reportshift. --strandfilter should only be used when explicitly calling
+           unshifted stranded peaks from non-ChIP-seq data such as directional
+           RNA-seq. regionoutfile is overwritten by default unless the --append flag
+           is given.
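+
+           A typical invocation (with illustrative file names) might be:
+           python $ERANGEPATH/findall.py CTCF ctcf.rds ctcf.regions.txt --control input.rds --shift learn --listPeak --reportshift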
+""" + +try: + import psyco + psyco.full() +except: + pass + +import sys +import math +import string +import optparse +from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion + + +versionString = "%s: version 3.2" % sys.argv[0] +print versionString + +def usage(): + print __doc__ + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = __doc__ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--control", dest="mockfile") + parser.add_option("--minimum", type="float", dest="minHits") + parser.add_option("--ratio", type="float", dest="minRatio") + parser.add_option("--spacing", type="int", dest="maxSpacing") + parser.add_option("--listPeak", action="store_true", dest="listPeak") + parser.add_option("--shift", dest="shift") + parser.add_option("--learnFold", type="float", dest="stringency") + parser.add_option("--noshift", action="store_true", dest="noShift") + parser.add_option("--autoshift", action="store_true", dest="autoshift") + parser.add_option("--reportshift", action="store_true", dest="reportshift") + parser.add_option("--nomulti", action="store_true", dest="noMulti") + parser.add_option("--minPlus", type="float", dest="minPlusRatio") + parser.add_option("--maxPlus", type="float", dest="maxPlusRatio") + parser.add_option("--leftPlus", type="float", dest="leftPlusRatio") + parser.add_option("--minPeak", type="float", dest="minPeak") + parser.add_option("--raw", action="store_false", dest="normalize") + parser.add_option("--revbackground", action="store_true", dest="doRevBackground") + parser.add_option("--pvalue", dest="ptype") + parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality") + parser.add_option("--strandfilter", dest="strandfilter") + parser.add_option("--trimvalue", type="float", dest="trimValue") + parser.add_option("--notrim", action="store_false", dest="doTrim") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--log", dest="logfilename") + parser.add_option("--flag", dest="withFlag") + parser.add_option("--append", action="store_true", dest="doAppend") + parser.add_option("--RNA", action="store_true", dest="rnaSettings") + parser.add_option("--combine5p", action="store_true", dest="combine5p") + parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None, + stringency=4.0, noshift=False, autoshift=False, reportshift=False, + minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5, + normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, + trimValue=None, doTrim=True, doAppend=False, rnaSettings=False, + cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False, + strandfilter=None, combine5p=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + usage() + sys.exit(2) + + factor = args[0] + hitfile = args[1] + outfilename = args[2] + + findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift, + options.stringency, options.noshift, options.autoshift, options.reportshift, + options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak, + options.normalize, options.logfilename, options.withFlag, options.doDirectionality, + options.trimValue, options.doTrim, options.doAppend, options.rnaSettings, + options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti, + options.strandfilter, options.combine5p) + + +def findall(factor, 
hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None, + stringency=4.0, noshift=False, autoshift=False, reportshift=False, + minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5, + normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, + trimValue=None, doTrim=True, doAppend=False, rnaSettings=False, + cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False, + strandfilter=None, combine5p=False): + + shiftValue = 0 + if autoshift: + shiftValue = "auto" + + if shift is not None: + try: + shiftValue = int(shift) + except ValueError: + if shift == "learn": + shiftValue = "learn" + print "Will try to learn shift" + + if noshift: + shiftValue = 0 + + if trimValue is not None: + trimValue = float(trimValue) / 100. + trimString = "%2.1f%s" % ((100. * trimValue), "%") + else: + trimValue = 0.1 + trimString = "10%" + + if not doTrim: + trimString = "none" + + if doRevBackground: + print "Swapping IP and background to calculate FDR" + pValueType = "back" + + doControl = False + if mockfile is not None: + doControl = True + + doPvalue = True + if ptype is not None: + ptype = ptype.upper() + if ptype == "NONE": + doPvalue = False + pValueType = "none" + p = 1 + poissonmean = 0 + elif ptype == "SELF": + pValueType = "self" + elif ptype == "BACK": + if doControl and doRevBackground: + pValueType = "back" + else: + print "must have a control dataset and -revbackground for pValue type 'back'" + else: + print "could not use pValue type : %s" % ptype + else: + pValueType = "self" + + if cachePages is not None: + doCache = True + else: + doCache = False + cachePages = -1 + + if withFlag != "": + print "restrict to flag = %s" % withFlag + + useMulti = True + if noMulti: + print "using unique reads only" + useMulti = False + + if rnaSettings: + print "using settings appropriate for RNA: -nodirectionality -notrim -noshift" + shiftValue = 0 + doTrim = False + doDirectionality = False + + stranded = "" + if strandfilter is not None: + if strandfilter == "plus": + stranded = "+" + minPlusRatio = 0.9 + maxPlusRatio = 1.0 + print "only analyzing reads on the plus strand" + elif strandfilter == "minus": + stranded = "-" + minPlusRatio = 0.0 + maxPlusRatio = 0.1 + print "only analyzing reads on the minus strand" + + stringency = max(stringency, 1.0) + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + if doControl: + print "\ncontrol:" + mockRDS = readDataset(mockfile, verbose=True, cache=doCache) + + if cachePages > mockRDS.getDefaultCacheSize(): + mockRDS.setDBcache(cachePages) + + print "\nsample:" + hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + readlen = hitRDS.getReadSize() + if rnaSettings: + maxSpacing = readlen + + print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache) + print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded) + print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType) + + if cachePages > hitRDS.getDefaultCacheSize(): + hitRDS.setDBcache(cachePages) + + hitRDSsize = len(hitRDS) / 1000000. + if doControl: + mockRDSsize = len(mockRDS) / 1000000. 
+ + if normalize: + if doControl: + mockSampleSize = mockRDSsize + + hitSampleSize = hitRDSsize + + if doAppend: + outfile = open(outfilename, "a") + else: + outfile = open(outfilename, "w") + + outfile.write("#ERANGE %s\n" % versionString) + if doControl: + outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize)) + else: + outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize)) + + if withFlag != "": + outfile.write("#restrict to Flag = %s\n" % withFlag) + + outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache)) + outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)) + outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)) + if normalize: + print "Normalizing to RPM" + countLabel = "RPM" + else: + countLabel = "COUNT" + + headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"] + if doDirectionality: + headerList.append("plus%\tleftPlus%") + + if listPeak: + headerList.append("peakPos\tpeakHeight") + + if reportshift: + headerList.append("readShift") + + if doPvalue: + headerList.append("pValue") + + headline = string.join(headerList, "\t") + print >> outfile, headline + + statistics = {"index": 0, + "total": 0, + "mIndex": 0, + "mTotal": 0, + "failed": 0 + } + + if minRatio < minPeak: + minPeak = minRatio + + hitChromList = hitRDS.getChromosomes() + if doControl: + mockChromList = mockRDS.getChromosomes() + + hitChromList.sort() + + for chromosome in hitChromList: + if doNotProcessChromosome(chromosome, doControl, mockChromList): + continue + + print "chromosome %s" % (chromosome) + hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p) + maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti) + if shiftValue == "learn": + shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord, + stringency, readlen, minHits, logfilename, outfile, outfilename) + + regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti, + normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, + shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, + noMulti, doControl, factor, trimValue, outputRegionList=True) + + statistics["index"] += regionStats["index"] + statistics["total"] += regionStats["total"] + statistics["failed"] += regionStats["failed"] + if not doRevBackground: + if doPvalue: + p, poissonmean = calculatePValue(allRegionWeights) + + print headline + shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) + continue + + #now do background swapping the two samples around + print "calculating background..." + backgroundTrimValue = 1/20. 
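+        # note: 1/20. evaluates to 0.05, i.e. background regions are trimmed at 5% of peak;
+        # regions found with sample and control swapped feed the FDR estimate in the footer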
+ backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti, + normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, + shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, + noMulti, doControl, factor, backgroundTrimValue) + + statistics["mIndex"] += backgroundRegionStats["index"] + statistics["mTotal"] += backgroundRegionStats["total"] + statistics["failed"] += backgroundRegionStats["failed"] + print statistics["mIndex"], statistics["mTotal"] + if doPvalue: + if pValueType == "self": + p, poissonmean = calculatePValue(allRegionWeights) + else: + p, poissonmean = calculatePValue(backgroundRegionWeights) + + print headline + shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) + + footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue) + print footer + outfile.write(footer) + outfile.close() + + writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | "))) + + +def doNotProcessChromosome(chromosome, doControl, mockChromList): + skipChromosome = False + if chromosome == "chrM": + skipChromosome = True + + if doControl and (chromosome not in mockChromList): + skipChromosome = True + + return skipChromosome + + +def calculatePValue(dataList): + dataList.sort() + listSize = float(len(dataList)) + try: + poissonmean = sum(dataList) / listSize + except ZeroDivisionError: + poissonmean = 0 + + print "Poisson n=%d, p=%f" % (listSize, poissonmean) + p = math.exp(-poissonmean) + + return p, poissonmean + + +def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord, + stringency, readlen, minHits, logfilename, outfile, outfilename): + + print "learning shift.... 
will need at least 30 training sites" + previousHit = -1 * maxSpacing + hitList = [-1] + weightList = [0] + readList = [] + shiftDict = {} + count = 0 + numStarts = 0 + for (pos, sense, weight) in hitDict[chrom]: + if abs(pos - previousHit) > maxSpacing or pos == maxCoord: + sumAll = sum(weightList) + if normalize: + sumAll /= hitSampleSize + + regionStart = hitList[0] + regionStop = hitList[-1] + regionLength = regionStop - regionStart + # we're going to require stringent settings + if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen: + foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio) + + if foldRatio >= minRatio: + localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True) + try: + shiftDict[localshift] += 1 + except KeyError: + shiftDict[localshift] = 1 + + count += 1 + + hitList = [] + weightList = [] + readList = [] + numStarts = 0 + + if pos not in hitList: + numStarts += 1 + + hitList.append(pos) + weightList.append(weight) + readList.append((pos, sense, weight)) + previousHit = pos + + bestShift = 0 + bestCount = 0 + outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count) + print outline + writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) + if count < 30: + outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold" + print outline + writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) + shiftValue = 0 + else: + for shift in sorted(shiftDict): + if shiftDict[shift] > bestCount: + bestShift = shift + bestCount = shiftDict[shift] + + shiftValue = bestShift + print shiftDict + + outline = "#picked shiftValue to be %d" % shiftValue + print outline + print >> outfile, outline + writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) + + return shiftValue + + +def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio): + if doControl: + foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll) + else: + foldRatio = minRatio + + return foldRatio + + +def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll): + numMock = 1. 
+ rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True) + if normalize: + numMock /= sampleSize + + foldRatio = sumAll / numMock + + return foldRatio + + +def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti, + normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, + shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, + noMulti, doControl, factor, trimValue, outputRegionList=False): + + index = 0 + total = 0 + failedCounter = 0 + previousHit = - 1 * maxSpacing + currentHitList = [-1] + currentWeightList = [0] + currentReadList = [] + regionWeights = [] + outregions = [] + numStarts = 0 + hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True) + maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti) + for (pos, sense, weight) in hitDict[chrom]: + if abs(pos - previousHit) > maxSpacing or pos == maxCoord: + sumAll = sum(currentWeightList) + if normalize: + sumAll /= rdsSampleSize + + regionStart = currentHitList[0] + regionStop = currentHitList[-1] + regionWeights.append(int(sumAll)) + if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen: + sumMulti = 0. + #first pass uses getFoldRatio on mockRDS as there may not be control + foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll) + if foldRatio >= minRatio: + # first pass, with absolute numbers + if doDirectionality: + (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True) + else: + (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True) + + bestPos = topPos[0] + peakScore = smoothArray[bestPos] + if normalize: + peakScore /= rdsSampleSize + + if doTrim: + minSignalThresh = trimValue * peakScore + start = 0 + stop = regionStop - regionStart - 1 + startFound = False + while not startFound: + if smoothArray[start] >= minSignalThresh or start == bestPos: + startFound = True + else: + start += 1 + + stopFound = False + while not stopFound: + if smoothArray[stop] >= minSignalThresh or stop == bestPos: + stopFound = True + else: + stop -= 1 + + regionStop = regionStart + stop + regionStart += start + try: + if doDirectionality: + (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift) + else: + (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift) + except: + continue + + if normalize: + sumAll /= rdsSampleSize + + foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio) + if outputRegionList: + sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True) + # just in case it changed, use latest data + try: + bestPos = topPos[0] + peakScore = smoothArray[bestPos] + except: + continue + + # normalize to RPM + if normalize: + peakScore /= rdsSampleSize + + elif outputRegionList: + sumMulti = sum(currentWeightList) - currentWeightList.count(1.0) + 
+ if outputRegionList: + # normalize to RPM + if normalize: + sumMulti /= rdsSampleSize + + try: + multiP = 100. * (sumMulti / sumAll) + except: + break + + if noMulti: + multiP = 0. + + # check that we still pass threshold + if sumAll >= minHits and foldRatio >= minRatio and (regionStop - regionStart) > readlen: + plusRatio = float(numPlus)/numHits + if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio: + if outputRegionList: + peak = "" + if listPeak: + peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore) + + if doDirectionality: + if leftPlusRatio < numLeft / numPlus: + index += 1 + if outputRegionList: + plusP = plusRatio * 100. + leftP = 100. * numLeft / numPlus + # we have a region that passes all criteria + outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift)) + + total += sumAll + else: + failedCounter += 1 + else: + # we have a region, but didn't check for directionality + index += 1 + total += sumAll + if outputRegionList: + outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift)) + + currentHitList = [] + currentWeightList = [] + currentReadList = [] + numStarts = 0 + + if pos not in currentHitList: + numStarts += 1 + + currentHitList.append(pos) + currentWeightList.append(weight) + currentReadList.append((pos, sense, weight)) + previousHit = pos + + statistics = {"index": index, + "total": total, + "failed": failedCounter + } + + if outputRegionList: + return statistics, regionWeights, outregions + else: + return statistics, regionWeights + + +def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue): + bestShift = 0 + shiftDict = {} + for region in outregions: + # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest + if doPvalue: + sumAll = int(region[5]) + for i in xrange(sumAll): + pValue *= poissonmean + pValue /= i+1 + + if shiftValue == "auto" and reportshift: + try: + shiftDict[region[-1]] += 1 + except KeyError: + shiftDict[region[-1]] = 1 + + try: + if reportshift: + outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region] + else: + outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]] + except: + if reportshift: + outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region] + else: + outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]] + + if doPvalue: + outputList.append("%1.2g" % pValue) + + outline = string.join(outputList, "\t") + print outline + print >> outfile, outline + + if shiftValue == "auto" and reportshift: + bestCount = 0 + for shift in sorted(shiftDict): + if shiftDict[shift] > bestCount: + bestShift = shift + bestCount = shiftDict[shift] + + return bestShift + + +def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue): + footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])] + if doDirectionality: + footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"]) + + if doRevBackground: + try: + percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100) + except (ValueError, ZeroDivisionError): + percent = 0. 
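+        # empirical FDR: background (swapped) regions as a percentage of sample regions, capped at 100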
+ + footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent)) + + if shiftValue == "auto" and reportshift: + footerList.append("#mode of shift values: %d" % shiftModeValue) + + footer = string.join(footerList, "\n") + + return footer + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/fraction.py b/fraction.py new file mode 100755 index 0000000..f955fce --- /dev/null +++ b/fraction.py @@ -0,0 +1,51 @@ +# +# fraction.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +from random import random +import sys + +print "%s: version 1.0" % sys.argv[0] + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(sys.argv) < 4: + print "usage: python %s fraction infile outfile" % sys.argv[0] + sys.exit(1) + + fraction = float(sys.argv[1]) + infile = sys.argv[2] + outfile = argv[3] + + doFraction(fraction, infile, outfile) + + +def doFraction(fraction, inFileName, outFileName): + infile = open(inFileName) + outfile = open(outFileName, "w") + + totalIndex = 0 + fractionIndex = 0 + for line in infile: + totalIndex += 1 + if random() <= fraction: + outfile.write(line) + fractionIndex += 1 + + infile.close() + outfile.close() + + print "%d / %d = %.2f" % (fractionIndex, totalIndex, float(fractionIndex) / totalIndex) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneDownstreamBins.py b/geneDownstreamBins.py new file mode 100755 index 0000000..058ad82 --- /dev/null +++ b/geneDownstreamBins.py @@ -0,0 +1,149 @@ +# +# geneDownstreamBins.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +# originally from version 1.3 of geneDnaDownstreamCounts.py +import sys, optparse +from commoncode import readDataset +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print "%prog: version 2.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: %prog genome rdsfile outfilename [--max regionSize]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--max", type="int", dest="standardMinDist", + help="maximum region in bp") + parser.set_defaults(standardMinDist=3000) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genome = args[0] + hitfile = args[1] + outfilename = args[2] + + geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist) + + +def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False): + bins = 10 + standardMinThresh = standardMinDist / bins + + hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + normalizationFactor = 1.0 + if normalize: + hitDictSize = len(hitRDS) + normalizationFactor = hitDictSize / 1000000. 
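+    # total reads in millions; per-gene tag counts are multiplied by this factor below
+    # (it remains 1.0 unless normalize=True)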
+ + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) + + hg = Genome(genome) + idb = geneinfoDB(cache=True) + + geneinfoDict = idb.getallGeneInfo(genome) + featuresDict = hg.getallGeneFeatures() + + outfile = open(outfilename, "w") + + gidList = hg.allGIDs() + gidList.sort() + for gid in gidList: + symbol = "LOC" + gid + geneinfo = "" + featureList = [] + try: + geneinfo = geneinfoDict[gid] + featureList = featuresDict[gid] + symbol = geneinfo[0][0] + except: + print gid + + if len(featureList) == 0: + continue + + newfeatureList = [] + for (ftype, chrom, start, stop, fsense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + + if chrom not in hitDict: + continue + + newfeatureList.sort() + if len(newfeatureList) < 1: + continue + + glen = standardMinDist + if fsense == "F": + nextGene = hg.rightGeneDistance((genome, gid), glen * 2) + if nextGene < glen * 2: + glen = nextGene / 2 + + if glen < 1: + glen = 1 + + gstart = newfeatureList[-1][1] + else: + nextGene = hg.leftGeneDistance((genome, gid), glen * 2) + if nextGene < glen * 2: + glen = nextGene / 2 + + if glen < 1: + glen = 1 + + gstart = newfeatureList[0][0] - glen + if gstart < 0: + gstart = 0 + + tagCount = 0 + if glen < standardMinDist: + continue + + binList = [0.] * bins + for (tagStart, sense, weight) in hitDict[chrom]: + tagStart -= gstart + if tagStart >= glen: + break + + if tagStart > 0: + tagCount += weight + if fsense == "F": + # we are relying on python's integer division quirk + binID = tagStart / standardMinThresh + binList[binID] += weight + else: + rdist = glen - tagStart + binID = rdist / standardMinThresh + binList[binID] += weight + + if tagCount < 2: + continue + + tagCount *= normalizationFactor + print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList)) + outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen)) + for binAmount in binList: + outfile.write("\t%.2f" % binAmount) + + outfile.write("\n") + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneLocusBins.py b/geneLocusBins.py new file mode 100755 index 0000000..e6b403f --- /dev/null +++ b/geneLocusBins.py @@ -0,0 +1,136 @@ +# +# geneLocusBins.py +# ENRAGE +# + +# originally from version 1.3 of geneDownstreamBins.py +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print '%s: version 2.1' % sys.argv[0] + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--bins", type="int", dest="bins", + help="number of bins to use [default: 10]") + parser.add_option("--flank", type="int", dest="flankBP", + help="number of flanking BP on both upstream and downstream [default: 0]") + parser.add_option("--upstream", type="int", dest="upstreamBP", + help="number of upstream flanking BP [default: 0]") + parser.add_option("--downstream", type="int", dest="downstreamBP", + help="number of downstream flanking BP [default: 0]") + parser.add_option("--nocds", action="store_false", dest="doCDS", + help="do not CDS") + parser.add_option("--raw", action="store_false", 
dest="normalizeBins", + help="do not normalize results") + parser.add_option("--force", action="store_false", dest="limitNeighbor", + help="limit neighbor region") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--cache", action="store_true", dest="doCache", + help="use cache") + parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genome = args[0] + hitfile = args[1] + outfilename = args[2] + + upstreamBp = 0 + downstreamBp = 0 + doFlank = False + if options.flankBP is not None: + upstreamBp = options.flankBP + downstreamBp = options.flankBP + doFlank = True + + if options.upstreamBP is not None: + upstreamBp = options.upstreamBP + doFlank = True + + if options.downstreamBP is not None: + downstreamBp = options.downstreamBP + doFlank = True + + geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile) + + +def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None): + if acceptfile is None: + acceptDict = {} + else: + acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + hitRDS = readDataset(hitfile, verbose = True, cache=doCache) + readlen = hitRDS.getReadSize() + normalizationFactor = 1.0 + if normalizeBins: + totalCount = len(hitRDS) + normalizationFactor = totalCount / 1000000. + + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) + + hg = Genome(genome) + idb = geneinfoDB(cache=doCache) + + geneinfoDict = idb.getallGeneInfo(genome) + if doFlank: + locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor) + else: + locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) + + gidList = hg.allGIDs() + gidList.sort() + for chrom in acceptDict: + for (label, start, stop, length) in acceptDict[chrom]: + if label not in gidList: + gidList.append(label) + + (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False) + + outfile = open(outfilename,'w') + + for gid in gidList: + if 'FAR' not in gid: + symbol = 'LOC' + gid + geneinfo = '' + try: + geneinfo = geneinfoDict[gid] + symbol = geneinfo[0][0] + except: + pass + else: + symbol = gid + if gid in gidBins and gid in gidLen: + tagCount = 0. + for binAmount in gidBins[gid]: + tagCount += binAmount + outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid])) + for binAmount in gidBins[gid]: + if normalizeBins: + if tagCount == 0: + tagCount = 1 + outfile.write('\t%.1f' % (100. 
* binAmount / tagCount)) + else: + outfile.write('\t%.1f' % binAmount) + outfile.write('\n') + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneLocusCounts.py b/geneLocusCounts.py new file mode 100755 index 0000000..0e8792b --- /dev/null +++ b/geneLocusCounts.py @@ -0,0 +1,138 @@ +# +# geneLocusCounts.py +# ENRAGE +# +""" usage: python geneLocusCounts genome readDB outfilename [upstream] [downstream] [--noCDS] [--spanTSS] [--locusLength bplength] [--regions acceptfile] [--noUniqs] [--multi] [--splices] + where upstream and downstream are in bp and and optional + using noCDS requires either upstream or downstream (but not both) + to be nonzero. Using -locuslength will report the first bplength + or the last bplength of the gene region depending on whether it + is positive or negative. + will by default only count the uniq reads (use -noUniqs to turn off) + but can also count multi and splice reads given the appropriate flags +""" +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset, getMergedRegions, getLocusByChromDict +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print '%s: version 3.0' % sys.argv[0] + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome readDB outfilename [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--noUniqs", action="store_false", dest="doUniqs", + help="do not count unique reads") + parser.add_option("--multi", action="store_true", dest="doUniqs", + help="count multi reads") + parser.add_option("--splices", action="store_true", dest="doUniqs", + help="count splice reads") + parser.add_option("--spanTSS", action="store_true", dest="spanTSS") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--noCDS", action="store_false", dest="useCDS") + parser.add_option("--locusLength", type="int", dest="bplength", + help="number of bases to report") + parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="") + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print __doc__ + sys.exit(1) + + genome = args[0] + hitfile = args[1] + outfilename = args[2] + + upstream = 0 + downstream = 0 + try: + upstream = int(args[3]) + except ValueError: + pass + except IndexError: + pass + + try: + if "-" not in args[3]: + downstream = int(args[4]) + except ValueError: + pass + + geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile) + + +def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""): + print 'returning only up to %d bp from gene locus' % bplength + print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS) + + if acceptfile: + acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + + hitRDS = readDataset(hitfile, verbose = True) + + totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices) + + hg = Genome(genome) + idb = geneinfoDB(cache=True) + + gidCount = {} + gidList = [] + gidLen = {} + geneinfoDict = idb.getallGeneInfo(genome) + locusByChromDict = getLocusByChromDict(hg, upstream, downstream, 
useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength) + + locusChroms = locusByChromDict.keys() + chromList = hitRDS.getChromosomes(fullChrom=False) + chromList.sort() + for chrom in chromList: + if chrom == 'M' or chrom not in locusChroms: + continue + + print 'chr' + chrom + fullchrom = 'chr' + chrom + hitRDS.memSync(fullchrom, index=True) + for (start, stop, gid, length) in locusByChromDict[chrom]: + if gid not in gidList: + gidList.append(gid) + gidCount[gid] = 0 + gidLen[gid] = length + + gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices) + + outfile = open(outfilename,'w') + + totalCount /= 1000000. + + outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n') + gidList.sort() + for gid in gidList: + if 'FAR' not in gid: + symbol = 'LOC' + gid + geneinfo = '' + try: + geneinfo = geneinfoDict[gid] + symbol = geneinfo[0][0] + except: + pass + else: + symbol = gid + + if gid in gidCount and gid in gidLen: + rpm = gidCount[gid] / totalCount + rpkm = 1000. * rpm / gidLen[gid] + outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm)) + + outfile.close() + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneLocusPeaks.py b/geneLocusPeaks.py new file mode 100755 index 0000000..fdfddf9 --- /dev/null +++ b/geneLocusPeaks.py @@ -0,0 +1,117 @@ +# +# geneLocusPeaks.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +from commoncode import readDataset, getMergedRegions, findPeak, getLocusByChromDict +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +import sys, optparse + +print "%prog: version 2.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--up", type="int", dest="upstream") + parser.add_option("--down", type="int", dest="downstream") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--raw", action="store_false", dest="normalize") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.set_defaults(upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + print "\twhere upstream and downstream are in bp and and optional" + sys.exit(1) + + genome = args[0] + hitfile = args[1] + outfilename = args[2] + + geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache) + + +def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False): + acceptDict = {} + + if acceptfile: + acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + + print "upstream = %d downstream = %d" % (upstream, downstream) + + hitRDS = readDataset(hitfile, verbose = True, cache=doCache) + readlen = hitRDS.getReadSize() + normalizationFactor = 1.0 + if normalize: + totalCount = len(hitRDS) + normalizationFactor = totalCount / 1000000. 
+ + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) + + hg = Genome(genome) + idb = geneinfoDB(cache=True) + + gidCount = {} + gidPos = {} + geneinfoDict = idb.getallGeneInfo(genome) + locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict) + + gidList = hg.allGIDs() + gidList.sort() + for chrom in acceptDict: + for (label, start, stop, length) in acceptDict[chrom]: + if label not in gidList: + gidList.append(label) + + for gid in gidList: + gidCount[gid] = 0 + + for chrom in hitDict: + if chrom not in locusByChromDict: + continue + + print chrom + for (start, stop, gid, glen) in locusByChromDict[chrom]: + gidCount[gid] = 0. + (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[chrom], start, glen, readlen) + if len(topPos) > 0: + gidCount[gid] = smoothArray[topPos[0]] + gidPos[gid] = (chrom, start + topPos[0]) + else: + gidPos[gid] = (chrom, start) + + outfile = open(outfilename, "w") + + for gid in gidList: + if "FAR" not in gid: + symbol = "LOC" + gid + geneinfo = "" + try: + geneinfo = geneinfoDict[gid] + symbol = geneinfo[0][0] + except: + pass + else: + symbol = gid + + if gid in gidCount and gid in gidPos: + (chrom, pos) = gidPos[gid] + outfile.write("%s\t%s\tchr%s\t%d\t%.2f\n" % (gid, symbol, chrom, pos, gidCount[gid])) + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneMrnaCounts.py b/geneMrnaCounts.py new file mode 100755 index 0000000..b905cf0 --- /dev/null +++ b/geneMrnaCounts.py @@ -0,0 +1,198 @@ +try: + import psyco + psyco.full() +except: + print "psyco not running" + +import sys +import optparse +from commoncode import readDataset, getFeaturesByChromDict +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print "%s: version 5.1" % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome rdsfile outfilename [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--stranded", action="store_true", dest="trackStrand") + parser.add_option("--splices", action="store_true", dest="doSplices") + parser.add_option("--noUniqs", action="store_false", dest="doUniqs") + parser.add_option("--multi", action="store_true", dest="doMulti") + parser.add_option("--models", dest="extendGenome") + parser.add_option("--replacemodels", action="store_true", dest="replaceModels") + parser.add_option("--searchGID", action="store_true", dest="searchGID") + parser.add_option("--countfeatures", action="store_true", dest="countFeats") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--markGID", action="store_true", dest="markGID") + parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False, + extendGenome="", replaceModels=False, searchGID=False, + countFeats=False, cachePages=None, markGID=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genomeName = args[0] + hitfile = args[1] + outfilename = args[2] + + geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices, + options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels, + options.searchGID, options.countFeats, options.cachePages, options.markGID) + + +def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False, + doUniqs=True, doMulti=False, extendGenome="", replaceModels=False, + searchGID=False, 
countFeats=False, cachePages=None, markGID=False): + + if trackStrand: + print "will track strandedness" + doStranded = "track" + else: + doStranded = "both" + + if extendGenome: + if replaceModels: + print "will replace gene models with %s" % extendGenome + else: + print "will extend gene models with %s" % extendGenome + else: + replaceModels = False + + if cachePages is not None: + doCache = True + else: + cachePages = 100000 + doCache = False + + hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + if cachePages > hitRDS.getDefaultCacheSize(): + hitRDS.setDBcache(cachePages) + + genome = Genome(genomeName, inRAM=True) + if extendGenome != "": + genome.extendFeatures(extendGenome, replace=replaceModels) + + print "getting gene features...." + featuresByChromDict = getFeaturesByChromDict(genome) + + seenFeaturesByChromDict = {} + print "getting geneIDs...." + gidList = genome.allGIDs() + gidList.sort() + gidCount = {} + for gid in gidList: + gidCount[gid] = 0 + + chromList = hitRDS.getChromosomes(fullChrom=False) + if len(chromList) == 0 and doSplices: + chromList = hitRDS.getChromosomes(table="splices", fullChrom=False) + + if markGID: + print "Flagging all reads as NM" + hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices) + + for chrom in chromList: + if chrom not in featuresByChromDict: + continue + + if countFeats: + seenFeaturesByChromDict[chrom] = [] + + print "\nchr%s" % chrom + fullchrom = "chr%s" % chrom + regionList = [] + print "counting GIDs" + for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]: + try: + if doStranded == "track": + checkSense = "+" + if featureSense == "R": + checkSense = "-" + + regionList.append((gid, fullchrom, start, stop, checkSense)) + count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense) + else: + regionList.append((gid, fullchrom, start, stop)) + count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices) + if count != 0: + print count + + gidCount[gid] += count + if countFeats: + if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]: + seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense)) + except: + print "problem with %s - skipping" % gid + + if markGID: + print "marking GIDs" + hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded) + print "finished marking" + + print " " + if countFeats: + numFeatures = countFeatures(seenFeaturesByChromDict) + print "saw %d features" % numFeatures + + writeOutputFile(outfilename, genome, gidList, gidCount, searchGID) + if markGID and doCache: + hitRDS.saveCacheDB(hitfile) + + +def countFeatures(seenFeaturesByChromDict): + count = 0 + for chrom in seenFeaturesByChromDict.keys(): + try: + count += len(seenFeaturesByChromDict[chrom]) + except TypeError: + pass + + return count + + +def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID): + geneAnnotDict = genome.allAnnotInfo() + genomeName = genome.genome + outfile = open(outfilename, "w") + idb = geneinfoDB(cache=True) + geneInfoDict = idb.getallGeneInfo(genomeName) + for gid in gidList: + symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict) + if gid in gidCount: + outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid])) + else: + outfile.write("%s\t%s\t0\n" % (gid, symbol)) + + outfile.close() + + +def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict): + 
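+    # Resolve a display symbol for a gene ID: optionally map the gid through the gene
+    # info database (searchGID), fall back to the genome annotation, and default to
+    # "LOC<gid>" when no symbol is found.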
lookupGID = gid + if searchGID and gid not in geneInfoDict: + actualGeneID = idb.getGeneID(genomeName, gid) + if len(actualGeneID) > 0: + lookupGID = actualGeneID[1] + + try: + geneinfo = geneInfoDict[lookupGID] + symbol = geneinfo[0][0] + except (KeyError, IndexError): + try: + symbol = geneAnnotDict[(genomeName, gid)][0] + except (KeyError, IndexError): + symbol = "LOC%s" % gid + + return symbol + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py new file mode 100755 index 0000000..7acf0b9 --- /dev/null +++ b/geneMrnaCountsWeighted.py @@ -0,0 +1,266 @@ +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +import sys, optparse +from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB + +print '%s: version 4.1' % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--stranded", action="store_false", dest="ignoreSense") + parser.add_option("--uniq", action="store_true", dest="withUniqs") + parser.add_option("--multi", action="store_true", dest="withMulti") + parser.add_option("--record", action="store_true", dest="recording", + help="ignored with uniq reads") + parser.add_option("--accept", dest="acceptfile") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.add_option("--models", dest="extendGenome") + parser.add_option("--replacemodels", action="store_true", dest="replaceModels") + parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False, + acceptfile=None, cachePages=None, doVerbose=False, extendGenome="", + replaceModels=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + genome = args[0] + hitfile = args[1] + countfile = args[2] + outfilename = args[3] + + geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense, + options.withUniqs, options.withMulti, options.recording, + options.acceptfile, options.cachePages, options.doVerbose, + options.extendGenome, options.replaceModels) + + +def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True, + withUniqs=False, withMulti=False, recording=False, acceptfile=None, + cachePages=None, doVerbose=False, extendGenome="", replaceModels=False): + + if (not withUniqs and not withMulti) or (withUniqs and withMulti): + print "must have either one of -uniq or -multi set. 
Exiting" + sys.exit(1) + + if cachePages is not None: + cacheGeneDB(genome) + hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True) + idb = geneinfoDB(cache=True) + print "%s cached" % genome + doCache = True + else: + doCache = False + cachePages = 0 + hg = Genome(genome, inRAM=True) + idb = geneinfoDB() + + if acceptfile is not None: + acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + else: + acceptDict = {} + + if recording and withUniqs: + recording = False + + if extendGenome: + if replaceModels: + print "will replace gene models with %s" % extendGenome + else: + print "will extend gene models with %s" % extendGenome + else: + replaceModels = False + + if extendGenome != "": + hg.extendFeatures(extendGenome, replace = replaceModels) + + hitRDS = readDataset(hitfile, verbose = True, cache=doCache) + if cachePages > hitRDS.getDefaultCacheSize(): + hitRDS.setDBcache(cachePages) + + readlen = hitRDS.getReadSize() + + geneinfoDict = idb.getallGeneInfo(genome) + geneannotDict = hg.allAnnotInfo() + gidCount = {} + gidReadDict = {} + + featuresByChromDict = getFeaturesByChromDict(hg, acceptDict) + gidList = hg.allGIDs() + + gidList.sort() + for chrom in acceptDict: + for (label, start, stop, length) in acceptDict[chrom]: + if label not in gidList: + gidList.append(label) + + for gid in gidList: + gidCount[gid] = 0 + gidReadDict[gid] = [] + + uniqueCountDict = {} + read2GidDict = {} + + uniquecounts = open(countfile) + for line in uniquecounts: + fields = line.strip().split() + # add a pseudo-count here to ease calculations below + uniqueCountDict[fields[0]] = float(fields[-1]) + 1 + + uniquecounts.close() + + outfile = open(outfilename, "w") + + index = 0 + if withMulti and not withUniqs: + chromList = hitRDS.getChromosomes(table="multi", fullChrom=False) + else: + chromList = hitRDS.getChromosomes(fullChrom=False) + + for achrom in chromList: + if achrom not in featuresByChromDict: + continue + + print "\n" + achrom + " ", + startFeature = 0 + fullchrom = "chr" + achrom + hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti) + featList = featuresByChromDict[achrom] + if ignoreSense: + for (tagStart, tagReadID) in hitDict[fullchrom]: + index += 1 + if index % 100000 == 0: + print "read %d" % index, + + stopPoint = tagStart + readlen + if startFeature < 0: + startFeature = 0 + + for (start, stop, gid, sense, ftype) in featList[startFeature:]: + if tagStart > stop: + startFeature += 1 + continue + + if start > stopPoint: + startFeature -= 100 + break + + if start <= tagStart <= stop: + try: + gidReadDict[gid].append(tagReadID) + if tagReadID in read2GidDict: + if gid not in read2GidDict[tagReadID]: + read2GidDict[tagReadID].append(gid) + else: + read2GidDict[tagReadID] = [gid] + + gidCount[gid] += 1 + except: + print "gid %s not in gidReadDict" % gid + + stopPoint = stop + else: + for (tagStart, tSense, tagReadID) in hitDict[fullchrom]: + index += 1 + if index % 100000 == 0: + print "read %d" % index, + + stopPoint = tagStart + readlen + if startFeature < 0: + startFeature = 0 + + for (start, stop, gid, sense, ftype) in featList[startFeature:]: + if tagStart > stop: + startFeature += 1 + continue + + if start > stopPoint: + startFeature -= 100 + break + + if sense == "R": + sense = "-" + else: + sense = "+" + + if start <= tagStart <= stop and sense == tSense: + try: + gidReadDict[gid].append(tagReadID) + if tagReadID in read2GidDict: + if gid not in read2GidDict[tagReadID]: + 
read2GidDict[tagReadID].append(gid) + else: + read2GidDict[tagReadID] = [gid] + + gidCount[gid] += 1 + except: + print "gid %s not in gidReadDict" % gid + + stopPoint = stop + + for gid in gidList: + if "FAR" not in gid: + symbol = "LOC" + gid + geneinfo = "" + try: + geneinfo = geneinfoDict[gid] + if genome == "celegans": + symbol = geneinfo[0][1] + else: + symbol = geneinfo[0][0] + except: + try: + symbol = geneannotDict[(genome, gid)][0] + except: + symbol = "LOC" + gid + else: + symbol = gid + + tagCount = 0. + for readID in gidReadDict[gid]: + try: + tagValue = uniqueCountDict[gid] + except: + tagValue = 1 + + tagDenom = 0. + for aGid in read2GidDict[readID]: + try: + tagDenom += uniqueCountDict[aGid] + except: + tagDenom += 1 + + try: + tagCount += tagValue / tagDenom + except ZeroDivisionError: + tagCount = 0 + + if doVerbose: + print "%s %s %f" % (gid, symbol, tagCount) + + outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount)) + + outfile.close() + + if doCache: + uncacheGeneDB(genome) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneNeighbors.py b/geneNeighbors.py new file mode 100755 index 0000000..8ec363c --- /dev/null +++ b/geneNeighbors.py @@ -0,0 +1,152 @@ +# +# geneNeighbors.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import getMergedRegions, getLocusByChromDict +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print "%prog: version 2.4" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome outfilename [--regions acceptfile] [--downstream bp] [--upstream bp] [--mindist bp] [--minlocus bp] [--maxlocus bp] [--samesense]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--regions", dest="acceptFile") + parser.add_option("--downstream", type="int", dest="downMax") + parser.add_option("--upstream", type="int", dest="upMax") + parser.add_option("--mindist", type="int", dest="minDist") + parser.add_option("--minlocus", type="int", dest="minLocus") + parser.add_option("--maxlocus", type="int", dest="maxLocus") + parser.add_option("--samesense", action="store_true", dest="checkSense") + parser.set_defaults(acceptfile="", checkSense=False, downMax=10000000, + upMax=10000000, minDist=0, minLocus=-1, maxLocus=10000000) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) + + genome = args[0] + outfilename = args[1] + + index = geneNeighbors(genome, outfilename, options.acceptFile, options.checkSense, + options.downMax, options.upMax, options.minDist, options.minLocus, + options.maxLocus) + + print "\n%d genes matched" % index + + +def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False, + downMax=10000000, upMax=10000000, minDist=0, minLocus=-1, + maxLocus=10000000): + + acceptDict = {} + if acceptfile: + acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + + hg = Genome(genome) + idb = geneinfoDB(cache=True) + + geneinfoDict = idb.getallGeneInfo(genome) + locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) + + gidList = hg.allGIDs() + gidList.sort() + for chrom in acceptDict: + for (label, start, stop, length) in acceptDict[chrom]: + if label not in gidList: + gidList.append(label) + + index = 0 + outfile = open(outfilename,"w") + chromList = locusByChromDict.keys() + chromList.sort() + for chrom in chromList: + if 
len(locusByChromDict[chrom]) < 3 or "NT" in chrom or "MT" in chrom: + continue + + print chrom + " ", + + prevStop = locusByChromDict[chrom][0][1] + prevGID = locusByChromDict[chrom][0][2] + if "FAR" not in prevGID: + symbol = "LOC" + prevGID + geneinfo = "" + try: + geneinfo = geneinfoDict[prevGID] + symbol = geneinfo[0][0] + except: + pass + else: + symbol = prevGID + + prevGID = symbol + prevSense = locusByChromDict[chrom][0][4] + + currentStart = locusByChromDict[chrom][1][0] + currentStop = locusByChromDict[chrom][1][1] + currentGID = locusByChromDict[chrom][1][2] + if "FAR" not in currentGID: + symbol = "LOC" + currentGID + geneinfo = "" + try: + geneinfo = geneinfoDict[currentGID] + symbol = geneinfo[0][0] + except: + pass + else: + symbol = currentGID + + currentGID = symbol + currentGlen = locusByChromDict[chrom][1][3] + currentSense = locusByChromDict[chrom][1][4] + + for (nextStart, nextStop, nextGID, nextGlen, nextSense) in locusByChromDict[chrom][2:]: + if "FAR" not in nextGID: + symbol = "LOC" + nextGID + geneinfo = "" + try: + geneinfo = geneinfoDict[nextGID] + symbol = geneinfo[0][0] + except: + pass + else: + symbol = nextGID + + nextGID = symbol + leftDist = currentStart - prevStop + rightDist = nextStart - currentStop + if (currentSense == "F" and minDist < leftDist < upMax and minDist < rightDist < downMax) or (currentSense == "R" and minDist < rightDist < upMax and minDist < leftDist < downMax): + if not checkSense or currentSense == nextSense: + if minLocus <= currentGlen <= maxLocus: + outfile.write("%s\t%s\t%s\t%s\t%d\t%s\t%s\t%d\n" % (currentGID, currentSense, prevGID, prevSense, leftDist, nextGID, nextSense, rightDist)) + index += 1 + + prevStop = currentStop + prevGID = currentGID + prevSense = currentSense + currentStart = nextStart + currentStop = nextStop + currentGID = nextGID + currentGlen = nextGlen + currentSense = nextSense + + outfile.close() + return index + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneStallingBins.py b/geneStallingBins.py new file mode 100755 index 0000000..f08abe6 --- /dev/null +++ b/geneStallingBins.py @@ -0,0 +1,156 @@ +# +# +# geneStallingBins.py +# ENRAGE +# + +# originally from geneLocusBins.py +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print "%prog: version 1.3" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--upstream", type="int", dest="upstreamBp") + parser.add_option("--downstream", type="int", dest="downstreamBp") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--normalize", action="store_true", dest="normalize") + parser.add_option("--tagCount", action="store_true", dest="doTagCount") + parser.add_option("--bins", type="int", dest="bins") + parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="", + doCache=False, normalize=False, doTagCount=False, bins=4) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + genome = args[0] + hitfile = args[1] + controlfile = args[2] + 
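+    # positional arguments: genome, sample RDS file, control RDS file, output table;
+    # reads from both datasets are binned over each gene locus (default 4 bins, --bins to change)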
outfilename = args[3] + + geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp, + options.downstreamBp, options.acceptfile, options.doCache, + options.normalize, options.doTagCount, options.bins) + + +def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300, + downstreamBp=0, acceptfile="", doCache=False, normalize=False, + doTagCount=False, bins=4): + + acceptDict = {} + if acceptfile: + acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + + doCDS = True + limitNeighbor = False + + hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + readlen = hitRDS.getReadSize() + hitNormalizationFactor = 1.0 + if normalize: + hitDictSize = len(hitRDS) + hitNormalizationFactor = hitDictSize / 1000000. + + controlRDS = readDataset(hitfile, verbose=True, cache=doCache) + controlNormalizationFactor = 1.0 + if normalize: + controlDictSize = len(hitRDS) + controlNormalizationFactor = controlDictSize / 1000000. + + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) + controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True) + + hg = Genome(genome) + idb = geneinfoDB(cache=doCache) + + geneinfoDict = idb.getallGeneInfo(genome) + locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor) + + gidList = hg.allGIDs() + gidList.sort() + for chrom in acceptDict: + for (label, start, stop, length) in acceptDict[chrom]: + if label not in gidList: + gidList.append(label) + + (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp) + (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp) + + outfile = open(outfilename, "w") + + for gid in gidList: + if "FAR" not in gid: + symbol = "LOC" + gid + geneinfo = "" + try: + geneinfo = geneinfoDict[gid] + symbol = geneinfo[0][0] + except: + pass + else: + symbol = gid + + if gid in gidBins and gid in gidLen: + tagCount = 0. + controlCount = 0. + for binAmount in gidBins[gid]: + tagCount += binAmount + + for binAmount in controlBins[gid]: + controlCount += abs(binAmount) + + diffCount = tagCount + controlCount + if diffCount < 0: + diffCount = 0 + + outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid])) + if (gidLen[gid] - 3 * upstreamBp) < upstreamBp: + outfile.write("\tshort\n") + continue + + TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.) + finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.) + if finalbin <= 0.: + finalbin = 0.01 + + if TSSbins < 0: + TSSbins = 0 + + ratio = float(TSSbins)/float(finalbin) + for binAmount in gidBins[gid]: + if doTagCount: + binAmount = binAmount * tagCount / 100. + + if normalize: + if tagCount == 0: + tagCount = 1 + + outfile.write("\t%.1f" % (100. 
* binAmount / tagCount)) + else: + outfile.write("\t%.1f" % binAmount) + + outfile.write("\t%.2f\n" % ratio) + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/geneStartBins.py b/geneStartBins.py new file mode 100755 index 0000000..cbb3c4a --- /dev/null +++ b/geneStartBins.py @@ -0,0 +1,134 @@ +# +# geneStartBins.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +# originally from version 1.3 of geneDownstreamBins.py +from commoncode import * +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +import sys + +print '%s: version 2.0' % sys.argv[0] +if len(sys.argv) < 4: + print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0] + print '\n\twhere regionSize is the optional maximum region in bp\n' + sys.exit(1) + +genome = sys.argv[1] +hitfile = sys.argv[2] +outfilename = sys.argv[3] + +standardMinDist = 3000 +if '-max' in sys.argv: + standardMinDist = int(sys.argv[sys.argv.index('-max') + 1]) + +if '-raw' in sys.argv: + normalize = False + normalizeBins = False +else: + normalize = True + normalizeBins = True + +doCache = False +if '-cache' in sys.argv: + doCache = True + +bins = 10 +standardMinThresh = standardMinDist / bins + +hitRDS = readDataset(hitfile, verbose = True, cache=doCache) +readlen = hitRDS.getReadSize() +normalizationFactor = 1.0 +if normalize: + totalCount = len(hitRDS) + normalizationFactor = totalCount / 1000000. + +hg = Genome(genome) +idb = geneinfoDB(cache=True) + +gidDict = {} +geneinfoDict = idb.getallGeneInfo(genome) +featuresDict = hg.getallGeneFeatures() + +#infile = open(infilename) +outfile = open(outfilename,'w') + +gidList = hg.allGIDs() +gidList.sort() +for gid in gidList: + symbol = 'LOC' + gid + geneinfo = '' + featureList = [] + try: + geneinfo = geneinfoDict[gid] + featureList = featuresDict[gid] + symbol = geneinfo[0][0] + except: + print geneinfo + newfeatureList = [] + if len(featureList) == 0: + continue + for (ftype, chrom, start, stop, fsense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + if chrom not in hitDict: + continue + newfeatureList.sort() + if len(newfeatureList) < 1: + #print '%s %s %d' % (gid, symbol, -1) + #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1)) + continue + glen = standardMinDist / 2 + if fsense == 'F': + nextGene = hg.leftGeneDistance((genome, gid), glen * 2) + if nextGene < glen * 2: + glen = nextGene / 2 + if glen < 1: + glen = 1 + gstart = newfeatureList[0][0] - glen + if gstart < 0: + gstart = 0 + gstop = newfeatureList[0][0] + glen + else: + nextGene = hg.rightGeneDistance((genome, gid), glen * 2) + if nextGene < glen * 2: + glen = nextGene / 2 + if glen < 1: + glen = 1 + gstart = newfeatureList[-1][1] - glen + gstop = newfeatureList[-1][1] + glen + tagCount = 0 + if glen < standardMinDist / 2: + continue + binList = [0] * bins + for (tagStart, sense, weight) in hitDict[chrom]: + tagStart -= gstart + if tagStart >= 2 * glen: + break + if tagStart > 0: + tagCount += weight + if fsense == 'R': + # we are relying on python's integer division quirk + binID = tagStart / standardMinThresh + binList[binID] += weight + else: + rdist = 2 * glen - tagStart + binID = rdist / standardMinThresh + binList[binID] += weight + if tagCount < 2: + continue + print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList)) + outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen)) + for binAmount in binList: + 
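+        # append one column per bin: weighted tag counts across the 2 * glen window around
+        # the gene start, binned every standardMinThresh bp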
outfile.write('\t%d' % binAmount) + outfile.write('\n') +#infile.close() +outfile.close() + diff --git a/geneUpstreamBins.py b/geneUpstreamBins.py new file mode 100755 index 0000000..e855416 --- /dev/null +++ b/geneUpstreamBins.py @@ -0,0 +1,149 @@ +# +# geneUpstreamBins.py +# ENRAGE +# +# originally from version 1.3 of geneDownstreamBins.py +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print "%prog: version 2.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--raw", action="store_false", dest="normalize", + help="maximum region in bp") + parser.add_option("--max", type="int", dest="standardMinDist") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genome = args[0] + hitfile = args[1] + outfilename = args[3] + + geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache) + + +def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False): + bins = 10 + standardMinThresh = standardMinDist / bins + + hitRDS = readDataset(hitfile, verbose = True, cache=doCache) + normalizationFactor = 1.0 + if normalize: + totalCount = len(hitRDS) + normalizationFactor = totalCount / 1000000. + + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) + + hg = Genome(genome) + idb = geneinfoDB(cache=True) + + geneinfoDict = idb.getallGeneInfo(genome) + featuresDict = hg.getallGeneFeatures() + + outfile = open(outfilename,"w") + + gidList = hg.allGIDs() + gidList.sort() + for gid in gidList: + symbol = "LOC" + gid + geneinfo = "" + featureList = [] + try: + geneinfo = geneinfoDict[gid] + featureList = featuresDict[gid] + symbol = geneinfo[0][0] + except: + print geneinfo + + newfeatureList = [] + if len(featureList) == 0: + continue + + for (ftype, chrom, start, stop, fsense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + + if chrom not in hitDict: + continue + + newfeatureList.sort() + if len(newfeatureList) < 1: + continue + + glen = standardMinDist + if fsense == "F": + nextGene = hg.leftGeneDistance((genome, gid), glen * 2) + if nextGene < glen * 2: + glen = nextGene / 2 + + if glen < 1: + glen = 1 + + gstart = newfeatureList[0][0] - glen + if gstart < 0: + gstart = 0 + + else: + nextGene = hg.rightGeneDistance((genome, gid), glen * 2) + if nextGene < glen * 2: + glen = nextGene / 2 + + if glen < 1: + glen = 1 + + gstart = newfeatureList[-1][1] + + tagCount = 0 + if glen < standardMinDist: + continue + + binList = [0] * bins + for (tagStart, sense, weight) in hitDict[chrom]: + tagStart -= gstart + if tagStart >= glen: + break + + if tagStart > 0: + tagCount += weight + if fsense == "R": + # we are relying on python's integer division quirk + binID = tagStart / standardMinThresh + binList[binID] += weight + else: + rdist = glen - tagStart + binID = rdist / standardMinThresh + binList[binID] += weight + + if tagCount < 2: + continue + + print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList)) + 
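+        # one row per gene: gid, symbol, tag count (times the per-million factor unless
+        # --raw was given), upstream window length glen, then the ten bin counts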
outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen)) + for binAmount in binList: + outfile.write("\t%d" % binAmount) + outfile.write("\n") + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getGOgenes.py b/getGOgenes.py new file mode 100755 index 0000000..0a320ee --- /dev/null +++ b/getGOgenes.py @@ -0,0 +1,113 @@ +import sys, optparse +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB + +print "%prog: version 3.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s genome GOID1 [GOID2 ....] [--outfile outfilename] [--append] [--restrict genefile]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--outfile", dest="outfilename") + parser.add_option("--append", action="store_true", dest="append") + parser.add_option("--restrict", dest="restrictfilename") + parser.set_defaults(outfilename=None, restrictfilename=None, append=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) + + genome = args[0] + + GOIDlist = [] + for arg in args: + if "GO:" in arg: + GOIDlist.append(arg) + + getGOgenes(genome, GOIDlist, options.outfilename, options.restrictfilename, options.append) + + +def getGOgenes(genome, GOIDlist, outfilename=None, restrictfilename=None, append=False): + writeOut = False + if outfilename is not None: + writeOut = True + + restrict = False + if restrictfilename is not None: + restrict = True + + hg = Genome(genome) + idb = geneinfoDB() + + print sys.argv + print GOIDlist + + firstGeneList = [] + for GOID in GOIDlist: + testList = hg.allGIDsbyGOID(GOID) + print "GOID: %s (%d)" % (GOID, len(testList)) + firstGeneList += testList + + geneDict = {} + for gid in firstGeneList: + geneDict[gid] = 1 + + geneList = geneDict.keys() + print len(geneList) + geneInfoList = idb.getallGeneInfo(genome) + + if writeOut: + if append: + outfile = open(outfilename, "a") + else: + outfile = open(outfilename, "w") + + for GOID in GOIDlist: + outfile.write("#%s\n" % GOID) + + restrictList = [] + restrictDict = {} + if restrict: + restrictFile = open(restrictfilename) + for line in restrictFile: + fields = line.strip().split() + restrictList.append(fields[0]) + restrictDict[fields[0]] = line + + outList = [] + symbolDict = {} + for gid in geneList: + symbol = "LOC" + gid + if restrict and gid not in restrictList: + continue + + try: + symbol = geneInfoList[gid][0][0] + except: + pass + + if restrict: + symbolDict[symbol] = restrictDict[gid] + + outList.append(symbol) + + outList.sort() + for symbol in outList: + if writeOut: + if restrict: + outfile.write(symbolDict[symbol]) + else: + outfile.write(symbol + "\n") + else: + print symbol + + if writeOut: + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getNovelSNPs.py b/getNovelSNPs.py new file mode 100755 index 0000000..0936a7f --- /dev/null +++ b/getNovelSNPs.py @@ -0,0 +1,96 @@ +# +# getNovelSNPs.py +# ENRAGE +# +# This script attempts to annotate the novel sncs/snps from the snp summary file +# Written by: Wendy Lee +# Written on: Aug 7th, 2008 + +import sys +import string +from cistematic.genomes import Genome +from commoncode import writeLog + +print "%prog: version 1.5" + +try: + import psyco + psyco.full() +except: + pass + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s genome snpsfile nondbsnp_geneinfo_outfile" % argv[0] + 
+ if len(argv) < 4: + print usage + sys.exit(2) + + genome = argv[1] + snpfile = argv[2] + outfilename = argv[3] + + getNovelSNPsFromFile(genome, snpfile, outfilename) + + +def getNovelSNPsFromFile(genome, snpfile, outfilename): + infile = file(snpfile, "r") + writeNovelSNPFile(genome, infile, outfilename) + writeLog("snp.log", sys.argv[0], "outputfile: %s" % outfilename) + infile.close() + + +def writeNovelSNPFile(genome, snpPropertiesList, outfilename): + hg = Genome(genome) + outString = "" + outfile = open(outfilename, "w") + outfile.write("#Sl\tCl\tchrom\tmis pos\t\tmatch\tuniq_mis\ttot_mis\tbase_chg\tknown_snp\tfunction\tgene\tgeneId\trpkm\n") + for line in snpPropertiesList: + if doNotProcessLine(line): + continue + + outString = getNovelSNPInfo(genome, line, hg) + if outString == line: + outfile.write(outString) + else: + outfile.write("%s\n" % outString) + + outfile.close() + + +def doNotProcessLine(line): + return line[0] == "#" + + +def getNovelSNPInfo(genome, snpEntry, hg): + fields = snpEntry.split() + #TODO: refactor naming. is fields[8] rpkm? + if fields[8].find("N\A") == -1: + return snpEntry + else: + snpInfo = "" + gid = fields[11] + snc_start = int(fields[3]) + featuresList = hg.getGeneFeatures((genome, gid)) + func = "N\A" + for (ftype, chromosome, start, stop, orientation) in featuresList: + if int(start) <= snc_start <= int(stop): + func = ftype + break + + for i in range (0, 9): + snpInfo = string.join([snpInfo, fields[i]], "\t") + + snpInfo = string.join([snpInfo, func], "\t") + for i in range (10, 13): + snpInfo = string.join([snpInfo, fields[i]], "\t") + + return snpInfo + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getSNPGeneInfo.py b/getSNPGeneInfo.py new file mode 100755 index 0000000..307413b --- /dev/null +++ b/getSNPGeneInfo.py @@ -0,0 +1,177 @@ +# +# getSNPGeneInfo.py +# ENRAGE +# +# This script look for the gene info and expression level for the snps. 
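+# Expects a SNP summary file (e.g. the output of getSNPs.py) with the chromosome in
+# column 3 as "chrN" and the position in column 4, plus an RPKM file with the gene ID
+# in column 1 and its expression value in column 4 (pass NONE to skip expression lookup).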
+# Written by: Wendy Lee +# Written on: August 7th, 2008 + +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +import sys +import optparse +import string +from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB +from cistematic.core.geneinfo import geneinfoDB + +print "%prog: version 4.5" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--cache", action="store_true", dest="cachePages") + parser.add_option("--withoutsense", action="store_false", dest="withSense") + parser.add_option("--flank", type="int", dest="flankBP") + parser.set_defaults(doCache=False, withSense=True, flankBP=0) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(2) + + genome = args[0] + infilename = args[1] + rpkmfilename = args[2] + outfilename = args [3] + + writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP) + + +def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0): + + outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP) + outfile = open(outfilename, "w") + + for outputLine in outList: + outfile.write("%s\n" % outputLine) + + outfile.close() + + +def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=True, flankBP=0): + + rpkmDict = {} + rpkmField = 3 + if rpkmfilename != "NONE": + rpkmfile = open(rpkmfilename, "r") + for line in rpkmfile: + lineFields = line.split() + rpkmDict[lineFields[0]] = lineFields[rpkmField] + + rpkmfile.close() + + infile = open(infilename) + snpPositionList = [] + snpDict = {} + + for line in infile: + if doNotProcessLine(line): + continue + + fields = line.split("\t") + chrom = fields[2][3:] + start = int(fields[3]) + chromosomePosition = (chrom, start) + snpPositionList.append(chromosomePosition) + snpDict[chromosomePosition] = line + + if doCache: + cacheGeneDB(genome) + idb = geneinfoDB(cache=True) + print "cached %s" % genome + else: + idb = geneinfoDB() + + geneinfoDict = idb.getallGeneInfo(genome) + geneDict = {} + + if flankBP > 0: + matchingGenesDict = genesIntersecting(genome, snpPositionList, flank=flankBP) + else: + matchingGenesDict = genesIntersecting(genome, snpPositionList) + + for pos in matchingGenesDict: + geneID = matchingGenesDict[pos][0][0] + try: + symbol = geneinfoDict[geneID][0][0] + except: + symbol = "LOC%s" % geneID + + geneDescriptor = (symbol, geneID) + if geneDict.has_key(geneDescriptor): + geneDict[geneDescriptor]["position"].append(pos) + else: + geneDict[geneDescriptor] = {"position": [pos], + "sense": matchingGenesDict[pos][0][-1]} + + if doCache: + uncacheGeneDB(genome) + + return getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense) + + +def doNotProcessLine(line): + return line[0] == "#" + + +def getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense): + snpGeneOutputList = [] + snpGeneInfoList = getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense) + + for snpEntry in snpGeneInfoList: + outputItems = [snpEntry["snpDescription"], snpEntry["symbol"], snpEntry["geneID"], snpEntry["rpkm"]] + if withSense: + outputItems.append(snpEntry["sense"]) + + line = string.join(outputItems, "\t") + snpGeneOutputList.append(line) + + snpGeneOutputList.sort(reverse=True) + + return 
snpGeneOutputList + + +def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense): + + snpGeneInfoList = [] + + for geneDescriptor in geneDict.keys(): + alreadyDoneList = [] + (symbol, geneID) = geneDescriptor + genePositionList = geneDict[geneDescriptor]["position"] + genePositionList.sort() + + for position in genePositionList: + if snpDict[position] in alreadyDoneList: + continue + + snpGeneInfoDict = {"symbol": symbol, + "geneID": geneID} + + rpkm = "N\A" + if rpkmDict.has_key(geneID): + rpkm = str(rpkmDict[geneID]) + + snpGeneInfoDict["rpkm"] = rpkm + snpGeneInfoDict["snpDescription"] = snpDict[position][:-1] + if withSense: + snpGeneInfoDict["sense"] = geneDict[geneDescriptor]["sense"] + + alreadyDoneList.append(snpDict[position]) + snpGeneInfoList.append(snpGeneInfoDict) + + snpGeneInfoList.sort(reverse=True) + + return snpGeneInfoList + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getSNPs.py b/getSNPs.py new file mode 100755 index 0000000..0adde42 --- /dev/null +++ b/getSNPs.py @@ -0,0 +1,216 @@ +# +# getSNPs.py +# ENRAGE +# +# Originally written by: Wendy Lee +# Last modified: May 11th, 2009 by Ali Mortazavi + +""" + Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and and totalRatio (Cl). + For each mismatch, choose the base change that occur most frequently (ie: has the highest number + of independent reads) + Threshold of Sl and Cl are from user input + Sl = # of independent reads supporting a base change at position S + Cl = total # of all reads supporting a base change at position S / # of all # reads that pass through position S + + usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where + + uniqStartMin = # of independent reads supporting a base change at position S + totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S +""" + +import sys, optparse +from commoncode import readDataset, writeLog + +print "%prog: version 3.5" + +try: + import psyco + psyco.full() +except: + print "psyco is not running" + pass + +def usage(): + print __doc__ + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = __doc__ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--nosplices", action="store_false", dest="doSplices") + parser.add_option("--enforceChr", action="store_true", dest="forceChr") + parser.add_option("--cache", type="int", dest="cachePages") + parser.set_defaults(doSplices=True, forceChr=False, cachePages=0) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + usage() + sys.exit(2) + + hitfile = args[0] + uniqStartMin = float(args[1]) + totalRatioMin = float(args[2]) + outfilename = args[3] + + if options.cachePages > 0: + doCache = True + else: + doCache = False + + writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr) + + +def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False): + writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin)) + + outfile = open(outfilename, "w") + header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange" + outfile.write(header + "\n") + + snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, 
doSplices, forceChr) + for snpEntry in snpPropertiesList: + outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry + print outline + outfile.write(outline + "\n") + outfile.flush() + + outfile.close() + + writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList)) + + +def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False): + + hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + if cachePages > 20000: + hitRDS.setDBcache(cachePages) + + snpPropertiesList = [] + readLength = hitRDS.getReadSize() + chromList = hitRDS.getChromosomes() + + for chrom in chromList: + if doNotProcessChromosome(forceChr, chrom): + continue + + matchDict = getMatchDict(hitRDS, chrom, doSplices) + print "got match dict for %s " % chrom + mismatchDict = getMismatchDict(hitRDS, chrom, doSplices) + print "got mismatch dict for %s " % chrom + mismatchPositions = mismatchDict.keys() + mismatchPositions.sort() + for position in mismatchPositions: + totalCount = mismatchDict[position]["totalCount"] + uniqBaseDict = mismatchDict[position]["uniqBaseDict"] + totalBaseDict = mismatchDict[position]["totalBaseDict"] + highestCount = 0 + highestBaseChange = "N-N" + highestTotalCount = 0 + for baseChange in uniqBaseDict: + if totalBaseDict[baseChange] > highestTotalCount: + highestBaseChange = baseChange + highestCount = uniqBaseDict[baseChange] + highestTotalCount = totalBaseDict[baseChange] + + Cl = 0. + matchCount = 0 + if highestCount >= uniqStartMin: + for matchpos in xrange(position - readLength + 1, position + 1): + try: + matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop]) + except: + pass + + matchCount -= totalCount + if matchCount < 0: + matchCount = 0 + + Sl = highestCount/float(readLength) + Cl = highestTotalCount/float(highestTotalCount + matchCount) + if Cl >= totalRatioMin: + snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange) + snpPropertiesList.append(snpProperties) + + return snpPropertiesList + + +def doNotProcessChromosome(forceChr, chromosome): + if forceChr: + if chromosome[:3] != "chr": + return True + else: + return False + + +def getMatchDict(rds, chrom, withSplices=True): + spliceDict = {} + readDict = {} + finalDict = {} + + try: + readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom) + except: + readDict[chrom] = [] + + for (start, stop) in readDict[chrom]: + if finalDict.has_key(start): + finalDict[start].append(stop) + else: + finalDict[start] = [stop] + + if withSplices: + try: + spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True) + except: + spliceDict[chrom] = [] + + for (start, stop) in spliceDict[chrom]: + if finalDict.has_key(start): + finalDict[start].append(stop) + else: + finalDict[start] = [stop] + + return finalDict + + +def getMismatchDict(rds, chrom, withSplices=True): + mismatchDict = {} + spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices) + for (start, change_at, change_base, change_from) in spliceDict[chrom]: + change = "%s-%s" % (change_base, change_from) + uniqueReadCount = 1 + totalCount = 1 + back = "%s:%s" % (str(start), change) + uniqBaseDict = {change: 1} + totalBaseDict = {change: 1} + if mismatchDict.has_key(change_at): + (uniqueReadCount, totalCount, back, uniqBaseDict, totalBaseDict) = mismatchDict[change_at] + pos = "%s:%s" % (str(start), change) + totalCount += 1 + if totalBaseDict.has_key(change): + 
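+        # totalBaseDict counts every read supporting this base change (the numerator of Cl),
+        # while uniqBaseDict below is only incremented for read start positions not already
+        # recorded in 'back', i.e. independent reads (the numerator of Sl)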
totalBaseDict[change] += 1 + + if pos not in back: + uniqueReadCount += 1 + if uniqBaseDict.has_key(change): + uniqBaseDict[change] += 1 # dict contains total unique read counts + + back = "%s,%s" % (back, pos) + + mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount, + "totalCount": totalCount, + "back": back, + "uniqBaseDict": uniqBaseDict, + "totalBaseDict": totalBaseDict + } + + return mismatchDict + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getallNRSE.py b/getallNRSE.py new file mode 100755 index 0000000..c2e639f --- /dev/null +++ b/getallNRSE.py @@ -0,0 +1,354 @@ +import sys, optparse + +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +from cistematic.core import complement +from cistematic.core.motif import Motif +from cistematic.genomes import Genome +from commoncode import readDataset, getMergedRegions, findPeak +from pylab import * +import matplotlib + +print '%s: version 3.4' % sys.argv[0] + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s genome regionfile siteOutfile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--dataset", dest="chipfilename") + parser.add_option("--min", type="float", dest="minHeight") + parser.add_option("--minfraction", type="float", dest="minFraction") + parser.add_option("--plot", dest="plotname") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--raw", action="store_false", dest="normalize") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.add_option("--markov1", action="store_true", dest="doMarkov1") + parser.add_option("--peakdist", type="int", dest="maxpeakdist") + parser.add_option("--fullOnly", action="store_true", dest="fullOnly") + parser.add_option("--motifdir", dest="motifDir") + parser.set_defaults(chipfilename="", minHeight=-2., minFraction=-2., plotname="", + doCache=False, normalize=True, doVerbose=False, doMarkov1=False, + maxpeakdist=None, fullOnly=False, motifDir="./") + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genome = argv[0] + infilename = args[1] + outfilename = args[2] + + getallNRSE(genome, infilename, outfilename, options.chipfilename, + options.minHeight, options.minFraction, options.plotname, + options.doCache, options.normalize, options.doVerbose, + options.doMarkov1, options.maxpeakdist, options.fullOnly, + options.motifDir) + + +def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2., + minFraction=-2., plotname="", doCache=False, normalize=True, + doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False, + motifDir="./"): + + doPlot = False + if plotname: + matplotlib.use("Agg") + doPlot = True + + if motifDir[-1] != "/": + motifDir += "/" + + doDataset = False + normalizeBy = 1 + if chipfilename: + hitRDS = readDataset(chipfilename, verbose=doVerbose, cache=doCache) + doDataset = True + if normalize: + normalizeBy = len(hitRDS) / 1000000. + + if minFraction > 1.: + minFraction /= 100. 
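+        # --minfraction values above 1 are treated as percentages of the peak height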
+ print "scaling minFraction to %.2f" % minFraction + + if maxpeakdist is not None: + enforcePeakDist = True + else: + enforcePeakDist = False + maxpeakdist = 101 + + mot = Motif("", motifFile="%sNRSE3.mot" % motifDir) + motL = Motif("", motifFile="%sNRSE3left.mot" % motifDir) + motR = Motif("", motifFile="%sNRSE3right.mot" % motifDir) + bestScore = mot.bestConsensusScore() + bestLeft = motL.bestConsensusScore() + bestRight = motR.bestConsensusScore() + + hg = Genome(genome) + + regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False) + + outfile = open(outfilename, "w") + outfile.write("#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n" % (chipfilename, infilename, normalize, doMarkov1)) + outfile.write("#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n" % (enforcePeakDist, maxpeakdist, fullOnly)) + outfile.write("#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n") + + index = 0 + regionList = [] + + for rchrom in regions: + if "rand" in rchrom or "M" in rchrom or "hap" in rchrom: + continue + + for (start, stop, length) in regions[rchrom]: + regionList.append((rchrom, start, length)) + + notFoundIndex = 0 + currentChrom = "" + for (rchrom, start, length) in regionList: + seq = hg.sequence(rchrom, start, length) + if doDataset: + if rchrom != currentChrom: + fullchrom = "chr" + rchrom + hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True) + currentChrom = rchrom + + (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True) + if len(topPos) == 0: + print "topPos error" + + peakpos = topPos[0] + peakscore = smoothArray[peakpos] + if peakscore == 0.: + peakscore = -1. + + if normalize: + numHits /= normalizeBy + peakscore /= normalizeBy + else: + peakpos = length + peakscore = -1 + numHits = 0 + smoothArray = [0.] * length + + found = [] + if doMarkov1: + lefts = motL.locateMarkov1(seq, 3.) + rights = motR.locateMarkov1(seq, 3.) + else: + lefts = motL.locateMotif(seq, 70) + rights = motR.locateMotif(seq, 70) + + allhalfs = [(v0, v1, "L") for (v0, v1) in lefts] + [(v0, v1, "R") for (v0, v1) in rights] + allhalfs.sort() + + # look for canonicals and non-canonicals + if len(allhalfs) > 1: + (firstpos, firstsense, firsttype) = allhalfs[0] + for (secondpos, secondsense, secondtype) in allhalfs[1:]: + if enforcePeakDist: + withinDistance = False + for aPos in topPos: + if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist: + withinDistance = True + if not withinDistance: + firstpos = secondpos + firstsense = secondsense + firsttype = secondtype + continue + + if firsttype == "L": + dist = secondpos - firstpos + 2 + else: + dist = secondpos - firstpos -1 + + if firstsense == secondsense and dist in [9, 10, 11, 16, 17, 18, 19]: + if (firsttype == "L" and secondtype == "R" and secondsense == "F"): + found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist)) + + if (firsttype == "R" and secondtype == "L" and secondsense == "R"): + found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist)) + + firstpos = secondpos + firstsense = secondsense + firsttype = secondtype + + # did we miss any 70%+ matches ? 
+ if doMarkov1: + matches = mot.locateMarkov1(seq, 3.5) + else: + matches = mot.locateMotif(seq, 70) + + for (pos, sense) in matches: + alreadyFound = False + for (fpos, fpeakdist, fdist) in found: + if pos + start == fpos: + alreadyFound = True + + if not alreadyFound: + if enforcePeakDist: + withinDistance = False + for aPos in topPos: + if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist: + withinDistance = True + thePos = aPos + + if withinDistance: + found.append((start + pos, pos - thePos + 10, 11)) + + else: + found.append((start + pos, pos - peakpos + 10, 11)) + + # we'll now accept half-sites within maxpeakdist bp of peak if using a dataset, else all + if len(found) == 0 and not fullOnly: + bestone = -1 + if not doDataset: + bestdist = maxpeakdist + else: + bestdist = length + + index = 0 + for (pos, sense, type) in allhalfs: + if doDataset: + for aPos in topPos: + if abs(pos - aPos) < bestdist: + bestdist = abs(pos - aPos) + bestone = index + peakpos = aPos + else: + found.append((start + allhalfs[index][0], allhalfs[index][0] + 5 - peakpos, 0)) + + index += 1 + + if (doDataset and bestdist < 101): + try: + found.append((start + allhalfs[bestone][0], allhalfs[bestone][0] + 5 - peakpos, 0)) + except: + continue + + # see if we found an acceptable match + foundValue = False + for (foundpos, posdist, dist) in found: + # get a score for 21-mer, report + seq = hg.sequence(rchrom, foundpos, 21) + # height will be measured from the center of the motif + height = -2. + for pos in range(10 + dist): + try: + currentHeight = smoothArray[int(peakpos + posdist + pos)] + except: + pass + + if currentHeight > height: + height = currentHeight + + if normalize: + height /= normalizeBy + + fractionHeight = height / peakscore + if height < minHeight or fractionHeight < minFraction: + continue + + foundValue = True + (front, back) = mot.scoreMotif(seq) + sense = "+" + if front > back: + score = int(100 * front / bestScore) + theseq = hg.sequence(rchrom, foundpos, 10 + dist) + else: + score = int(100 * back / bestScore) + theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist)) + sense = "-" + foundpos + 1 + + leftScore = -1. + rightScore = -1. 
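+            # score the left and right halves of the site separately against their own
+            # matrices; for non-canonical spacings (dist != 11) the better-scoring half
+            # decides the reported strand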
+ leftseq = "" + rightseq = "" + if dist > 0: + testseq = hg.sequence(rchrom, foundpos, 10 + dist) + if sense == "-": + testseq = complement(testseq) + + leftseq = testseq[:9] + rightseq = testseq[dist-2:] + elif dist == 0: + testseq = hg.sequence(rchrom, foundpos, 12) + if sense == "-": + testseq = complement(testseq) + leftseq = testseq[3:] + else: + leftseq = testseq[:9] + + rightseq = testseq + + (lfront, lback) = motL.scoreMotif(leftseq) + (rfront, rback) = motR.scoreMotif(rightseq) + if lfront > lback: + leftScore = int(100 * lfront) / bestLeft + leftSense = "+" + else: + leftScore = int(100 * lback) / bestLeft + leftSense = "-" + + if rfront > rback: + rightScore = int(100 * rfront) / bestRight + rightSense = "+" + else: + rightScore = int(100 * rback) / bestRight + rightSense = "-" + + if dist != 11: + if rightScore > leftScore: + sense = rightSense + else: + sense = leftSense + + if sense == "-" and dist > 0: + theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist)) + + outline = "chr%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%.2f\tchr%s:%d-%d\t%s\t%s" % (rchrom, foundpos, foundpos + 9 + dist, score, leftScore, rightScore, numHits, posdist, dist, height, fractionHeight, rchrom, start, start + length, sense, theseq) + if doVerbose: + print outline + + outfile.write(outline + "/n") + + # we didn't find a site - draw region + if not foundValue and doVerbose: + outline = "#no predictions for %s:%d-%d %d %.2f" % (rchrom, start, start + length, numHits, peakscore) + print outline + outfile.write(outline + "\n") + + if not foundValue and doPlot: + drawarray = [val + notFoundIndex for val in smoothArray] + drawpos = [drawarray[val] for val in topPos] + plot(drawarray, "b") + plot(topPos, drawpos, "r.") + goodmatches = mot.locateMotif(seq, 75) + if len(goodmatches) > 0: + print topPos + print goodmatches + drawgood = [] + drawgoody = [] + for (mstart, sense) in goodmatches: + drawgood.append(mstart) + drawgoody.append(drawarray[mstart]) + + plot(drawgood, drawgoody, "g.") + + notFoundIndex -= 30 + + outfile.close() + if doPlot: + savefig(plotname) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getallgenes.py b/getallgenes.py new file mode 100755 index 0000000..addba36 --- /dev/null +++ b/getallgenes.py @@ -0,0 +1,301 @@ +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +import sys, optparse +from cistematic.core import genesIntersecting, featuresIntersecting, cacheGeneDB, uncacheGeneDB +from cistematic.core.geneinfo import geneinfoDB +from cistematic.genomes import Genome + +print "%prog: version 5.5" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--radius", type="int", dest="maxRadius") + parser.add_option("--nomatch", dest="nomatchfilename") + parser.add_option("--trackfar", action="store_true", dest="trackFar") + parser.add_option("--stranded", action="store_true", dest="trackStrand") + parser.add_option("--cache", action="store_true", dest="cachePages") + parser.add_option("--compact", action="store_true", dest="compact") + parser.add_option("--step", type="int", dest="step") + parser.add_option("--startField", type="int", dest="colID") + parser.add_option("--models", dest="extendGenome") + parser.add_option("--replacemodels", action="store_true", 
dest="replaceModels") + parser.set_defaults(maxRadius=20002, nomatchfilename="", step=None, trackFar=False, + trackStrand=False, compact=False, colID=1, doCache=False, + extendGenome="", replaceModels=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(2) + + genome = args[0] + infilename = args[1] + outfilename = args[2] + + getallgenes(genome, infilename, outfilename, options.maxRadius, + options.nomatchfilename, options.step, options.trackFar, + options.trackStrand, options.compact, options.colID, + options.doCache, options.extendgenome, options.replaceModels) + + +def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="", + step=None, trackFar=False, trackStrand=False, compact=False, colID=1, + doCache=False, extendGenome="", replaceModels=False): + + if doCache: + idb = geneinfoDB(cache=True) + else: + idb = geneinfoDB() + + if not step: + step = maxRadius - 2 + + if extendGenome and replaceModels: + replaceModels = True + else: + replaceModels = False + + infile = open(infilename) + outfile = open(outfilename,"w") + + if genome == "dmelanogaster": + geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus") + else: + geneinfoDict = idb.getallGeneInfo(genome) + + posList = [] + altPosDict = {} + altPosRevDict = {} + posLine = {} + posStrand = {} + altPosList = [] + + for line in infile: + if line[0] == "#": + continue + + fields = line.split("\t") + if compact: + (chrom, pos) = fields[colID].split(":") + chrom = chrom[3:] + (start, stop) = pos.split("-") + pos = (chrom, int(start)) + altPos = (chrom, int(stop)) + else: + try: + chrom = fields[colID][3:] + except: + print line + continue + + pos = (chrom, int(fields[colID + 1])) + altPos = (chrom, int(fields[colID + 2])) + + altPosDict[pos] = altPos + altPosRevDict[altPos] = pos + posList.append(pos) + posList.append(altPos) + altPosList.append(altPos) + posLine[pos] = line + if trackStrand: + if "RNAFARP" in line: + posStrand[pos] = "+" + posStrand[altPos] = "+" + else: + posStrand[pos] = "-" + posStrand[altPos] = "-" + + geneList = [] + geneDict = {} + if maxRadius < step: + step = maxRadius - 2 + + hg = Genome(genome, inRAM=True) + if extendGenome != "": + hg.extendFeatures(extendGenome, replace = replaceModels) + + geneannotDict = hg.allAnnotInfo() + + for radius in range(1, maxRadius, step): + print "radius %d" % radius + print len(posList) + if radius == 1: + posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels) + else: + posDict = featuresIntersecting(genome, posList, radius, "CDS", extendGen=extendGenome, replaceMod=replaceModels) + posDict2 = featuresIntersecting(genome, posList, radius, "UTR", extendGen=extendGenome, replaceMod=replaceModels) + for apos in posDict2: + try: + posDict[apos] += posDict2[apos] + posDict[apos].sort() + except: + posDict[apos] = posDict2[apos] + + for pos in posDict: + geneID = "" + if len(posDict[pos]) == 1: + if trackStrand: + if posStrand[pos] == posDict[pos][0][-1]: + geneID = posDict[pos][0][0] + else: + geneID = posDict[pos][0][0] + elif len(posDict[pos]) > 1 and not trackStrand: + (chrom, loc) = pos + bestres = posDict[pos][0] + dist1 = abs(bestres[3] - loc) + dist2 = abs(bestres[4] - loc) + if dist1 < dist2: + bestdist = dist1 + else: + bestdist = dist2 + + for testres in posDict[pos]: + testdist1 = abs(testres[3] - loc) + testdist2 = abs(testres[4] - loc) + if testdist1 < testdist2: + testdist = testdist1 + else: + testdist = testdist2 + + if testdist < bestdist: + bestdist 
= testdist + bestres = testres + + geneID = bestres[0] + elif len(posDict[pos]) > 1: + (chrom, loc) = pos + bestres = posDict[pos][0] + dist1 = abs(bestres[3] - loc) + dist2 = abs(bestres[4] - loc) + bestStrand = posDict[pos][-1] + if dist1 < dist2: + bestdist = dist1 + else: + bestdist = dist2 + + for testres in posDict[pos]: + testdist1 = abs(testres[3] - loc) + testdist2 = abs(testres[4] - loc) + testStrand = testres[-1] + if testdist1 < testdist2: + testdist = testdist1 + else: + testdist = testdist2 + + if bestStrand != posStrand[pos] and testStrand == posStrand[pos]: + bestdist = testdist + bestres = testres + bestStrand = testStrand + elif testdist < bestdist: + bestdist = testdist + bestres = testres + + if bestStrand == posStrand[pos]: + geneID = bestres[0] + + if geneID != "": + try: + if genome == "dmelanogaster": + symbol = geneinfoDict["Dmel_" + geneID][0][0] + else: + symbol = geneinfoDict[geneID][0][0] + except: + try: + symbol = geneannotDict[(genome, geneID)][0] + except: + symbol = "LOC" + geneID + else: + continue + + if pos in altPosList and pos in posList: + posList.remove(pos) + if pos not in altPosRevDict: + continue + + if altPosRevDict[pos] in posList: + posList.remove(altPosRevDict[pos]) + + pos = altPosRevDict[pos] + elif pos in posList: + posList.remove(pos) + if pos not in altPosDict: + print pos + continue + + if altPosDict[pos] in posList: + posList.remove(altPosDict[pos]) + else: + continue + + if (symbol, geneID) not in geneList: + geneList.append((symbol, geneID)) + geneDict[(symbol, geneID)] = [] + + if pos not in geneDict[(symbol, geneID)]: + geneDict[(symbol, geneID)].append(pos) + + for (symbol, geneID) in geneList: + geneDict[(symbol, geneID)].sort() + seenLine = [] + for pos in geneDict[(symbol, geneID)]: + if pos in altPosRevDict: + pos = altPosRevDict[pos] + + if posLine[pos] in seenLine: + continue + + if "\t" in symbol: + symbol = symbol.replace("\t","|") + + if " " in symbol: + symbol = symbol.replace(" ","_") + + line = "%s %s %s" % (symbol, geneID, posLine[pos]) + seenLine.append(posLine[pos]) + outfile.write(line) + + matchIndex = 0 + if nomatchfilename != "": + nomatchfile = open(nomatchfilename, "w") + + prevStart = 0 + prevChrom = "" + farIndex = 0 + start = 0 + for pos in posList: + if pos not in altPosList: + if nomatchfilename != "": + nomatchfile.write(posLine[pos]) + + matchIndex += 1 + # need to add strand tracking here..... 
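+            # with --trackfar, unmatched positions are labelled FARn: the index advances on a
+            # chromosome change or when the gap to the previous unmatched position exceeds
+            # maxRadius, so nearby orphan sites share one FAR identifier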
+ if trackFar: + (chrom, start) = pos + if chrom != prevChrom: + farIndex += 1 + prevChrom = chrom + elif abs(int(start) - prevStart) > maxRadius: + farIndex += 1 + + line = "FAR%d %d %s" % (farIndex, -1 * farIndex, posLine[pos]) + outfile.write(line) + prevStart = int(start) + + if nomatchfilename != "": + nomatchfile.close() + + print "%d sites without a gene within radius of %d" % (matchIndex, radius) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getallsites.py b/getallsites.py new file mode 100755 index 0000000..39335e4 --- /dev/null +++ b/getallsites.py @@ -0,0 +1,209 @@ +import sys, optparse +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +from cistematic.core.motif import Motif, hasMotifExtension +from cistematic.core import complement +from cistematic.genomes import Genome +from commoncode import readDataset, getMergedRegions, findPeak + +print "%prog: version 2.4" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--dataset", dest="chipfilename") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--best", action="store_true", dest="bestOnly", + help="only report the best position for each region") + parser.add_option("--usepeak", action="store_true", dest="usePeak", + help="use peak position and height from regions file") + parser.add_option("--printseq", action="store_true", dest="printSeq") + parser.add_option("--nomerge", action="store_true", dest="noMerge") + parser.add_option("--markov1", action="store_true", dest="doMarkov1") + parser.add_option("--rank", type="int", dest="useRank", + help="return region ranking based on peak height ranking [requires --usepeak]") + parser.set_defaults(chipfilename="", doCache=False, bestOnly=False, usePeak=False, + printSeq=False, doMarkov1=False, useRank=False, noMerge=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 5: + print usage + sys.exit(1) + + genome = args[0] + motfilename = args[1] + motThreshold = float(args[2]) + infilename = args[3] + outfilename = args[4] + + getallsites(genome, motfilename, motThreshold, infilename, outfilename, options.chipfilename, + options.doCache, options.bestOnly, options.usePeak, options.printSeq, options.doMarkov1, + options.useRank, options.noMerge) + + +def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="", + doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False, + useRank=False, noMerge=False): + + if motThreshold < 1.0 and doMarkov1: + print "motThreshold should be between 1.0 and 10.0 for markov1" + sys.exit(1) + elif motThreshold < 55.0 and not doMarkov1: + print "motThreshold should be between 55 and 99 for a regular PSFM" + sys.exit(1) + + if hasMotifExtension: + print "will use cistematic.core.motif C-extension to speed up motif search" + + if useRank and usePeak: + print "will return region ranking based on peak height ranking" + useRank = True + else: + print "ignoring '-rank': can only use ranking when using a region file with peak position and height" + useRank = False + + mot = Motif("", motifFile=motfilename) + motLen = len(mot) + bestScore = mot.bestConsensusScore() + + hg = Genome(genome) + + # minHits=-1 will force regions to be used regardless + # maxDist= 0 prevents merging of non-overlapping regions + if noMerge: + 
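+        # keepPeak=usePeak carries each region's peak position and height through, so with
+        # --usepeak the peak data from the regions file are used instead of scanning a dataset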
regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak) + else: + regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, keepPeak=usePeak) + + doRDS = False + if chipfilename: + doRDS = True + + if doRDS: + hitRDS = readDataset(chipfilename, verbose = True, cache=doCache) + + outfile = open(outfilename, "w") + + regionList = [] + + for chrom in regions: + if "rand" in chrom or "M" in chrom: + continue + + if usePeak: + for (start, stop, length, peakPos, peakHeight) in regions[chrom]: + regionList.append((peakHeight, chrom, start, length, peakPos)) + else: + for (start, stop, length) in regions[chrom]: + regionList.append((chrom, start, length)) + + if usePeak: + regionList.sort() + regionList.reverse() + + notFoundIndex = 0 + currentChrom = "" + count = 0 + for tuple in regionList: + if usePeak: + (rpeakheight, rchrom, start, length, rpeakpos) = tuple + else: + (rchrom, start, length) = tuple + + try: + seq = hg.sequence(rchrom, start, length) + except: + print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length) + continue + + count += 1 + numHits = -1 + if usePeak: + peakpos = rpeakpos + if useRank: + numHits = count + else: + numHits = rpeakheight + elif doRDS: + if rchrom != currentChrom: + fullchrom = "chr" + rchrom + hitDict = hitRDS.getReadsDict(chrom=fullchrom) + currentChrom = rchrom + + (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length) + if len(topPos) == 0: + print "topPos error" + + peakpos = topPos[0] + + found = [] + if doMarkov1: + matches = mot.locateMarkov1(seq, motThreshold) + else: + matches = mot.locateMotif(seq, motThreshold) + + for (pos, sense) in matches: + alreadyFound = False + for (fpos, fdist) in found: + if pos + start == fpos: + alreadyFound = True + + if not alreadyFound: + if usePeak: + found.append((start + pos, start + pos + motLen/2 - peakpos)) + elif doRDS: + found.append((start + pos, pos + motLen/2 - peakpos)) + else: + found.append((start + pos, -1)) + + foundValue = False + bestList = [] + for (foundpos, peakdist) in found: + seq = hg.sequence(rchrom, foundpos, motLen) + foundValue = True + (front, back) = mot.scoreMotif(seq) + sense = "+" + if front >= back: + score = int(100 * front / bestScore) + else: + score = int(100 * back / bestScore) + sense = "-" + seq = complement(seq) + + if printSeq: + print seq + + outline = "chr%s:%d-%d\t%d\t%d\t%d\tchr%s:%d-%d\t%s\n" % (rchrom, foundpos, foundpos + motLen - 1, score, numHits, peakdist, rchrom, start, start + length, sense) + if bestOnly: + bestList.append((abs(peakdist), outline)) + else: + outfile.write(outline) + + if bestOnly and foundValue: + bestList.sort() + outfile.write(bestList[0][1]) + + if not foundValue: + if printSeq: + print "could not find a %s site for %s:%d-%d" % (mot.tagID, rchrom, start, start+ length) + + notFoundIndex += 1 + if (count % 10000) == 0 and not printSeq: + print count + + outfile.close() + print "did not find motif in %d regions" % notFoundIndex + + +if __name__ == "__main__": + main(sys.argv) diff --git a/getfasta.py b/getfasta.py new file mode 100755 index 0000000..0b2faf9 --- /dev/null +++ b/getfasta.py @@ -0,0 +1,183 @@ +# +# getfasta.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset, getMergedRegions, findPeak +from cistematic.genomes import Genome + +print "%s: version 3.4" % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: 
python %s genome regionfile outfilename [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--seqradius", type="int", dest="seqsize") + parser.add_option("--minreads", type="int", dest="minHitThresh") + parser.add_option("--returnTop", type="int", dest="topRegions") + parser.add_option("--maxsize", type="int", dest="maxsize") + parser.add_option("--usepeak", action="store_true", dest="usePeaks") + parser.add_option("--dataset", dest="hitfile") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--compact", action="store_true", dest="doCompact") + parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000, + usePeaks=False, hitfile=None, doCache=False, doCompact=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + genome = args[0] + regionfile = args[1] + outfilename = args[2] + + getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh, + options.topRegions, options.maxsize, options.usePeaks, options.hitFile, + options.doCache, options.doCompact) + + +def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0, + maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False): + doDataset = False + if hitfile is not None: + if usePeaks: + print "ignoring dataset and relying on peak data" + else: + doDataset = True + + if doCompact: + mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True, + chromField=0, compact=True, keepPeak=usePeaks, + returnTop=topRegions) + else: + mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True, + keepPeak=usePeaks, returnTop=topRegions) + + if usePeaks: + ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize) + elif doDataset: + hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize) + else: + ncregions = getDefaultRegion(mergedRegions, maxsize) + + writeFastaFile(ncregions, genome, outfilename, seqsize) + + +def writeFastaFile(ncregions, genome, outfilename, seqsize=50): + hg = Genome(genome) + outfile = open(outfilename, "w") + for chrom in ncregions: + for regionDict in ncregions[chrom]: + rstart = regionDict["start"] + rlen = regionDict["length"] + topPos = regionDict["topPos"] + if topPos[0] >= 0: + newrstart = rstart + topPos[0] - seqsize + newrlen = 2 * seqsize + 1 + else: + newrstart = rstart + newrlen = rlen + + seq2 = hg.sequence(chrom, newrstart, newrlen) + outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2)) + + outfile.close() + + +def getDefaultRegion(regionDict, maxsize): + ncregions = {} + for chrom in regionDict: + ncregions[chrom] = [] + + for achrom in regionDict: + print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) + for region in regionDict[achrom]: + (rstart, rstop, rlen) = region + + if rlen > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + continue + + resultDict = {"start": rstart, + "length": rlen, + "topPos": [-1] + } + ncregions[achrom].append(resultDict) + + return ncregions + + +def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000): + + ncregions = {} + for chrom in regionDict: + ncregions[chrom] = [] + + for achrom in regionDict: + print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) + for region in regionDict[achrom]: + (rstart, 
rstop, rlen, peakPos, peakHeight) = region + + if rlen > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + continue + + topPos = peakPos - rstart + if peakHeight > minHitThresh: + resultDict = {"start": rstart, + "length": rlen, + "topPos": [topPos] + } + ncregions[achrom].append(resultDict) + + return ncregions + + +def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000): + + readlen = hitRDS.getReadSize() + + ncregions = {} + for chrom in regionDict: + ncregions[chrom] = [] + + for achrom in regionDict: + print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) + for region in regionDict[achrom]: + (rstart, rstop, rlen) = region + + if rlen > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + continue + + thechrom = "chr%s" % achrom + print "." + hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop) + print "hitDict length: %d", len(hitDict[thechrom]) + (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen) + if numHits > minHitThresh: + resultDict = {"start": rstart, + "length": rlen, + "topPos": topPos + } + ncregions[achrom].append(resultDict) + + return ncregions + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getgosig.py b/getgosig.py new file mode 100755 index 0000000..b04dca6 --- /dev/null +++ b/getgosig.py @@ -0,0 +1,241 @@ +try: + import psyco + psyco.full() +except: + pass + +from cistematic.genomes import Genome +from math import log +import os.path +import sys +import optparse +import matplotlib +from pylab import * + +print "%prog: version 2.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog genome outimage gofileroot1 title1 cohortsize1 [gofileroot2 title2 cohortsize2 ...] 
[--fontsize pts] [--length in] [--width in]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--fontsize", type="int", dest="fontSize") + parser.add_option("--length", type="int", dest="length") + parser.add_option("--width", type="int", dest="width") + parser.set_defaults(fontSize=5, length=10, width=7) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 5: + print usage + sys.exit(1) + + genome = args[0] + imagename = args[1] + + conditionList = args[2:] + conditions = len(conditionList) / 3 + fileroots = [] + titles = [] + for index in range(conditions): + conditionIndex = index * 3 + fileroots.append(conditionList[conditionIndex]) + titles.append((conditionList[conditionIndex + 1], "(%s)" % conditionList[conditionIndex + 2])) + + getgosig(genome, imagename, fileroots, titles, options.fontSize, options.length, options.width) + + +def getgosig(genome, imagename, fileroots=[], titles=[], fontSize=5, length=10, width=7): + hg = Genome(genome) + allgodesc = hg.allGOterms() + godesc = [] + + matplotlib.use("Agg") + + doGray = False + + rootdir = "./" + htmlname = imagename[:-4] + ".html" + + ceiling = 40.0 + goterms = [] + goscores = {} + numgenes = {} + possiblegenes = {} + flatArray = [] + + highestPval = 0.0 + lowestPval = 1.0 + for sigfile in fileroots: + infile = open(rootdir + sigfile + ".gosig", "r") + for line in infile: + if "depleted" in line: + continue + + fields = line.split("\t") + if fields[0] not in goterms: + goterms.append(fields[0]) + goscores[fields[0]] = [] + numgenes[fields[0]] = [] + possiblegenes[fields[0]] = 0 + + if float(fields[3]) > highestPval: + highestPval = float(fields[3]) + + if float(fields[3]) < lowestPval: + lowestPval = float(fields[3]) + + print highestPval + print lowestPval + + boundaryScore = score = -1 * log(highestPval) / (2.0 * ceiling) + 0.49 + print boundaryScore + + cdict = {"red": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.1), (1.0, 1.0, 1.0)), + "green": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.1), (1.0, 1.0, 1.0)), + "blue": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.75), (1.0, 0.0, 0.0)) + } + + mymap = matplotlib.colors.LinearSegmentedColormap("my_colormap", cdict, 1024) + + goindex = 0 + for zfile in fileroots: + infile = open(rootdir + zfile + ".gozscore", "r") + for line in infile: + fields = line.split() + goindex += 1 + if fields[0] not in goterms: + continue + + score = -1 * log(float(fields[7])) / (2.0 * ceiling) + if score < -0.5: + score = -0.5 + + if score > 0.5: + score = 0.5 + + score += 0.5 + if doGray: + score = 1 - score + + goscores[fields[0]].append(score) + numgenes[fields[0]].append(fields[1]) + possiblegenes[fields[0]] = int(fields[4]) + + goindex /= len(fileroots) + + gokeys = goscores.keys() + gosortarray = [] + for term in gokeys: + gosortarray.append(goscores[term] + [term]) + + gosortarray.sort() + + htmlfile = open(htmlname, "w") + htmlfile.write('GO Analysis') + htmlfile.write("") + for entry in titles: + htmlfile.write("" % entry) + + htmlfile.write("\n") + tableLines = [] + + for entry in gosortarray: + term = entry[-1] + outline = "%s:\t" % term + for entry in goscores[term]: + outline += str(round(entry, 4)) + "\t" + + print outline + htmlLine = "" % (allgodesc[term], possiblegenes[term]) + index = 0 + for fileroot in fileroots: + gofile = fileroot + "." 
+ term[3:] + ngene = numgenes[term][index] + if os.path.exists(gofile): + htmlLine += '' % (gofile, ngene) + else: + htmlLine += "" % (ngene) + + index += 1 + + tableLines.append(htmlLine + "\n") + flatArray.append(goscores[term]) + godesc.append(allgodesc[term]) + + tableLines.reverse() + for line in tableLines: + htmlfile.write(line) + + htmlfile.write("") + htmlfile.write("\n") + htmlfile.write("
Descriptionpossible%s
%s
%s%d%s%s
Cohort Size:
") + + figure(figsize=(length, width)) + myaxe = axes([0.3, 0.1, 0.55, 0.75]) + + Z = array(flatArray) + print Z.shape + if doGray: + c = pcolor(Z, cmap=cm.gray, vmin=0.0, vmax=1.0) + else: + c = pcolor(Z, cmap=mymap, vmin=0.0, vmax=1.0) + + c.set_linewidth(0.1) + clim(0.0, 1.0) + + ind = arange(len(fileroots)) + width = 0.5 + + coordy = 0.1 + deltaX = 1.0 + deltaY = 1.0 + + pcolorAxes = c.get_axes() + for entry in gosortarray: + term = entry[-1] + coordx = 0.4 + for genenum in numgenes[term]: + if len(genenum) == 1: + genenum = " " + genenum + elif len(genenum) == 2: + genenum = " " + genenum + + pcolorAxes.text(coordx, coordy, genenum, fontsize=fontSize) + coordx += deltaX + + coordy += deltaY + + coordx = 0 + for (line1,line2) in titles: + pcolorAxes.text(coordx + 0.1, coordy + 3 * deltaY + 0.5, line1, fontsize=int(fontSize*1.5)) + pcolorAxes.text(coordx + 0.1, coordy + deltaY, line2, fontsize=int(fontSize*1.5)) + coordx += deltaX + + setp(gca(), "xticks", []) + setp(gca(), "xticklabels", []) + setp(gca(), "yticks", arange(len(godesc))) + setp(gca(), "yticklabels", godesc) + locs, labels = yticks() + setp(labels, fontsize=fontSize) + setp(labels, verticalalignment="bottom") + setp(gca(), "ylim", [0, len(godesc)]) + + figtext(0.3,0.02, str(goindex - len(gokeys)) + " additional GO Terms below threshold of significance", fontsize=fontSize*2) + + d = colorbar(orientation="vertical", drawedges=False) + for t in d.ax.get_yticklabels(): + t.set_fontsize(0) + + locs, labels = yticks() + setp(labels, fontsize=5) + pcolorAxes.text(conditions + 1,len(godesc), str(lowestPval), fontsize=fontSize*2) + pcolorAxes.text(conditions + 1,boundaryScore * len(godesc), str(highestPval), fontsize=fontSize*2) + + savefig(imagename, dpi=250) + show() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getmers.py b/getmers.py new file mode 100755 index 0000000..c7c35ff --- /dev/null +++ b/getmers.py @@ -0,0 +1,52 @@ +# +# getmers.py +# ENRAGE +# + +import sys +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +from cistematic.genomes import Genome + +def main(argv=None): + if not argv: + argv = sys.argv + + print '%s: version 1.1' % argv[0] + + if len(sys.argv) < 5: + print 'usage: python %s genome merlen chrAny:start-stop outfile' % argv[0] + exit(1) + + genome = argv[1] + merlen = int(argv[2]) + location = argv[3] + outfilename = argv[4] + + getmers(genome, merlen, location, outfilename) + + +def getmers(genome, merlen, location, outfilename): + (chrom, pos) = location.split(':') + chrom = chrom[3:] + (start, stop) = pos.split('-') + start = int(start) + regionlength = int(stop) - start + 1 + + hg = Genome(genome) + + seq = hg.sequence(chrom, start, regionlength) + + outfile = open(outfilename,'w') + print 'writing %d %d-mers' % (regionlength - merlen, merlen) + for index in range(regionlength - merlen): + outfile.write(seq[index:index + merlen].upper() + '\n') + + outfile.close() + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/getsplicefa.py b/getsplicefa.py new file mode 100755 index 0000000..db8e204 --- /dev/null +++ b/getsplicefa.py @@ -0,0 +1,158 @@ +import sys +import optparse +import string +try: + import psyco + psyco.full() +except: + print "psyco not running" +from cistematic.core import complement +from cistematic.genomes import Genome + + +def main(argv=None): + if not argv: + argv = sys.argv + + verstring = "%prog: version 1.0" + print verstring + delimiter = "|" + + usage = "usage: python %prog 
genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\ + \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\ + \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--verbose", action="store_true", dest="doVerbose", + help="show verbose messages [default: False]") + parser.add_option("--spacer", type="int", dest="spacer", + help="number of spacer NTs to use [default: 2") + parser.set_defaults(doVerbose=False, spacer=2) + (options, args) = parser.parse_args(argv[1:]) + + try: + genome = args[0] + datafilename = args[1] + outfilename = args[2] + maxBorder = args[3] + except IndexError: + print usage + sys.exit(1) + + getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter) + + +def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"): + spacerseq = "N" * spacer + + datafile = open(datafilename) + hg = Genome(genome) + + spliceCountDict = {} + exonStartDict = {} + exonStopDict = {} + exonLengthDict = {} + nameToChromDict = {} + nameToComplementDict = {} + alreadySeen = {} + counter = 0 + + for line in datafile: + fields = line.split() + name = fields[0] + spliceCount = int(fields[7]) - 1 + if spliceCount < 1: + continue + + counter += spliceCount + spliceCountDict[name] = spliceCount + chrom = fields[1][3:] + if chrom == "chrM": + continue + + nameToChromDict[name] = chrom + if chrom not in alreadySeen: + alreadySeen[chrom] = [] + + nameToComplementDict[name] = fields[2] + exonStarts = [] + exonStops = [] + for val in fields[8].split(",")[:-1]: + exonStarts.append(int(val)) + + for val in fields[9].split(",")[:-1]: + exonStops.append(int(val)) + + exonStartDict[name] = exonStarts + exonStopDict[name] = exonStops + exonLengths = [] + for index in range(spliceCount + 1): + exonLengths.append(exonStops[index] - exonStarts[index]) + + exonLengthDict[name] = exonLengths + + print len(spliceCountDict) + print counter + + missedCount = 0 + depressedCount = 0 + splicefileindex = 1 + spliceCounter = 0 + outfile = open(outfilename, "w") + for name in nameToChromDict: + try: + spliceCount = spliceCountDict[name] + except: + continue + + exonStarts = exonStartDict[name] + exonStops = exonStopDict[name] + exonLengths = exonLengthDict[name] + chrom = nameToChromDict[name] + for index in range(spliceCount): + if (exonStops[index], exonStarts[index + 1]) in alreadySeen[chrom]: + continue + + regionstart = exonStops[index] - maxBorder + alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1])) + beforeLen = exonLengths[index] + afterLen = exonLengths[index + 1] + if (beforeLen + afterLen) < maxBorder + spacer: + missedCount += 1 + continue + + if (beforeLen + afterLen) < 2 * maxBorder: + depressedCount += 1 + + if beforeLen > maxBorder: + beforeLen = maxBorder + + if afterLen > maxBorder: + afterLen = maxBorder + + try: + beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder, maxBorder) + afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder) + except: + if doVerbose: + print "could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1]) + continue + + sequenceHeader = string.join([name, delimiter, str(index), delimiter, str(regionstart)], "") + spliceJunctionSequence = string.join([spacerseq, beforeSplice.upper(), afterSplice.upper(), spacerseq], "") + outstring = ">%s\n%s\n" % (sequenceHeader, spliceJunctionSequence) + 
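        # Note: the FASTA record written out below uses the header built above as
        # "<name><delimiter><exon index><delimiter><region start>" (default delimiter "|"),
        # e.g. ">uc002gig.1|3|45678901" for a hypothetical UCSC model. The splice-aware
        # importers in this commit (makerdsfromblat.py, makerdsfrombowtie.py) split the
        # alignment target name on the same delimiter to recover (model, spliceID, regionStart),
        # so the delimiter must agree between getsplicefa.py and those scripts.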
outfile.write(outstring) + + splicefileindex += 1 + spliceCounter += 1 + if spliceCounter > 10000: + print "%d genes" % splicefileindex + spliceCounter = 0 + + outfile.close() + + print "%d splices too short to be seen" % missedCount + print "%d splices will be under-reported" % depressedCount + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/gfftocis.py b/gfftocis.py new file mode 100644 index 0000000..9fec165 --- /dev/null +++ b/gfftocis.py @@ -0,0 +1,58 @@ +import sys + +print "%s: version 1.0" % sys.argv[0] +if len(sys.argv) < 3: + print "usage: python %s infile.gff outfile.cis\n" % sys.argv[0] + print "\tTHIS SCRIPT WILL MOST LIKELY NEED TO BE EDITED FOR YOUR GFF FILE\n" + sys.exit(1) + +index = 1 +# Cistematic just want's a use set of exons labeled "CDS", "5UTR", and "3UTR" +# just put the corresponding type in your GFF file as the key in the key:value pairs +# in the ftypeDict below +ftypeDict = {"CDS": "CDS", + "mRNA": "mRNA", + "five_prime_utr": "5UTR", + "three_prime_utr": "3UTR" +} + +chrom = "" +idfields = "" +gene = "" +sense = "" +start = 0 +stop = 0 +ftype = "" + +infile = open(sys.argv[1]) +outfile = open(sys.argv[2], "w") +for line in infile: + if line[0]=="#": + continue + + fields = line.strip().split() + try: + if fields[2] in ftypeDict: + # this part of the code will need to be customized, most likely + # how does the annotation define the gene, geneid, and chromosome + # for example, for Anopheles Gambiae we have + #chrX VectorBase mRNA 582 16387 . - . ID=vectorbase|AGAP000002-RA; stable_id=AGAP000002-RA.1; Parent=vectorbase|AGAP000002; + if fields[2] == "mRNA": + chrom = fields[0][3:] + source = fields[1] + idfields = fields[9].split(";") + geneid = idfields[0].split("=")[1] + sense = fields[6] + else: + start = int(fields[3]) + stop = int(fields[4]) + ftype = ftypeDict[fields[2]] + outline = "%s\t%s%d\t%s\t%d\t%d\t%s\t%s\n" % (geneid, source, index, chrom, start, stop, sense, ftype) + outfile.write(outline) + except: + sys.exit() + + index += 1 + +infile.close() +outfile.close() diff --git a/gointersects.py b/gointersects.py new file mode 100755 index 0000000..0f74727 --- /dev/null +++ b/gointersects.py @@ -0,0 +1,46 @@ +# +# gointersects.py +# ENRAGE +# + +import sys + +print "%s: version 1.0" % sys.argv[0] + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 4: + print "usage: python %s gogidfile gidfile outfile" % argv[0] + sys.exit(1) + + gogidfilename = argv[1] + gidfilename = argv[2] + outfilename = argv[3] + + gointersects(gogidfilename, gidfilename, outfilename) + + +def gointersects(gogidfilename, gidfilename, outfilename): + gidList = [] + gogidfile = open(gogidfilename) + for line in gogidfile: + fields = line.split() + gidList.append(fields[0]) + + gogidfile.close() + + gidfile = open(gidfilename) + outfile = open(outfilename, "w") + for line in gidfile: + fields = line.split() + if fields[0] in gidList: + outfile.write(line) + + gidfile.close() + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/hepg2.rds b/hepg2.rds new file mode 100644 index 0000000..8674f76 Binary files /dev/null and b/hepg2.rds differ diff --git a/intersects.py b/intersects.py new file mode 100755 index 0000000..67e7d35 --- /dev/null +++ b/intersects.py @@ -0,0 +1,149 @@ +# +# intersects.py +# ENRAGE +# + +import sys, optparse + +print 'version 2.0' + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog infile1 infile2 outfile 
[options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("-d", dest="delimiter") + parser.add_option("--file3", dest="infile3") + parser.add_option("-1", type="int", dest="matchfield1") + parser.add_option("-2", type="int", dest="matchfield2") + parser.add_option("-3", type="int", dest="matchfield3") + parser.add_option("-reject1", dest="reject1file") + parser.add_option("-trackGID", action="store_true", dest="trackGID") + parser.set_defaults(delimiter="\t", infile3=None, matchField1=0, matchField2=0, + matchField3=0, rejectFileName="", trackGID=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + infile1 = args[0] + infile2 = args[1] + outfile = args[2] + + intersects(infile1, infile2, outfile, options.delimiter, options.infile3, + options.matchField1, options.matchField2, options.matchField3, + options.rejectFileName, options.trackGID) + + +def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None, + matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False): + + if rejectFileName: + doReject1 = True + reject1file = open(rejectFileName) + else: + doReject1 = False + + if infile3Name is not None: + doFile3 = True + else: + doFile3 = False + + matchedList = [] + matchedList12 = [] + matchedList13 = [] + matchedList23 = [] + gidDict = {} + + if trackGID: + gidKeys = gidDict.keys() + list1, fileGIDDict = getCandidatesAndGIDFromFile(infile1Name, delimiter, matchField1, gidKeys) + for entry in fileGIDDict.keys(): + gidDict[entry] = fileGIDDict[entry] + + gidKeys = gidDict.keys() + list2, fileGIDDict = getCandidatesAndGIDFromFile(infile2Name, delimiter, matchField2, gidKeys) + for entry in fileGIDDict.keys(): + gidDict[entry] = fileGIDDict[entry] + + if doFile3: + gidKeys = gidDict.keys() + list3, fileGIDDict = getCandidatesAndGIDFromFile(infile3Name, delimiter, matchField3, gidKeys) + for entry in fileGIDDict.keys(): + gidDict[entry] = fileGIDDict[entry] + else: + list1 = getCandidateListFromFile(infile1Name, delimiter, matchField1) + list2 = getCandidateListFromFile(infile2Name, delimiter, matchField2) + if doFile3: + list3 = getCandidateListFromFile(infile3Name, delimiter, matchField3) + + for candidate in list1: + if doFile3 and candidate in list2 and candidate in list3: + matchedList.append(candidate) + elif doFile3 and candidate in list3: + matchedList13.append(candidate) + elif doFile3 and candidate in list2: + matchedList12.append(candidate) + elif not doFile3 and candidate in list2: + matchedList.append(candidate) + elif doReject1: + if trackGID: + reject1file.write("%s%s%s\n" % (candidate, delimiter, gidDict[candidate])) + else: + reject1file.write("%s\n" % candidate) + + if doFile3: + for candidate in list2: + if candidate not in list1 and candidate in list3: + matchedList23.append(candidate) + + print len(list1), len(list2), len(list3) + if doFile3: + print len(matchedList12), len(matchedList13), len(matchedList23) + print len(matchedList) + + outfile = open(outfileName, "w") + for match in matchedList: + if trackGID: + outfile.write("%s%s%s\n" % (match, delimiter, gidDict[match])) + else: + outfile.write("%s\n" % match) + + outfile.close() + + +def getCandidatesFromFile(filename, delimiter, matchField, trackGID=False, gidList=[]): + infile = open(filename) + candidateList = [] + gidDict = {} + + for line in infile: + if line[0] == "#": + continue + + fields = line.strip().split(delimiter) + candidate = fields[matchField] + if candidate not in 
candidateList: + candidateList.append(candidate) + + if trackGID and candidate not in gidList: + gidDict[candidate] = fields[matchField + 1] + + infile.close() + return candidateList, gidDict + + +def getCandidatesAndGIDFromFile(filename, delimiter, matchField, gidList=[]): + return getCandidatesFromFile(filename, delimiter, matchField, trackGID=True, gidList=[]) + + +def getCandidateListFromFile(filename, delimiter, matchField): + candidateList, gidDict = getCandidatesFromFile(filename, delimiter, matchField) + return candidateList + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/listGeneFeatures.py b/listGeneFeatures.py new file mode 100755 index 0000000..607100d --- /dev/null +++ b/listGeneFeatures.py @@ -0,0 +1,59 @@ +# +# listGeneFeatures.py +# ENRAGE +# + +import sys +from cistematic.genomes import Genome +from commoncode import getMergedRegions, getFeaturesByChromDict + +print "%s: version 1.1" % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 4: + print "usage: python %s genome [acceptFile] gid outfile\n" % argv[0] + sys.exit(1) + + genome = argv[1] + + if len(argv) == 4: + gid = argv[2] + outfile = argv[3] + else: + acceptFileName = argv[2] + gid = argv[3] + outfile = argv[4] + + listGeneFeatures(genome, gid, outfile, acceptFileName) + + +def listGeneFeatures(genome, gid, outFileName, acceptFileName=""): + hg = Genome(genome) + outfile = open(outFileName, "w") + if acceptFileName: + additionalDict = getMergedRegions(acceptFileName, maxDist = 0, keepLabel = True, verbose = True) + else: + additionalDict = {} + + featuresDict = getFeaturesByChromDict(hg, additionalDict, restrictList=[gid]) + outfile.write('track name="LOC%s"\n' % gid) + + senseDict = {"F": "+", + "R": "-", + "+": "+", + "-": "-" + } + + for chrom in featuresDict: + for (start, stop, fgid, sense, ftype) in featuresDict[chrom]: + outfile.write("chr%s\t%d\t%d\t%s\t0\t%s\n" % (chrom, start, stop, ftype, senseDict[sense])) + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makeGraphs.py b/makeGraphs.py new file mode 100644 index 0000000..3965e5c --- /dev/null +++ b/makeGraphs.py @@ -0,0 +1,127 @@ +import sys, os + + +def getEdges(nodeList, shorten=False): + edgeDict = {} + + for nodeEntry in nodeList: + try: + (node1, node2, count) = nodeEntry.strip().split("\t") + except ValueError: + continue + + if shorten: + try: + node1 = node1.split("_")[1] + except IndexError: + pass + + try: + node2 = node2.split("_")[1] + except IndexError: + pass + + node1Detail = (node1, int(count)) + node2Detail = (node2, int(count)) + try: + if node2Detail not in edgeDict[node1]: + edgeDict[node1].append(node2Detail) + except KeyError: + edgeDict[node1] = [node2Detail] + + try: + if node1Detail not in edgeDict[node2]: + edgeDict[node2].append(node1Detail) + except KeyError: + edgeDict[node2] = [node1Detail] + + return edgeDict + + +def getEdgesFromFile(inFileName, shorten=False): + + infile = open(inFileName) + edgeDict = getEdges(infile, shorten) + infile.close() + + return edgeDict + + +def getOutputLine(currentNode, node, nodeCount): + if nodeCount > 2: + outputLine = '\t"%s" -- "%s" [ label = "%d", penwidth=%d, color="red", constraint=false] ; \n' % (currentNode, node, nodeCount, nodeCount) + else: + outputLine = '\t"%s" -- "%s" [ label = "%d", color="red", constraint=false] ; \n' % (currentNode, node, nodeCount) + + return outputLine + + +infilename = sys.argv[1] +outprefix = sys.argv[2] + 
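# A minimal usage sketch, inferred from the argument handling in this script
# (the file name "edges.txt" and the node names are hypothetical):
#
#   python makeGraphs.py edges.txt myprefix [-shorten]
#
# The input is one tab-separated edge per line, as parsed by getEdges() above:
#
#   nodeA	nodeB	5
#   nodeB	nodeC	1
#
# With -shorten, each node name is reduced to its second "_"-separated token
# (e.g. "sample_geneA" becomes "geneA"). The script writes one Graphviz .gv file
# and one PNG per connected component, plus a "<prefix>.stats" summary.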
+shorten = False +if "-shorten" in sys.argv: + shorten = True + +edgeDict = getEdgesFromFile(infilename, shorten) + +nodeList = edgeDict.keys() +seenNodeDict = {} +seenEdgeDict = {} +currentNodeList = [] +currentEdgeList = [] +treeList = [] +localCount = [] + +outstat = open("%s.stats" % outprefix,"w") +outstat.write("#gID\tnodes\tedges\tweight\n") + +def visitNodes(currentNode): + if currentNode in seenNodeDict: + return + + seenNodeDict[currentNode] = [] + for (node, nodeCount) in edgeDict[currentNode]: + nodePair = [node, currentNode] + nodePair.sort() + if str(nodePair) not in seenEdgeDict: + if node not in currentNodeList: + currentNodeList.append(node) + + outputLine = getOutputLine(currentNode, node, nodeCount) + currentEdgeList.append(outputLine) + seenEdgeDict[str(nodePair)] = 0 + localCount[0] += nodeCount + try: + visitNodes(node) + except: + pass + +print "getting trees" +for node in nodeList: + if node not in seenNodeDict: + currentNodeList = [node] + currentEdgeList = [] + localCount = [0] + outfile = open("%s.%s.gv" % (outprefix, node), "w") + treeList.append(node) + outfile.write("graph g%s {\n" % node) + visitNodes(node) + currentNodeList.sort() + outfile.write('subgraph G0 {\n\t"%s" ' % currentNodeList[0]) + for anode in currentNodeList[1:]: + outfile.write('-- "%s" ' % anode) + + outfile.write(" [ weight = 100 ] ;\n\tordering = out ;\n}\n") + for line in currentEdgeList: + outfile.write(line) + + outfile.write("}\n") + outfile.close() + outstat.write("%s\t%d\t%d\t%d\n" % (node, len(currentNodeList), len(currentEdgeList), localCount[0])) + +print "generating pngs" +for node in treeList: + output = os.popen("dot -Tpng %s.%s.gv > %s.%s.png" % (outprefix, node, outprefix, node)) + +outstat.close() \ No newline at end of file diff --git a/makeSNPtrack.py b/makeSNPtrack.py new file mode 100755 index 0000000..23d8ac9 --- /dev/null +++ b/makeSNPtrack.py @@ -0,0 +1,99 @@ +# +# makeSNPtrack.py +# ENRAGE +# +# This script maps all the qualified SNC sites on to the genome browser +# Output format: bed +# Written by: Wendy Lee +# Written on: August 18th, 2008 +# Last Modified: December 14th, 2008 by Ali Mortazavi + +import sys + +def main(argv=None): + if not argv: + argv = sys.argv + + print "%s: version 1.2" % argv[0] + + if len(argv) < 4: + print "usage: python %s snpfile trackname trackoutfile" % argv[0] + sys.exit(1) + + snpfile = argv[1] + track = argv[2] + outfile = argv[3] + + makeSNPtrack(snpfile, track, outfile) + + +def makeSNPtrack(snpfilename, track, outfilename): + + snpfile = open(snpfilename, "r") + writeSNPsBedfile(snpfile, track, outfilename) + snpfile.close() + + +def writeSNPsBedfile(snpPropertiesList, track, outfilename): + + outfile = open(outfilename, "w") + header = getHeader(track) + outfile.write(header) + + for line in snpPropertiesList: + if doNotProcessLine(line): + continue + + fields = line.strip().split() + outline = getBedOutputLine(fields) + outfile.write(outline) + + outfile.close() + + +def getHeader(track): + header = "track name=%s description=%s visibility=2 itemRgb=\"On\"\n" % (track, track) + return header + + +def doNotProcessLine(line): + return line[0] == "#" + + +def getBedOutputLine(snpPropertiesList): + chromosome = snpPropertiesList[2] + readStart = int(snpPropertiesList[3]) - 1 + readStop = readStart + 1 + readName = snpPropertiesList[7] + color = getSNPColor(readName) + score = "0" + sense = "+" + outline = "%s\t%d\t%d\t%s\t%s\t%s\t-\t-\t\t%s\n" % (chromosome, readStart, readStop, readName, score, sense, color) + + return outline + + 
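# A worked example (hypothetical values; field layout taken from getBedOutputLine
# above): for an input SNP line whose 3rd column is "chr1", 4th column is "12345",
# and 8th column is "A-G", the emitted BED line is
#
#   chr1	12344	12345	A-G	0	+	-	-		255, 0, 0
#
# i.e. the 1-based SNP position becomes a 0-based single-base interval, and the
# "A-G"/"T-C" transitions get the special red/blue colors from getSNPColor() below.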
+def getSNPColor(readName): + baseColor = {"A": "200, 0, 255", + "T": "200, 0, 255", + "C": "200, 0, 255", + "G": "200, 0, 255" + } + + specialColors = {"A-G": "255, 0, 0", + "T-C": "0, 0, 255" + } + + if readName in specialColors.keys(): + color = specialColors[readName] + else: + try: + color = baseColor[readName[-1]] + except (IndexError, KeyError): + color = "200, 0, 255" + + return color + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makebedfromrds.py b/makebedfromrds.py new file mode 100755 index 0000000..924bc7e --- /dev/null +++ b/makebedfromrds.py @@ -0,0 +1,369 @@ +# +# makebedfromrds.py +# ENRAGE +# +# Created by Ali Mortazavi on 7/19/08. +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset + +PLUS_COLOR = "0,0,255" +MINUS_COLOR = "255,0,0" +MULTI_PLUS_COLOR = "64,64,64" +MULTI_MINUS_COLOR = "192,192,192" +SPLICE_COLOR = "255,0,0" +UNIQUE_COLOR = "0,0,0" +MULTI_COLOR = "128,128,128" + + +def main(argv=None): + if not argv: + argv = sys.argv + + verstring = "%prog: version 3.1" + print verstring + + doPairs = False + + usage = "usage: %prog trackLabel rdsFile bamFile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--nouniq", action="store_false", dest="withUniqs") + parser.add_option("--nomulti", action="store_false", dest="withMulti") + parser.add_option("--splices", action="store_true", dest="doSplices") + parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor") + parser.add_option("--flag", dest="withFlag") + parser.add_option("--flaglike", action="store_true", dest="useFlagLike") + parser.add_option("--pairs", type="int", dest="pairDist") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--enforceChr", action="store_true", dest="enforceChr") + parser.add_option("--chrom", action="append", dest="chromList") + parser.add_option("--strand", dest="strand") + parser.add_option("-r", "--region", dest="region", type="string", + help="samtools region string") + parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, doSpliceColor=False, + pairDist=None, withFlag="", useFlagLike=False, enforceChr=False, + senseStrand="", allChrom=True, doCache=False, cachePages=100000, + chromList=[]) + (options, args) = parser.parse_args(argv[1:]) + + try: + trackType = args[0] + except IndexError: + print "no track specified - see --help for usage" + sys.exit(1) + + try: + rdsfile = args[1] + except IndexError: + print "no RDS file specified - see --help for usage" + sys.exit(1) + + try: + outfilename = args[2] + except IndexError: + print "no output file specified - see --help for usage" + sys.exit(1) + + if options.pairDist is not None: + doPairs = True + + if options.chromList: + options.allChrom = False + + outputBedFromRds(trackType, rdsfile, outfilename, options.withUniqs, options.withMulti, + options.doSplices, options.doSpliceColor, doPairs, options.pairDist, + options.withFlag, options.useFlagLike, options.enforceChr, options.senseStrand, + options.allChrom, options.doCache, options.cachePages, options.chromList) + + +def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=True, + doSplices=False, doSpliceColor=False, doPairs=False, pairDist=1000000, + withFlag="", useFlagLike=False, enforceChr=False, senseStrand="", + allChrom=True, doCache=False, cachePages=100000, chromList=[]): + + if not withUniqs and not withMulti and not doSplices: + print "must be outputing 
at least one of uniqs, multi, or -splices - exiting" + sys.exit(1) + + print "\nsample:" + RDS = readDataset(rdsfile, verbose = True, cache=doCache) + + #check that this is better than the dataset's default cache size + if cachePages > RDS.getDefaultCacheSize(): + RDS.setDBcache(cachePages) + + readlength = RDS.getReadSize() + minDist = -1 * readlength + + if allChrom: + if withUniqs: + chromList = RDS.getChromosomes() + elif withMulti: + chromList = RDS.getChromosomes(table="multi") + else: + chromList = RDS.getChromosomes(table="splices") + + chromList.sort() + + outfile = open(outfilename, "w") + outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % (trackType)) + + if withUniqs or withMulti: + for achrom in chromList: + index = 0 + if doNotOutputChromosome(achrom, enforceChr): + continue + + print "chromosome %s" % (achrom) + + if doPairs: + hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, + withWeight=True, withPairID=True, doUniqs=withUniqs, + doMulti=withMulti, readIDDict=True, + flagLike=useFlagLike, strand=senseStrand) + + readIDList = hitDict.keys() + if doSplices: + spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, + withPairID=True, readIDDict=True, + flagLike=useFlagLike, strand=senseStrand) + + spliceIDList = spliceDict.keys() + combDict = {} + for readID in readIDList: + combDict[readID] = 1 + + for readID in spliceIDList: + combDict[readID] = 1 + + combinedIDList = combDict.keys() + else: + combinedIDList = readIDList + + for readID in combinedIDList: + localList = [] + try: + localList = hitDict[readID] + except: + pass + + if doSplices: + try: + localList += spliceDict[readID] + except: + pass + + localList.sort() + listLen = len(localList) - 1 + localIndex = 0 + while localIndex <= listLen: + try: + (leftpos, leftsense, leftweight, lPairID) = localList[localIndex] + leftstop = leftpos + readlength - 1 + lpart = 1 + startList = [leftpos] + stopList = [leftstop] + except: + (leftpos, LLstop, LRstart, leftstop, leftsense, lPairID) = localList[localIndex] + leftweight = 1.0 + lpart = 2 + startList = [leftpos, LRstart] + stopList = [LLstop, leftstop] + + if localIndex < listLen: + try: + (rightpos, rightsense, rightweight, rPairID) = localList[localIndex + 1] + rightstop = rightpos + readlength - 1 + rpart = 1 + rstartList = [rightpos] + rstopList = [rightstop] + except: + (rightpos, RLstop, RRstart, rightstop, rightsense, rPairID) = localList[localIndex + 1] + rightweight = 1.0 + rpart = 2 + rstartList = [rightpos, RRstart] + rstopList = [RLstop, rightstop] + else: + rightsense = "+" + rightpos = 0 + rstartList = [] + rstopList = [] + + if leftsense == "+" and rightsense == "-" and minDist < (rightpos - leftstop) < pairDist and lPairID != rPairID: + if doSpliceColor: + plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1") + elif leftweight == 1.0 or rightweight == 1.0: + plusSenseColor = "0,0,0" + minusSenseColor = MINUS_COLOR + else: + plusSenseColor = "128,128,128" + minusSenseColor = MULTI_MINUS_COLOR + + splitReadWrite(outfile, achrom, lpart + rpart, startList + rstartList, stopList + rstopList, "+", readID, plusSenseColor, minusSenseColor) + localIndex += 2 + index += 2 + else: + if doSpliceColor: + plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight) + outputSense = "+" + elif leftweight == 1.0: + plusSenseColor = PLUS_COLOR + minusSenseColor = MINUS_COLOR + outputSense = leftsense + else: + plusSenseColor = PLUS_COLOR + 
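                        # note: this branch is reached for multireads (leftweight < 1.0) but
                        # currently assigns the same solid colors as the unique-read branch
                        # above; MULTI_PLUS_COLOR / MULTI_MINUS_COLOR may have been intended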
minusSenseColor = MINUS_COLOR + outputSense = leftsense + + splitReadWrite(outfile, achrom, lpart, startList, stopList, outputSense, readID, plusSenseColor, minusSenseColor) + localIndex += 1 + index += 1 + else: + hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, withWeight=True, withID=True, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, flagLike=useFlagLike) + try: + for (pos, sense, weight, readID) in hitDict[achrom]: + splitReadWrite(outfile, achrom, 1, [pos], [pos + readlength - 1], sense, readID, PLUS_COLOR, MINUS_COLOR) + index += 1 + except: + pass + + if doSplices: + spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike) + if achrom not in spliceDict: + continue + for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]: + splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR) + index += 1 + + elif doSplices: + for achrom in chromList: + index = 0 + if doNotOutputChromosome(achrom, enforceChr): + continue + + print "chromosome %s" % (achrom) + + spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike) + if achrom not in spliceDict: + continue + for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]: + splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR) + index += 1 + + print index + + outfile.close() + + +def singleReadWrite(chrom, pos, sense, weight, readID, readlength, outfile): + start = pos + stop = pos + readlength - 1 + senseColor = getSenseColor(sense, weight) + outfile.write("%s %d %d %s %.1f %s 0 0 %s\n" % (chrom, start, stop, readID, weight, sense, senseColor)) + + +def getSenseColor(sense, weight): + if weight < 1.0: + senseColor = getMultiSenseColor(sense) + else: + senseColor = getSingleSenseColor(sense) + + return senseColor + + +def getMultiSenseColor(sense): + if sense == "+": + senseColor = MULTI_PLUS_COLOR + else: + senseColor = MULTI_MINUS_COLOR + + return senseColor + + +def getSingleSenseColor(sense): + if sense == "+": + senseColor = PLUS_COLOR + else: + senseColor = MINUS_COLOR + + return senseColor + + +def splitReadWrite(outfile, chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense): + readSizes = getReadSizes(numPieces, startList, stopList) + readCoords = getReadCoords(numPieces, startList) + leftStart = startList[0] + rightStop = stopList[-1] + + if rsense == "+": + senseCode = plusSense + else: + senseCode = minusSense + + outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords) + outfile.write(outline) + + +def getReadSizes(numPieces, startList, stopList): + readSizes = "%d" % (stopList[0] - startList[0]) + for index in range(1, numPieces): + readSizes += ',%d' % (stopList[index] - startList[index]) + + return readSizes + + +def getReadCoords(numPieces, startList): + readCoords = "0" + for index in range(1, numPieces): + readCoords += ",%d" % (startList[index] - startList[0]) + + return readCoords + + +def getSpliceColor(lpart, rpart, leftweight, rightweight, hackType=None): + if hackType == "1": + if (lpart + rpart) > 2: + aColor = SPLICE_COLOR + bColor = SPLICE_COLOR + elif leftweight == 1.0 or rightweight == 1.0: + aColor = UNIQUE_COLOR + bColor = UNIQUE_COLOR + else: + aColor = MULTI_COLOR + bColor = 
MULTI_COLOR + else: + if lpart > 1: + aColor = SPLICE_COLOR + bColor = SPLICE_COLOR + elif leftweight == 1.0: + aColor = UNIQUE_COLOR + bColor = UNIQUE_COLOR + else: + aColor = MULTI_COLOR + bColor = MULTI_COLOR + + return aColor, bColor + + +def doNotOutputChromosome(achrom, enforceChr): + result = False + + if achrom == "chrM": + result = True + + if enforceChr and ("chr" not in achrom): + result = True + + return result + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makerdsfrombed.py b/makerdsfrombed.py new file mode 100755 index 0000000..4f38d51 --- /dev/null +++ b/makerdsfrombed.py @@ -0,0 +1,126 @@ +# +# makerdsfrombed.py +# ENRAGE +# +# Created by Ali Mortazavi on 6/21/08. +# +try: + import psyco + psyco.full() +except: + pass + +import sys, string, optparse +from commoncode import readDataset, writeLog + +verstring = "%prog: version 2.1" % sys.argv[0] +print verstring + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--append", action="store_false", dest="init") + parser.add_option("--index", action="store_true", dest="doIndex") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--RNA", action="store_true", dest="rnaDataType") + parser.set_defaults(init=True, rnaDataType=False, doIndex=False, cachePages=100000) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + print "\ntreats all imported reads as uniquely mapped\n" + sys.exit(1) + + label = args[0] + filename = args[1] + outdbname = args[2] + + if options.rnaDataType: + dataType = "RNA" + else: + dataType = "DNA" + + propertyList = [] + for arg in args: + if "::" in arg: + (pname, pvalue) = arg.strip().split("::") + propertyList.append((pname, pvalue)) + + makerdsfrombed(label, filename, outdbname, options.init, dataType, options.doIndex, options.cachePages, propertyList) + + +def makerdsfrombed(label, filename, outdbname, init=True, dataType="DNA", doIndex=False, cachePages=100000, propertyList=[]): + readsize = 0 + padsize = 0 + index = 0 + insertSize = 100000 + + writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:])) + + infile = open(filename,"r") + + rds = readDataset(outdbname, init, dataType, verbose=True) + if not init: + rds.dropIndex() + + #check that our cacheSize is better than the dataset's default cache size + defaultCacheSize = rds.getDefaultCacheSize() + if cachePages > defaultCacheSize: + if init: + rds.setDBcache(cachePages, default=True) + else: + rds.setDBcache(cachePages) + + if len(propertyList) > 0: + rds.insertMetadata(propertyList) + + insertList = [] + for line in infile: + if "track" in line: + continue + + fields = line.split() + if readsize == 0: + readsize = abs(int(fields[1]) - int(fields[2])) + if init: + rds.insertMetadata([("readsize", readsize+1)]) + rds.insertMetadata([("imported_from_bed", "True")]) + + chrom = fields[0] + start = int(fields[1]) + stop = int(fields[2]) + sense = fields[5] + readID = "%s-%s" % (label, str(index)) + insertList.append((readID, chrom, start, stop, sense, 1.0, "", "")) + if index % insertSize == 0: + rds.insertUniqs(insertList) + insertList = [] + print ".", + sys.stdout.flush() + + index += 1 + + if len(insertList) > 0: + rds.insertUniqs(insertList) + + countString = "%d unique reads" % index + print countString + + writeLog(outdbname + 
".log", verstring, countString) + + if doIndex: + print "building index...." + if cachePages > defaultCacheSize: + rds.setDBcache(cachePages) + rds.buildIndex(cachePages) + else: + rds.buildIndex(defaultCacheSize) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makerdsfromblat.py b/makerdsfromblat.py new file mode 100755 index 0000000..f92d5f5 --- /dev/null +++ b/makerdsfromblat.py @@ -0,0 +1,362 @@ +# +# makerdsfromblat.py +# ENRAGE +# +# Created by Ali Mortazavi on 12/7/08. +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, string, optparse +from commoncode import readDataset, writeLog + +verstring = "%prog: version 3.9" +print verstring + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--append", action="store_false", dest="init") + parser.add_option("--index", action="store_true", dest="doIndex") + parser.add_option("--rawreadID", action="store_false", dest="trimReadID") + parser.add_option("--forceRNA", action="store_true", dest="forceRNA") + parser.add_option("--flag", action="store_true", dest="flagReads") + parser.add_option("--strict", type="int", dest="minSpliceLength", + help="min required bp on each side of a splice") + parser.add_option("--spliceonly", action="store_true", dest="spliceOnly") + parser.add_option("--verbose", action="store_true", dest="verbose") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--RNA", dest="geneDataFileName") + parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="") + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + label = args[0] + filename = args[1] + outdbname = args[2] + + if options.geneDataFileName: + dataType = "RNA" + else: + dataType = "DNA" + + theFlag = "" + if options.flagReads: + theFlag = "blat" + + propertyList = [] + for arg in args: + if "::" in arg: + (pname, pvalue) = arg.strip().split("::") + propertyList.append((pname, pvalue)) + + makerdsfromblat(label, filename, outdbname, dataType, options.init, + options.doIndex, options.trimReadID, options. 
minSpliceLength, + options.forceRNA, theFlag, options.spliceOnly, options.verbose, + options.cachePages, options.geneDataFileName, propertyList) + + +def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True, + doIndex=False,trimReadID=True, minSpliceLength=0, + forceRNA=False, theFlag="", spliceOnly=False, + verbose=False, cachePages=100000, geneDataFileName="", + propertyList=[]): + + delimiter = "|" + minIntron = 10 + maxBorder = 0 + index = 0 + insertSize = 100000 + + if forceRNA: + print "forcing datatype to RNA" + dataType = "RNA" + + if dataType == "RNA": + genedatafile = open(geneDataFileName) + + writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:])) + + geneDict = {} + mapDict = {} + if dataType == "RNA" and not forceRNA: + for line in genedatafile: + fields = line.strip().split("\t") + blockCount = int(fields[7]) + if blockCount < 2: + continue + + uname = fields[0] + chrom = fields[1] + sense = fields[2] + chromstarts = fields[8][:-1].split(",") + chromstops = fields[9][:-1].split(",") + exonLengths = [] + totalLength = 0 + for index in range(blockCount): + chromstarts[index] = int(chromstarts[index]) + chromstops[index] = int(chromstops[index]) + exonLengths.append(chromstops[index] - chromstarts[index]) + totalLength += exonLengths[index] + + geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths) + mapDict[uname] = [] + + genedatafile.close() + + rds = readDataset(outdbname, init, dataType, verbose=True) + + #check that our cacheSize is better than the dataset's default cache size + defaultCacheSize = rds.getDefaultCacheSize() + if cachePages > defaultCacheSize: + if init: + rds.setDBcache(cachePages, default=True) + else: + rds.setDBcache(cachePages) + + if not init and doIndex: + try: + if rds.hasIndex(): + rds.dropIndex() + except: + if verbose: + print "couldn't drop Index" + + + if len(propertyList) > 0: + rds.insertMetadata(propertyList) + + # make some assumptions based on first read + infile = open(filename, "r") + for arg in range(6): + line = infile.readline() + + fields = line.split() + readsize = int(fields[10]) + pairedTest = fields[9][-2:] + paired = False + if pairedTest in ["/1", "/2"]: + print "assuming reads are paired" + paired = True + + print "read size: %d bp" % readsize + if init: + rds.insertMetadata([("readsize", readsize)]) + if paired: + rds.insertMetadata([("paired", "True")]) + + infile.close() + if "blat_mapped" not in rds.getMetadata(): + rds.insertMetadata([("blat_mapped", "True")]) + + minReadScore = readsize - readsize/25 - 1 + trim = -4 + if dataType == "RNA": + maxBorder = readsize + trim + + infile = open(filename, "r") + prevID = "" + readList = [] + uInsertList = [] + mInsertList = [] + sInsertList = [] + index = uIndex = mIndex = sIndex = lIndex = 0 + bestScore = 0 + # skip headers + for arg in range(5): + line = infile.readline() + + for line in infile: + lIndex += 1 + fields = line.strip().split() + readID = fields[9] + if trimReadID: + readID = string.join(readID.split(":")[1:], ":") + + if readID != prevID: + newReadList = [] + if bestScore > minReadScore: + for readData in readList: + if readData[1] == bestScore: + newReadList.append(readData) + + if trimReadID: + prevID = label + "-" + prevID + + listlen = len(newReadList) + if listlen == 1: + parts = int(newReadList[0][0]) + if parts == 1 and not spliceOnly: + (part, score, sense, chrom, start, mismatches) = newReadList[0] + stop = start + readsize + uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, 
mismatches)) + uIndex += 1 + elif forceRNA and parts == 2: + (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0] + startL = int(startList[0]) + stopL = startL + int(lengthList[0]) + startR = int(startList[1]) + stopR = startR + int(lengthList[1]) + if stopL + minIntron < startR: + sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches)) + sIndex += 1 + elif parts == 2: + print newReadList + (part, score, sense, chrom, start, mismatches) = newReadList[0] + currentSplice = chrom + (model, spliceID, regionStart) = currentSplice.split(delimiter) + if model not in geneDict: + print fields + continue + + (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model] + spliceID = int(spliceID) + rstart = int(start) - 2 + lefthalf = maxBorder - rstart + if lefthalf < 1 or lefthalf > maxBorder: + continue + + righthalf = readsize - lefthalf + startL = int(regionStart) + rstart + stopL = startL + lefthalf + startR = chromstarts[spliceID + 1] + stopR = chromstarts[spliceID + 1] + righthalf + if stopL + minIntron < startR: + sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches)) + sIndex += 1 + elif listlen > 1 and not spliceOnly: + prevID = prevID + "::" + str(listlen) + mIndex += 1 + # ignore multireads that can also map across splices + skip = False + for readData in newReadList: + if readData[0] > 1: + skip = True + + if not skip: + for (part, score, sense, chrom, start, mismatches) in newReadList: + stop = start + readsize + mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches)) + else: + prevID = readID + + if index % insertSize == 0: + rds.insertUniqs(uInsertList) + rds.insertMulti(mInsertList) + uInsertList = [] + mInsertList = [] + if dataType == "RNA": + rds.insertSplices(sInsertList) + sInsertList = [] + + print ".", + sys.stdout.flush() + + # start processing new read + readList = [] + prevID = readID + bestScore = 0 + index += 1 + + # add the new read + score = int(fields[0]) + sense = fields[8] + chrom = fields[13] + parts = int(fields[17]) + passStrict = True + if parts > 1: + lengthList = fields[18][:-1].split(",") + startList = fields[20][:-1].split(",") + listlen = len(lengthList) + for lpos in range(listlen): + if int(lengthList[lpos]) < minSpliceLength: + passStrict = False + + # throw out deletions, for now + if lpos > 0: + if int(lengthList[lpos - 1]) == int(startList[lpos]): + passStrict = False + pass + else: + start = int(fields[15]) + + if passStrict: + if score > bestScore: + bestScore = score + + mismatches = "" + if int(fields[1]) > 0: + try: + mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense) + except: + mismatches = "" + + if parts == 1: + readList.append((parts, score, sense, chrom, start, mismatches)) + else: + readList.append((parts, score, sense, chrom, startList, lengthList, mismatches)) + + if lIndex % 1000000 == 0: + print "processed %d lines" % lIndex + + print "%d lines processed" % lIndex + + if len(uInsertList) > 0: + rds.insertUniqs(uInsertList) + if len(mInsertList) > 0: + rds.insertMulti(mInsertList) + if len(sInsertList) > 0: + rds.insertSplices(sInsertList) + + combString = "%d unique reads" % uIndex + combString += "\t%d multi reads" % mIndex + if dataType == "RNA": + combString += "\t%d spliced reads" % sIndex + + print + print combString.replace("\t", "\n") + + writeLog(outdbname + ".log", verstring, combString) + + if doIndex: + print "building 
index...." + if cachePages > defaultCacheSize: + rds.setDBcache(cachePages) + rds.buildIndex(cachePages) + else: + rds.buildIndex(defaultCacheSize) + + +def decodeMismatches(gString, rString, rsense): + + output = [] + rlen = len(gString) + partIndex = 0 + for rindex in xrange(rlen): + if gString == ",": + partIndex += 1 + + if gString[rindex] == rString[rindex]: + continue + + genNT = gString[rindex] + readNT = rString[rindex] + # for eland-compatibility, we are 1-based + output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT)) + + return string.join(output, ",") + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makerdsfrombowtie.py b/makerdsfrombowtie.py new file mode 100755 index 0000000..3534a88 --- /dev/null +++ b/makerdsfrombowtie.py @@ -0,0 +1,332 @@ +# +# makerdsfrombowtie.py +# ENRAGE +# +# Created by Ali Mortazavi on 10/20/08. +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, string, optparse +from commoncode import readDataset, writeLog + +verstring = "%prog: version 4.1" +print verstring + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--RNA", dest="genedatafilename") + parser.add_option("--append", action="store_false", dest="init") + parser.add_option("--index", action="store_true", dest="doIndex") + parser.add_option("--spacer", type="int", dest="spacer") + parser.add_option("--rawreadID", action="store_false", dest="trimReadID") + parser.add_option("--forcepair", type="int", dest="forceID") + parser.add_option("--flip", action="store_true", dest="flip") + parser.add_option("--verbose", action="store_true", dest="verbose") + parser.add_option("--strip", action="store_true", dest="stripSpace") + parser.add_option("--cache", type="int", dest="cachePages") + parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2, + trimReadID=True, forceID=None, flip=False, verbose=False, + stripSpace=False, cachePages=100000) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + label = args[0] + filename = args[1] + outdbname = args[2] + + propertyList = [] + for arg in args: + if "::" in arg: + (pname, pvalue) = arg.strip().split("::") + propertyList.append((pname, pvalue)) + + makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init, + options.doIndex, options.spacer, options.trimReadID, options.forceID, + options.flip, options.verbose, options.stripSpace, options.cachePages, + propertyList) + + +def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True, + doIndex=False, spacer=2, trimReadID=True, forceID=None, + flip=False, verbose=False, stripSpace=False, cachePages=100000, + propertyList=[]): + + delimiter = "|" + + dataType = "DNA" + if genedatafilename is not None: + dataType = "RNA" + genedatafile = open(genedatafilename) + + + forcePair = False + if forceID is not None: + forcePair = True + else: + forceID = 0 + + maxBorder = 0 + index = 0 + insertSize = 100000 + + writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:])) + + geneDict = {} + mapDict = {} + if dataType == "RNA": + for line in genedatafile: + fields = line.strip().split("\t") + blockCount = int(fields[7]) + if blockCount < 2: + continue + + uname = fields[0] + chrom = fields[1] + sense = fields[2] + chromstarts = fields[8][:-1].split(",") 
+ chromstops = fields[9][:-1].split(",") + exonLengths = [] + totalLength = 0 + for index in range(blockCount): + chromstarts[index] = int(chromstarts[index]) + chromstops[index] = int(chromstops[index]) + exonLengths.append(chromstops[index] - chromstarts[index]) + totalLength += exonLengths[index] + + geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths) + mapDict[uname] = [] + + genedatafile.close() + + rds = readDataset(outdbname, init, dataType, verbose=True) + + #check that our cacheSize is better than the dataset's default cache size + defaultCacheSize = rds.getDefaultCacheSize() + if cachePages > defaultCacheSize: + if init: + rds.setDBcache(cachePages, default=True) + else: + rds.setDBcache(cachePages) + + if not init and doIndex: + try: + if rds.hasIndex(): + rds.dropIndex() + except: + if verbose: + print "couldn't drop Index" + + if len(propertyList) > 0: + rds.insertMetadata(propertyList) + + # make some assumptions based on first read + infile = open(filename, "r") + line = infile.readline() + if stripSpace: + line = line.replace(" ","") + + fields = line.split() + readsize = len(fields[5]) + pairedTest = fields[0][-2:] + paired = False + if pairedTest in ["/1", "/2"] or forcePair: + print "assuming reads are paired" + paired = True + + + print "read size: %d bp" % readsize + if init: + rds.insertMetadata([("readsize", readsize)]) + if paired: + rds.insertMetadata([("paired", "True")]) + + if "bowtie_mapped" not in rds.getMetadata(): + rds.insertMetadata([("bowtie_mapped", "True")]) + + if dataType == "RNA" and "spacer" not in rds.getMetadata(): + rds.insertMetadata([("spacer", spacer)]) + + infile.close() + + trim = -4 + if dataType == "RNA": + maxBorder = readsize + trim + + infile = open(filename, "r") + prevID = "" + readList = [] + uInsertList = [] + mInsertList = [] + sInsertList = [] + index = uIndex = mIndex = sIndex = lIndex = 0 + for line in infile: + lIndex += 1 + if stripSpace: + line = line.replace(" ","") + + fields = line.strip().split() + readID = fields[0] + if trimReadID: + readID = string.join(readID.split(":")[1:], ":") + + if readID != prevID: + listlen = len(readList) + if trimReadID: + prevID = "%s-%s" % (label, prevID) + + if forcePair: + prevID += "/%d" % forceID + + if listlen == 1: + (sense, chrom, start, mismatches) = readList[0] + if flip: + if sense == "+": + sense = "-" + else: + sense = "+" + + if "|" not in chrom: + stop = start + readsize + uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches)) + uIndex += 1 + elif dataType == "RNA": + currentSplice = chrom + (model, spliceID, regionStart) = currentSplice.split(delimiter) + if model not in geneDict: + prevID = readID + else: + (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model] + spliceID = int(spliceID) + rstart = int(start) - spacer + lefthalf = maxBorder - rstart + if lefthalf < 1 or lefthalf > maxBorder: + prevID = readID + else: + righthalf = readsize - lefthalf + startL = int(regionStart) + rstart + stopL = startL + lefthalf + startR = chromstarts[spliceID + 1] + stopR = chromstarts[spliceID + 1] + righthalf + sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)) + sIndex += 1 + elif listlen > 1: + prevID = "%s::%s" % (prevID, str(listlen)) + mIndex += 1 + # ignore multireads that can also map across splices + skip = False + for (sense, chrom, start, mismatches) in readList: + if "|" in chrom: + skip = True + + if not skip: + for (sense, chrom, start, mismatches) 
in readList: + stop = start + readsize + if flip: + if sense == "+": + sense = "-" + else: + sense = "+" + + mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches)) + else: + prevID = readID + + if index % insertSize == 0: + rds.insertUniqs(uInsertList) + rds.insertMulti(mInsertList) + uInsertList = [] + mInsertList = [] + if dataType == "RNA": + rds.insertSplices(sInsertList) + sInsertList = [] + + print ".", + sys.stdout.flush() + + # start processing new read + readList = [] + prevID = readID + index += 1 + + # add the new read + sense = fields[1] + chrom = fields[2] + # for eland compat, we are 1-based + start = int(fields[3]) + 1 + mismatches = "" + if ":" in fields[-1]: + mismatches = decodeMismatches(fields[-1], sense) + + readList.append((sense, chrom, start, mismatches)) + if lIndex % 1000000 == 0: + print "processed %d lines" % lIndex + + print "%d lines processed" % lIndex + + if len(uInsertList) > 0: + rds.insertUniqs(uInsertList) + + if len(mInsertList) > 0: + rds.insertMulti(mInsertList) + + if len(sInsertList) > 0: + rds.insertSplices(sInsertList) + + combString = "%d unique reads" % uIndex + combString += "\t%d multi reads" % mIndex + if dataType == "RNA": + combString += "\t%d spliced reads" % sIndex + + print + print combString.replace("\t", "\n") + + writeLog("%s.log" % outdbname, verstring, combString) + + if doIndex: + print "building index...." + if cachePages > defaultCacheSize: + rds.setDBcache(cachePages) + rds.buildIndex(cachePages) + else: + rds.buildIndex(defaultCacheSize) + + +def decodeMismatches(mString, rsense): + complement = {"A": "T", + "T": "A", + "C": "G", + "G": "C", + "N": "N" + } + + output = [] + mismatches = mString.split(",") + for mismatch in mismatches: + (pos,change) = mismatch.split(":") + (genNT, readNT) = change.split(">") + if rsense == "-": + readNT = complement[readNT] + genNT = complement[genNT] + + elandCompatiblePos = int(pos) + 1 + output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT)) + + return string.join(output, ",") + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makerdsfromeland2.py b/makerdsfromeland2.py new file mode 100755 index 0000000..317ceda --- /dev/null +++ b/makerdsfromeland2.py @@ -0,0 +1,670 @@ +# +# makerdsfromeland2.py +# ENRAGE +# +try: + import psyco + psyco.full() +except: + pass + +import sys, string, optparse +from commoncode import readDataset + +def main(argv=None): + if not argv: + argv = sys.argv + + verstring = "%prog: version 3.4" + print verstring + + usage = "usage: %prog label infilename outrdsfile [propertyName::propertyValue] [options]\ + \ninput reads must be sorted to properly record multireads" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--append", action="store_false", dest="init", + help="append to existing rds file [default: create new]") + parser.add_option("--RNA", dest="geneDataFileName", + help="set data type to RNA [default: DNA]") + parser.add_option("--index", action="store_true", dest="doIndex", + help="index the output rds file") + parser.add_option("--cache", type="int", dest="cachePages", + help="number of cache pages to use [default: 100000") + parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter", + help="use : as the delimiter") + parser.add_option("--paired", dest="pairID", + help="pairID value") + parser.add_option("--extended", action="store_true", dest="extended", + help="use eland_extended input") + parser.add_option("--verbose", 
action="store_true", dest="verbose") + parser.add_option("--maxlines", type="int", dest="maxLines", + help="[default: 1000000000") + parser.set_defaults(init=True, doIndex=False, cachePages=100000, geneDataFileName=None, useOldDelimiter=False, pairID=None, maxLines=1000000000, extended=False, verbose=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + label = args[0] + filename = args[1] + outdbname = args[2] + + delimiter = '|' + if options.useOldDelimiter: + delimiter = ':' + + paired = False + pairID = '1' + if options.pairID is not None: + paired = True + if options.pairID not in ['1','2']: + print 'pairID value must be 1 or 2' + sys.exit(-1) + + print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID) + + dataType = 'DNA' + if options.geneDataFileName is not None: + dataType = 'RNA' + + makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init, options.pairID, dataType, options.geneDataFileName, options.cachePages, options.maxLines, options.extended, options.verbose) + + +def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False, init=True, pairID="1", dataType="DNA", geneDataFileName=None, cachePages=100000, maxLines=1000000000, extended=False, verbose=False): + maxBorder = 0 + index = 0 + insertSize = 100000 + + geneDict = {} + mapDict = {} + seenSpliceList = [] + if dataType == 'RNA': + genedatafile = open(geneDataFileName) + for line in genedatafile: + fields = line.strip().split('\t') + blockCount = int(fields[7]) + if blockCount < 2: + continue + + uname = fields[0] + chrom = fields[1] + sense = fields[2] + chromstarts = fields[8][:-1].split(',') + chromstops = fields[9][:-1].split(',') + exonLengths = [] + totalLength = 0 + for index in range(blockCount): + chromstarts[index] = int(chromstarts[index]) + chromstops[index] = int(chromstops[index]) + exonLengths.append(chromstops[index] - chromstarts[index]) + totalLength += exonLengths[index] + + geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths) + mapDict[uname] = [] + genedatafile.close() + + rds = readDataset(outdbname, init, dataType, verbose=True) + + if cachePages > rds.getDefaultCacheSize(): + if init: + rds.setDBcache(cachePages, default=True) + else: + rds.setDBcache(cachePages) + + if not init and doIndex: + try: + if rds.hasIndex(): + rds.dropIndex() + except: + if verbose: + print "couldn't drop Index" + + propertyList = [] + for arg in sys.argv: + if '::' in arg: + (pname, pvalue) = arg.strip().split('::') + if pname == 'flowcell' and paired: + pvalue = pvalue + '/' + pairID + + propertyList.append((pname, pvalue)) + + if len(propertyList) > 0: + rds.insertMetadata(propertyList) + + infile = open(filename,'r') + line = infile.readline() + fields = line.split() + readsize = len(fields[1]) + readsizeString = str(readsize) + if dataType == 'RNA' and readsize > 32: + splicesizeString = '32' + else: + splicesizeString = readsizeString + + print 'read size: %d bp' % readsize + if init: + rds.insertMetadata([('readsize', readsize)]) + rds.insertMetadata([('eland_mapped', 'True')]) + if extended: + rds.insertMetadata([('eland_extended', 'True')]) + + if paired: + rds.insertMetadata([('paired', 'True')]) + + trim = -4 + if dataType == 'RNA': + maxBorder = readsize + trim + + insertList = [] + infile = open(filename,'r') + print 'mapping unique reads...' 
+ lineIndex = 0 + for line in infile: + lineIndex += 1 + if lineIndex > maxLines: + break + + fields = line.split() + if fields[2] in ['QC','NM']: + continue + + (matchType, bestMatch) = getUniqueMatch(fields[2]) + if matchType == -1: + continue + + bestpos = [] + try: + pos = fields[3].split(',') + except: + if verbose: + print 'problem with line: %s' % line.strip() + continue + + matchDict = {0:[], 1:[], 2:[], 3:[]} + if len(pos) == 1: + if 'splice' in pos: + continue + + bestpos = pos + else: + currentChr = '' + for apos in pos: + if 'splice' in apos: + continue + + if ':' in apos: + (front, back) = apos.split(':') + currentChr = front + else: + back = apos + apos = currentChr + ':' + apos + + if extended: + matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T') + if matchType > 2: + matchType = 3 + else: + matchType = int(apos[-1]) + + matchDict[matchType].append(apos) + if bestMatch[matchType]: + bestpos.append(apos) + + # for padded reads, mapped read might have more mismatches! + if len(bestpos) == 0: + # let's not worry about these yet. + if 'splice' in line: + continue + + for matchType in [1, 2, 3]: + if len(matchDict[matchType]) > 0: + if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]: + bestpos = matchDict[matchType] + break + + if len(bestpos) == 0 and verbose: + print "couldn't pick best read from line: %s" % line + + for apos in bestpos: + try: + (chrom, back) = apos.split(':') + except: + continue + + if 'splice' in chrom: + continue + + if '/' in chrom: + chromfields = chrom.split('/') + chrom = chromfields[-1] + + if '.' in chrom: + try: + (chrom, fileExt) = chrom.split('.') + except: + if verbose: + print 'problem with chromosome on line %s' % line.strip() + + continue + + if extended: + if 'F' in back: + sense = '+' + (start, matchPart) = back.split('F') + else: + sense = '-' + (start, matchPart) = back.split('R') + + start = int(start) + if matchPart == readsizeString: + matchType = '' + else: + matchType = decodeMismatches(fields[1], matchPart) + else: + start = int(back[:-2]) + if back[-2] == 'F': + sense = '+' + else: + sense = '-' + + stop = int(start) + readsize - 1 + if paired: + readID = label + '-' + str(lineIndex) + '/' + pairID + else: + readID = label + '-' + str(index) + + if len(chrom) > 0: + insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType)) + + if index % insertSize == 0: + rds.insertUniqs(insertList) + insertList = [] + print '.', + sys.stdout.flush() + + index += 1 + + if len(insertList) > 0: + rds.insertUniqs(insertList) + insertList = [] + + print + print '%d unique reads' % index + infile.close() + + if dataType == 'RNA': + print 'mapping splices...' 
+ index = 0 + lineIndex = 0 + mapfile = open(filename,'r') + for line in mapfile: + lineIndex += 1 + if lineIndex > maxLines: + break + + if 'splice' not in line: + continue + + fields = line.strip().split() + (matchType, bestMatch) = getUniqueMatch(fields[2]) + if matchType == -1: + continue + + bestpos = [] + pos = fields[3].split(',') + matchDict = {0:[], 1:[], 2:[], 3:[]} + if len(pos) == 1: + if 'chr' in pos: + continue + + bestpos = pos + else: + currentSplice = '' + for apos in pos: + if 'splice' not in apos: + continue + + if ':' in apos: + if delimiter == ':': + try: + (extmodel, spliceID, regionStart, thepos) = apos.split(':') + except: + try: + (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':') + extmodel = extmodel1 + ':' + extmodel2 + except: + print 'warning: could not process splice %s' % apos + continue + + currentSplice = extmodel + ':' + spliceID + ':' + regionStart + else: + try: + (currentSplice, thepos) = apos.split(':') + except: + try: + (extmodel1, restSplice, thepos) = apos.split(':') + currentSplice = extmodel1 + ':' + restSplice + (extmodel, spliceID, regionStart) = currentSplice.split(delimiter) + except: + print 'warning: could not process splice %s' % apos + continue + else: + thepos = apos + apos = currentSplice + ':' + apos + + if extended: + matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T') + if matchType > 2: + matchType = 3 + + # if readsize > 32, we risk loosing pefect matches that go beyond our expanded genome splices, so only ask for 32bp match + if thepos[:2] == splicesizeString: + matchType = 0 + else: + matchType = int(apos[-1]) + + if bestMatch[matchType]: + bestpos.append(apos) + + # for padded reads, mapped read might have more mismatches! + if len(bestpos) == 0: + for matchType in [1, 2, 3]: + if len(matchDict[matchType]) > 0: + if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]: + bestpos = matchDict[matchType] + + break + if len(bestpos) == 0 and verbose: + print "couldn't pick best read from line: %s" % line + + for apos in bestpos: + if delimiter == ':': + try: + (extmodel, spliceID, regionStart, thepos) = apos.split(':') + except: + try: + (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':') + extmodel = extmodel1 + ':' + extmodel2 + except: + print 'warning: could not process splice %s' % apos + continue + else: + try: + (currentSplice, thepos) = apos.split(':') + except: + try: + (extmodel1, restSplice, thepos) = apos.split(':') + currentSplice = extmodel1 + ':' + restSplice + except: + print 'warning: could not process splice %s' % apos + continue + + (extmodel, spliceID, regionStart) = currentSplice.split(delimiter) + + modelfields = extmodel.split('/') + if len(modelfields) > 2: + model = string.join(modelfields[1:],'/') + else: + model = modelfields[1] + + if model not in geneDict: + print fields + continue + + (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model] + if extended: + if 'F' in thepos: + rsense = '+' + (start, matchPart) = thepos.split('F') + else: + rsense = '-' + (start, matchPart) = thepos.split('R') + + rstart = int(start) - 2 + if matchPart == readsizeString: + matchType = '' + elif matchPart[:2] == splicesizeString: + matchType = '' + else: + matchType = decodeMismatches(fields[1], matchPart) + else: + rstart = int(thepos[:-2]) - 2 + if thepos[-2] == 'F': + rsense = '+' + else: + rsense = '-' + + if trim <= rstart <= maxBorder: + pass + else: + print rstart + continue + + currentSplice 
= model + delimiter + spliceID + delimiter + regionStart + spliceID = int(spliceID) + lefthalf = maxBorder - rstart + if lefthalf < 1 or lefthalf > maxBorder: + continue + + righthalf = readsize - lefthalf + startL = int(regionStart) + rstart + stopL = startL + lefthalf + startR = chromstarts[spliceID + 1] + stopR = chromstarts[spliceID + 1] + righthalf + if paired: + readName = label + '-' + str(lineIndex) + '/' + pairID + else: + readName = model + '-' + str(thepos) + + insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType)) + index += 1 + if index % insertSize == 0: + rds.insertSplices(insertList) + print '.', + sys.stdout.flush() + insertList = [] + + if currentSplice not in seenSpliceList: + seenSpliceList.append(currentSplice) + + mapfile.close() + if len(insertList) > 0: + rds.insertSplices(insertList) + insertList = [] + + print + print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList)) + + infile = open(filename,'r') + print 'mapping multireads...' + lineIndex = 0 + origReadid = rds.getMultiCount() + try: + readid = int(origReadid) + 1 + except: + readid = 0 + origReadid = 0 + + print 'starting at %d' % (readid + 1) + + for line in infile: + lineIndex += 1 + if lineIndex > maxLines: + break + + fields = line.split() + if len(fields) < 4: + continue + + if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-': + continue + + (zero, one, two) = fields[2].split(':') + zero = int(zero) + one = int(one) + two = int(two) + + bestMatch = [False] * readsize + if zero > 1: + bestMatch[0] = True + elif zero == 0 and one > 1: + bestMatch[1] = True + elif zero == 0 and one == 0 and two > 1: + bestMatch[2] = True + else: + continue + + readcount = 0 + bestpos = [] + pos = fields[3].split(',') + matchDict = {0:[], 1:[], 2:[], 3:[]} + currentChr = '' + for apos in pos: + if ':' in apos: + try: + (front, back) = apos.split(':') + except: + if verbose: + print "problem splitting %s" % str(apos) + continue + + currentChr = front + else: + back = apos + apos = currentChr + ':' + apos + + if extended: + matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T') + else: + matchType = int(apos[-1]) + + try: + matchDict[matchType].append(apos) + except: + matchDict[matchType] = [apos] + + if bestMatch[matchType]: + bestpos.append(apos) + + # for padded reads, mapped read might have more mismatches! 
+ if len(bestpos) == 0: + for matchType in [1, 2, 3]: + if len(matchDict[matchType]) > 0: + if len(matchDict[matchType]) > 1: + noSplice = True + for arg in matchDict[matchType]: + if 'splice' in arg: + noSplice = False + + if noSplice: + bestpos = matchDict[matchType] + break + + if len(bestpos) == 0 and verbose: + print "couldn't pick best read from line: %s" % line + continue + + hasSplice = False + for apos in bestpos: + if 'splice' in apos: + hasSplice = True + + # do not allow multireads that can also map accross splices for now + if hasSplice: + if verbose: + print "throwing out multiread because of splice conflict" + continue + + if len(bestpos) > 0: + readid += 1 + + for apos in bestpos: + readcount += 1 + (front, back) = apos.split(':') + chrom = front[:-3] + if extended: + if 'F' in back: + sense = '+' + (start, matchPart) = back.split('F') + else: + sense = '-' + (start, matchPart) = back.split('R') + + start = int(start) + if matchPart == readsizeString: + matchType = '' + else: + matchType = decodeMismatches(fields[1], matchPart) + else: + start = int(back[:-2]) + if back[-2] == 'F': + sense = '+' + else: + sense = '-' + + stop = int(start) + readsize + readName = '%dx%d' % (readid, len(bestpos)) + if paired: + readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName + + insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType)) + if index % insertSize == 0: + rds.insertMulti(insertList) + insertList = [] + print '.', + sys.stdout.flush() + + index += 1 + + if len(insertList) > 0: + rds.insertMulti(insertList) + insertList = [] + + print + print '%d multireads' % (readid - origReadid) + + if doIndex: + print 'building index....' + rds.buildIndex(cachePages) + + +def getUniqueMatch(elandCode): + (zero, one, two) = elandCode.split(':') + zero = int(zero) + one = int(one) + two = int(two) + bestMatch = [False, False, False, False] + if zero == 1: + bestMatch[0] = True + matchType = 0 + elif zero == 0 and one == 1: + bestMatch[1] = True + matchType = 1 + elif zero == 0 and one == 0 and two == 1: + bestMatch[2] = True + matchType = 2 + else: + matchType = -1 + + return (matchType, bestMatch) + + +def decodeMismatches(origSeq, code): + output = [] + number = '0' + index = 0 + for pos in code: + if pos.isdigit(): + number += pos + else: + index += int(number) + 1 + origNT = origSeq[index - 1] + output.append('%s%d%s' % (origNT, index, pos)) + number = '0' + + return string.join(output, ',') + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makesitetrack.py b/makesitetrack.py new file mode 100755 index 0000000..c6d0b8e --- /dev/null +++ b/makesitetrack.py @@ -0,0 +1,99 @@ +# +# makesitetrack.py +# ENRAGE +# + +import sys, string, optparse + +print "%prog: version 2.1" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog sitefile outbedfile [--noheader] [--stype fieldID] [--color xx,yy,zz] [--append] [--exploded]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--noheader", action="store_true", dest="noHeader") + parser.add_option("--stype", type="int", dest="stypeID") + parser.add_option("--color", dest="color") + parser.add_option("--append", action="store_true", dest="append") + parser.add_option("--exploded", action="store_false", dest="compact") + parser.set_defaults(stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) 
+ + infile = args[0] + outfileName = args[1] + + makesitetrack(infile, outfileName, options.stypeID, options.color, options.append, options.compact, options.noHeader) + + +def makesitetrack(infileName, outFileName, stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False): + if stypeID is not None: + doStype = True + else: + doStype = False + stypeID = 4 + + infile = open(infileName) + + if append: + outfile = open(outFileName, "a") + else: + outfile = open(outFileName, "w") + + try: + (name, extension) = outFileName.split(".") + except ValueError: + name = outFileName.split(".")[:-1] + name = string.join(name, "_") + + if not noHeader: + outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % name) + + count = 1 + for line in infile: + if line[0] == "#": + continue + + fields = line.split() + if compact: + (chrom, loc) = fields[0].split(":") + (start, stop) = loc.split("-") + score = fields[1] + else: + chrom = fields[1] + start = fields[2] + stop = fields[3] + score = 1. + + stype = "%s-%s" % (name, str(count)) + if doStype: + try: + stype = fields[stypeID] + if stype == "11": + stype = "can" + elif stype == "0": + stype = "half" + else: + stype = "NC" + stype + except IndexError: + pass + + sense = fields[-2].strip() + if sense not in ["+", "-"]: + sense = "+" + + outfile.write("%s\t%s\t%d\t%s\t%s\t%s\t-\t-\t%s\n" % (chrom, start, int(stop) + 1, stype, score, sense, color)) + count += 1 + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/makewiggle.py b/makewiggle.py new file mode 100755 index 0000000..95b0634 --- /dev/null +++ b/makewiggle.py @@ -0,0 +1,209 @@ +# +# makewiggle.py +# ENRAGE +# +import sys, optparse +from commoncode import readDataset + +print "%prog: version 6.7" + +try: + import psyco + psyco.full() +except: + print 'psyco not running' + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s name rdsfile outfilename [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--raw", action="store_false", dest="doNormalize") + parser.add_option("--color", dest="color") + parser.add_option("--altcolor", dest="altColor") + parser.add_option("--chrom", dest="limitChrom") + parser.add_option("--shift", type="int", dest="shift") + parser.add_option("--split", action="store_true", dest="doSplit") + parser.add_option("--listfile", dest="listfilename") + parser.add_option("--listprefix", dest="listPrefix") + parser.add_option("--group", dest="group") + parser.add_option("--startPriority", type="float", dest="startPriority") + parser.add_option("--skiprandom", action="store_true", dest="skipRandom") + parser.add_option("--nomulti", action="store_false", dest="withMulti") + parser.add_option("--splices", action="store_true", dest="withSplices") + parser.add_option("--singlebase", action="store_true", dest="doSingle") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--enforceChr", action="store_true", dest="enforceChr") + parser.add_option("--stranded", dest="strand") + parser.add_option("--maxchunk", type="int", dest="chunk") + parser.set_defaults(doNormalize=True, color=None, altColor="", limitChrom=None, + shift=0, doSplit=False, listfilename=None, listPrefix="", + group="", startPriority=0.01, skipRandom=False, withMulti=True, + withSplices=False, doSingle=False, cachePages=-1, enforceChr=False, + strand=None, chunk=20) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + 
sys.exit(1)
+
+    name = args[0]
+    hitfilename = args[1]
+    outfilename = args[2]
+
+    makewiggle(name, hitfilename, outfilename, options.doNormalize, options.color, options.altColor,
+               options.limitChrom, options.shift, options.doSplit, options.listfilename, options.listPrefix,
+               options.group, options.startPriority, options.skipRandom, options.withMulti,
+               options.withSplices, options.doSingle, options.cachePages, options.enforceChr, options.strand,
+               options.chunk)
+
+
+def makewiggle(name, hitfilename, outfilename, doNormalize=True, color=None, altColor="",
+               limitChrom=None, shift=0, doSplit=False, listfilename=None, listPrefix="",
+               group="", startPriority=0.01, skipRandom=False, withMulti=True, withSplices=False,
+               doSingle=False, cachePages=-1, enforceChr=False, strand=None, chunk=20):
+
+    priorityIncrement = 0.01
+    wigType = "bedGraph"
+
+    if color is not None:
+        colorString = " color=%s" % color
+    else:
+        colorString = ""
+
+    if altColor:
+        colorString += " altcolor=%s" % altColor
+
+    doList = False
+    if listfilename is not None:
+        doList = True
+
+    chromLimit = False
+    if limitChrom is not None:
+        chromLimit = True
+
+    groupName = ""
+    if group:
+        groupName = "group=%s" % group
+
+    doCache = False
+    if cachePages > 0:
+        doCache = True
+
+    maxSpan = chunk * 1000000
+
+    isStranded = False
+    strandedDirection = "both"
+    if strand is not None:
+        isStranded = True
+        if strand == "plus":
+            strandedDirection = "plusOnly"
+        elif strand == "minus":
+            strandedDirection = "minusOnly"
+
+    print "will keep track of %s strand(s)" % strandedDirection
+
+    if shift:
+        print "Will shift reads by +/- %d bp according to their sense" % shift
+        name += "shift=%d" % shift
+
+    hitRDS = readDataset(hitfilename, verbose=True, cache=doCache)
+
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    readlen = hitRDS.getReadSize()
+
+    if doNormalize:
+        normalizeBy = len(hitRDS) / 1000000.
+    else:
+        normalizeBy = 1.
+ + if doList: + listfile = open(listfilename, "w") + + priority = startPriority + if not doSplit: + outfile = open(outfilename, "w") + if doList: + listfile.write("%s%s\n" % (listPrefix, outfilename)) + + outfile.write('track type=%s name="%s" %s priority=%.3f visibility=full%s\n' % (wigType, name, groupName, priority, colorString)) + + chromList = hitRDS.getChromosomes() + chromList.sort() + for achrom in chromList: + if enforceChr and ("chr" not in achrom): + continue + + if chromLimit and achrom != limitChrom: + continue + + if skipRandom and "random" in achrom: + continue + + if doSplit: + outfile = open("%s.%s" % (outfilename, achrom), "w") + if doList: + listfile.write("%s%s.%s\n" % (listPrefix, outfilename, achrom)) + + outfile.write('track type=%s name="%s %s" %s priority=%.3f visibility=full%s\n' % (wigType, name, achrom, groupName, priority, colorString)) + priority += priorityIncrement + + lastNT = hitRDS.getMaxCoordinate(achrom, doMulti=withMulti, doSplices=withSplices) + readlen + spanStart = 0 + + previousVal = 0 + previousStart = 1 + lineIndex = 0 + for spanStop in xrange(maxSpan, lastNT+maxSpan, maxSpan): + if spanStop > lastNT: + spanStop = lastNT + + print achrom, spanStart, spanStop + chromModel = hitRDS.getChromProfile(achrom, spanStart, spanStop, withMulti, withSplices, normalizeBy, isStranded, strandedDirection, shiftValue=shift) + + for index in xrange(len(chromModel)): + currentVal = chromModel[index] + if doSingle: + outline = "%s %d %.4f\n" % (achrom, spanStart + index, currentVal) + outfile.write(outline) + continue + + if currentVal == previousVal: + continue + + if currentVal != previousVal: + if previousVal != 0: + lastpos = index + spanStart + outline = "%s %d %d %.4f\n" % (achrom, previousStart, lastpos, previousVal) + outfile.write(outline) + lineIndex += 1 + + previousVal = currentVal + previousStart = index + spanStart + + currentVal = 0 + del chromModel + spanStart = spanStop + 1 + + if doSplit: + outfile.close() + + if doSingle: + print index + 1 + else: + print lineIndex + + if not doSplit: + outfile.close() + + if doList: + listfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py new file mode 100644 index 0000000..4d174bf --- /dev/null +++ b/normalizeExpandedExonic.py @@ -0,0 +1,225 @@ +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset +from cistematic.genomes import Genome +from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB + +print "%prog: version 5.6" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--gidField", type="int", dest="fieldID") + parser.add_option("--maxLength", type="float", dest="maxLength") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--models", dest="extendGenome") + parser.add_option("--replacemodels", action="store_true", dest="replaceModels") + parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="", + replaceModels=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(sys.argv) < 6: + print usage + print "\twhere splicecountfile can be set to 'none' to not count splices\n" + sys.exit(1) + + genome = args[0] + hitfile = 
args[1] + uniquecountfile = args[2] + splicecountfile = args[3] + outfile = args[4] + + candidateLines = [] + acceptedfilename = "" + if len(args) > 5: + try: + candidatefile = open(args[5]) + candidateLines = candidatefile.readlines() + candidatefile.close() + acceptedfilename = args[6] + except IndexError: + pass + + normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile, + candidateLines, acceptedfilename, options.fieldID, + options.maxLength, options.doCache, options.extendGenome, + options.replaceModels) + + +def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename, + outfilename, candidateLines=[], acceptedfilename="", + fieldID=0, maxLength=1000000000., doCache=False, + extendGenome="", replaceModels=False): + + uniquecountfile = open(uniquecountfilename) + + if acceptedfilename: + acceptedfile = open(acceptedfilename, "w") + + dosplicecount = False + if splicecountfilename != "none": + dosplicecount = True + splicecountfile = open(splicecountfilename) + + if extendGenome: + if replaceModels: + print "will replace gene models with %s" % extendGenome + else: + print "will extend gene models with %s" % extendGenome + + if doCache: + cacheGeneDB(genome) + hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True) + print "%s cached" % genome + else: + hg = Genome(genome, inRAM=True) + + if extendGenome != "": + hg.extendFeatures(extendGenome, replace=replaceModels) + + RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False) + uniqcount = RDS.getUniqsCount() + print "%d unique reads" % uniqcount + + splicecount = 0 + countDict = {} + gidList = [] + farList = [] + candidateDict = {} + + gidToGeneDict = {} + + featuresDict = hg.getallGeneFeatures() + print "got featuresDict" + + outfile = open(outfilename, "w") + + for line in uniquecountfile: + fields = line.strip().split() + gid = fields[fieldID] + gene = fields[1] + countDict[gid] = float(fields[-1]) + gidList.append(gid) + gidToGeneDict[gid] = gene + + uniquecountfile.close() + + if dosplicecount: + for line in splicecountfile: + fields = line.strip().split() + gid = fields[fieldID] + try: + countDict[gid] += float(fields[-1]) + except: + print fields + continue + + splicecount += float(fields[-1]) + + splicecountfile.close() + + for line in candidateLines: + if "#" in line: + continue + + fields = line.strip().split() + gid = fields[1] + gene = fields[0] + if gid not in gidList: + if gid not in farList: + farList.append(gid) + gidToGeneDict[gid] = gene + + if gid not in countDict: + countDict[gid] = 0 + + countDict[gid] += float(fields[6]) + + if gid not in candidateDict: + candidateDict[gid] = [] + + candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])) + + totalCount = (uniqcount + splicecount) / 1000000. + uniqScale = uniqcount / 1000000. + for gid in gidList: + gene = gidToGeneDict[gid] + featureList = [] + try: + featureList = featuresDict[gid] + except: + try: + featureList = featuresDict[gene] + except: + print gene, gid + + newfeatureList = [] + geneLength = 0. + for (ftype, chrom, start, stop, sense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + geneLength += (abs(start - stop) + 1.) / 1000. 
+
+        if geneLength < 0.1:
+            geneLength = 0.1
+        elif geneLength > maxLength:
+            geneLength = maxLength
+
+        rpm = countDict[gid] / totalCount
+        rpkm = rpm / geneLength
+        if gid in candidateDict:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                cratio = cCount / (cLength / 1000.)
+                cratio = (uniqScale * cratio) / totalCount
+                if 10. * cratio < rpkm:
+                    continue
+
+                countDict[gid] += cCount
+                geneLength += cLength / 1000.
+                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+
+        rpm = countDict[gid] / totalCount
+        rpkm = rpm / geneLength
+        outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
+
+    for gid in farList:
+        gene = gidToGeneDict[gid]
+        geneLength = 0
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        if geneLength < 0.1:
+            continue
+
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            cratio = cCount / (cLength / 1000.)
+            cratio = cratio / totalCount
+            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+
+        rpm = countDict[gid] / totalCount
+        rpkm = rpm / geneLength
+        outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
+
+    outfile.close()
+    try:
+        acceptedfile.close()
+    except:
+        pass
+
+    if doCache:
+        uncacheGeneDB(genome)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/normalizeFinalExonic.py b/normalizeFinalExonic.py
new file mode 100755
index 0000000..6053e80
--- /dev/null
+++ b/normalizeFinalExonic.py
@@ -0,0 +1,161 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset
+
+print "%s: version 3.5" % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--multifraction", action="store_true", dest="reportFraction")
+    parser.add_option("--multifold", action="store_true", dest="reportFold")
+    parser.add_option("--minrpkm", type="float", dest="minThreshold")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--withGID", action="store_true", dest="writeGID")
+    parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
+                        doCache=False, writeGID=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    rdsfilename = args[0]
+    expandedRPKMfile = args[1]
+    multicountfile = args[2]
+    outfilename = args[3]
+
+    normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
+                         options.reportFraction, options.reportFold, options.minThreshold,
+                         options.doCache, options.writeGID)
+
+
+def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
+                         reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
+                         writeGID=False):
+
+    expandedRPKMfile = open(expandedRPKMfilename)
+    multicountfile = open(multicountfilename)
+
+    if reportFraction:
+        print "reporting fractional contribution of multireads"
+        reportFold = False
+    elif reportFold:
+        print "reporting fold contribution of multireads"
+
+    RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
+    uniqcount = RDS.getUniqsCount()
+    splicecount = RDS.getSplicesCount()
+    multicount = 
RDS.getMultiCount() + countDict = {} + multicountDict = {} + lengthDict = {} + gidList = [] + + uniqspliceCount = (uniqcount + splicecount) / 1000000. + totalCount = (uniqcount + splicecount + multicount) / 1000000. + + symbolDict = {} + + for line in expandedRPKMfile: + fields = line.strip().split() + lineGID = fields[0] + symbolDict[lineGID] = fields[1] + countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount + lengthDict[lineGID] = float(fields[-2]) + multicountDict[lineGID] = 0 + if lineGID not in gidList: + gidList.append(lineGID) + + expandedRPKMfile.close() + + for line in multicountfile: + fields = line.strip().split() + gid = fields[0] + if gid in countDict: + countDict[gid] += float(fields[-1]) + multicountDict[gid] = float(fields[-1]) + else: + print "could not find gid %s in dictionaries" % gid + + multicountfile.close() + + outfile = open(outfilename, "w") + outheader = "#" + if writeGID: + outheader += "GID\t" + + outheader += "gene\tlen_kb\tRPKM" + if reportFraction: + outheader += "\tmulti/all" + elif reportFold: + outheader += "\tall/uniq" + + outheader += "\n" + outfile.write(outheader) + + outlineList = [] + index = 0 + for gid in gidList: + outline = "" + gene = symbolDict[gid] + rpm = countDict[gid] / totalCount + rpkm = rpm / lengthDict[gid] + if rpkm < minThreshold: + continue + + if writeGID: + outline = "%s\t" % gid + + index += 1 + try: + multirpm = multicountDict[gid] / totalCount + multirpkm = multirpm / lengthDict[gid] + except: + print "problem with %s - skipping " % gid + continue + + if reportFraction or reportFold: + try: + if reportFraction: + multivalue = multirpkm / rpkm + else: + if rpm > multirpm: + uniqrpkm = (rpm - multirpm) / lengthDict[gid] + multivalue = rpkm / uniqrpkm + elif rpkm > 0.01: + multivalue = 100. + else: + multivalue = 1.0 + except: + multivalue = 0 + + outline += "%s\t%.3f\t%.2f\t%.2f\n" % (gene, lengthDict[gid], rpkm, multivalue) + outlineList.append((rpkm, outline)) + else: + outline += "%s\t%.3f\t%.2f\n" % (gene, lengthDict[gid], rpkm) + outlineList.append((rpkm, outline)) + + outlineList.sort() + outlineList.reverse() + + for (rpkm, line) in outlineList: + outfile.write(line) + + outfile.close() + + print "returned %d genes" % index + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/partition.py b/partition.py new file mode 100755 index 0000000..89148fd --- /dev/null +++ b/partition.py @@ -0,0 +1,129 @@ +# +# partition.py +# ENRAGE +# +""" usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [--minFeature bp] [--padregion bp] [--mergeregion bp] [--nomerge] [--log altlogfile] [--locid] [--ignorerandom] [--chromField fieldNum] + where the regionfiles must be comma-separated with no white space + -minFeature controls the size of the smallest partition +""" + +try: + import psyco + psyco.full() +except: + pass + +import sys, string, optparse +from commoncode import getMergedRegions, writeLog + +versionString = '%s: version 2.0' % sys.argv[0] +print versionString + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s mergeID regionfile1[,regionfile2,...] 
combpartitionfile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--minFeature", type="int", dest="minFeature", + help="size of smallest partition") + parser.add_option("--chromField", type="int", dest="cField", + help="num chromosome fields") + parser.add_option("--padregion", type="int", dest="padregion", + help="padding on each side of region") + parser.add_option("--mergeregion", type="int", dest="mergeregion", + help="bp threshold to merge regions") + parser.add_option("--nomerge", action="store_false", dest="merging", + help="do not merge regions") + parser.add_option("--log", dest="logfilename", + help="log file") + parser.add_option("--locID", action="store_true", dest="locID", + help="use location as region ID") + parser.add_option("--norandom", action="store_true", dest="ignoreRandom", + help="ignore 'random' chromosomes") + parser.set_defaults(minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log") + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + mergeID = args[0] + regionfiles = args[1] + outfilename = args[2] + + if options.padregion: + print "padding %d bp on each side of a region" % options.padregion + + if options.mergeregion: + print "merging regions closer than %d bp" % options.mergeregion + + if options.locID: + print "using locations as region ID" + + if options.ignoreRandom: + print "ignoring 'random' chromosomes" + + partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField, options.padregion, options.locID, options.ignoreRandom, options.mergeregion, options.merging, options.logfilename) + + +def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log"): + + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + + allregionsDict = {} + regionFileList = regionfiles.split(',') + numRegions = len(regionFileList) + chromList = [] + for regionID in range(numRegions): + allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion, minHits=-1, fullChrom = True, verbose = True, chromField = cField, doMerge=merging, pad=padregion) + for achrom in allregionsDict[regionID]: + if achrom not in chromList: + chromList.append(achrom) + + outregionDict = {} + + chromList = sorted(chromList) + + for chrom in chromList: + if ignoreRandom and 'random' in chrom: + continue + + outregionDict[chrom] = [] + pointList = [] + for regionID in range(numRegions): + if chrom in allregionsDict[regionID]: + for (rstart, rstop, rlength) in allregionsDict[regionID][chrom]: + pointList.append(rstart) + pointList.append(rstop) + + pointList.sort() + start = 0 + for point in pointList: + if (point - start) > minFeature: + outregionDict[chrom].append((start, point - 1, point - 1 - start)) + start = point + + outfile = open(outfilename, 'w') + if locID: + outfile.write('#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n') + else: + outfile.write('#labelID\tchrom\tstart\tstop\tlength_kb\n') + + index = 0 + for chrom in outregionDict: + for (start, stop, length) in outregionDict[chrom]: + index += 1 + if locID: + outfile.write("%s:%d-%d\t%s\t%d\t%d\t%.3f\n" % (chrom, start, stop, chrom, start, stop, length/1000.)) + else: + outfile.write("%s%d\t%s\t%d\t%d\t%.3f\n" % (mergeID, index, chrom, start, stop, length/1000.)) + + message = "%s was partitioned into %d regions" % 
(mergeID, index) + print message + writeLog(logfilename, versionString, message) + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/peakstoregion.py b/peakstoregion.py new file mode 100755 index 0000000..78000f5 --- /dev/null +++ b/peakstoregion.py @@ -0,0 +1,71 @@ +# +# peakstoregion.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys + +print "%s: version 1.0" % sys.argv[0] + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 3: + print "usage: python %s peakfile outfile [radius] [chromField] [posField] [labelField] [datafield]" % sys.argv[0] + sys.exit(1) + + peakfile = argv[1] + outfile = argv[2] + + radius = 500 + chromField = 2 + posField = 3 + labelField = 1 + dataField = -1 + + if len(argv) > 3: + radius = int(argv[3]) + + if len(argv) > 4: + chromField = int(argv[4]) + + if len(argv) > 5: + posField = int(argv[5]) + + if len(argv) > 6: + labelField = int(argv[6]) + + if len(argv) > 7: + dataField = int(argv[7]) + + peakstoregion(peakfile, outfile, radius, chromField, posField, labelField, dataField) + + +def peakstoregion(peakfilename, outfilename, radius=500, chromField=2, posField=3, labelField=1, dataField=-1): + peakfile = open(peakfilename) + outfile = open(outfilename, "w") + + for line in peakfile: + fields = line.strip().split() + label = "REGION" + try: + label = fields[labelField] + except IndexError: + pass + + start = int(fields[posField]) - radius + stop = int(fields[posField]) + radius + outfile.write("%s\t%s\t%d\t%d\t%s\n" % (label, fields[chromField], start, stop, fields[dataField])) + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/plotbardist.py b/plotbardist.py new file mode 100755 index 0000000..52ccbe2 --- /dev/null +++ b/plotbardist.py @@ -0,0 +1,183 @@ +# +# plotbardist.py +# ENRAGE +# +# Created by Ali Mortazavi on 12/13/07. 
+ +try: + import psyco + psyco.full() +except: + pass + +import sys +import optparse +import matplotlib +from pylab import * +from math import * + + +print "%prog: version 3.2" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog infile1 [infile2] [infile3] [options] outfile.png" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--bins", type="int", dest="bins") + parser.add_option("--field", type="int", dest="binnedField") + parser.add_option("--binSize", type="float", dest="binLength") + parser.add_option("--doLog", type="int", dest="logBase") + parser.add_option("--ymax", type="int", dest="maxY") + parser.add_option("--xlabel", dest="xLabel") + parser.add_option("--ylabel", dest="yLabel") + parser.add_option("--binLabels", dest="binLabels", help="comma separated list") + parser.add_option("--title", dest="figTitle") + parser.add_option("--legend", dest="barsLegend", help="comma separated list") + parser.add_option("--xoffset", type="float", dest="pointOffset") + parser.add_option("--figsize", dest="figSizes", help="x,y pair") + parser.set_defaults(bins=10, binnedField=-1, binLength=-1, logBase=None, maxY=0, + xLabel="bins", yLabel="count", binLabels=None, figTitle="", + barsLegend=None, pointOffset=0., figSizes=None) + + (options, args) = parser.parse_args(argv[1:]) + + + if len(args) < 2 or len(args) > 4: + print usage + print "where labelList and legendList are comma delimited strings of the form 'labelA,labelB,...,labelN'" + sys.exit(1) + + fileList = args[:-1] + pngfilename = args[-1] + + plotbardist(fileList, pngfilename, options.bins, options.binnedField, options.binLength, + options.logBase, options.maxY, options.xLabel, options.yLabel, options.binLabels, + options.figTitle, options.barsLegend, options.pointOffset, options.figSizes) + + +def plotbardist(fileList, pngfilename, bins=10, binnedField=-1, binLength=-1, logBase=None, + maxY=0, xLabel="bins", yLabel="count", binLabels=None, figTitle="", + barsLegend=None, pointOffset=0., figSizes=None): + + matplotlib.use("Agg") + plotParameters = {1: {"width": 0.5, + "offset": [-0.25]}, + 2: {"width": 0.3, + "offset": [-0.3, 0]}, + 3: {"width": 0.2, + "offset": [-0.2, 0., 0.2]} + } + + colorList = ["b", "r", "c"] + width = plotParameters[len(fileList)]["width"] + offset = plotParameters[len(fileList)]["offset"] + + doLog = False + if logBase is not None: + doLog = True + print "taking log%d of x datapoints" % logBase + xLabel = "log%d(%s)" % (logBase, xLabel) + else: + logBase = 10 + + if figSizes is not None: + sizes = figSizes.strip().split(",") + figure(figsize=(float(sizes[0]),float(sizes[1]))) + + doLabels = False + if binLabels is not None: + binLabels = binLabels.strip().split(",") + doLabels = True + else: + binLabels = [] + + if barsLegend is not None: + barsLegend = barsLegend.strip().split(",") + else: + barsLegend = [] + + ind2 = arange(bins) + + bars = [] + barsColors = [] + index = 0 + for fileName in fileList: + aFile = open(fileName) + distbin = bins * [0] + + dataList = [] + for line in aFile: + fields = line.strip().split() + try: + point = float(fields[binnedField]) + pointOffset + if doLog: + if point < 1: + point = 1 + + point = log(point, logBase) + + dataList.append(point) + except: + continue + + print "%d data points" % len(dataList) + + dataList.sort() + print "low = %f high = %f" % (dataList[0], dataList[-1]) + + if binLength < 0: + binLength = abs(dataList[-1] - dataList[0]) / bins + + for point in dataList: + try: + 
distbin[int(round(point/binLength))] += 1 + except: + distbin[-1] += 1 + + print binLength, int(round(point/binLength)) + + bars.append(bar(ind2 + offset[index], distbin, width, color=colorList[index])) + barsColors.append(bars[-1][0]) + + print distbin + halfCount = sum(distbin) / 2 + median = 0 + foundMedian = False + while not foundMedian: + if sum(distbin[:median]) < halfCount: + median += 1 + else: + foundMedian = True + + print median + index += 1 + + xlim(-1 * width - 0.2, bins + 0.2) + + if len(barsLegend) > 0: + legend(barsColors, barsLegend) + + ylabel(yLabel) + xlabel(xLabel) + + if doLabels: + setp(gca(), "xticklabels", binLabels) + + if maxY > 0: + ylim(0, maxY) + + if len(figTitle) > 0: + title(figTitle) + + gca().get_xaxis().tick_bottom() + gca().get_yaxis().tick_left() + + savefig(pngfilename) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/plotnomogram.py b/plotnomogram.py new file mode 100755 index 0000000..238a4da --- /dev/null +++ b/plotnomogram.py @@ -0,0 +1,126 @@ +# +# plotnomogram.py +# ENRAGE +# + +import sys + +import matplotlib +from pylab import * +import matplotlib.axes + +try: + import psyco + psyco.full() +except: + pass + +print "%s: version 1.1" % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 5: + print "usage: python %s maxdev xreads infile outpng" % argv[0] + sys.exit(1) + + maxdev = float(argv[1]) + xreads = float(argv[2]) + infilename = argv[3] + outfilename = argv[4] + + plotnomogram(maxdev, xreads, infilename, outfilename) + + +def plotnomogram(maxdev, xreads, infilename, outfilename): + matplotlib.use("Agg") + infile = open(infilename) + line = infile.readline().strip() + + percentages = line.split() + del percentages[0] + + listWidth = len(percentages) + + geneValues = {} + + for line in infile: + fields = line.strip().split() + geneValues[fields[0]] = [] + for pos in range(listWidth): + geneValues[fields[0]].append(float(fields[1 + pos])) + + # categories here are: 3000+, 2999-300, 299-30, 29-3 + genes3000p = [] + genes300p = [] + genes30p = [] + genes3p = [] + + for gene in geneValues: + finalLevel = geneValues[gene][0] + if finalLevel >= 3000: + genes3000p.append(gene) + elif finalLevel >= 300: + genes300p.append(gene) + elif finalLevel >= 30: + genes30p.append(gene) + elif finalLevel >= 3: + genes3p.append(gene) + + organizedList = [genes3000p, genes300p, genes30p, genes3p] + listNames = ["3000+ RPKM ", "300-2999 RPKM", "30-299 RPKM ", "3-29 RPKM "] + listColors = ["k", "c", "m", "r"] + geneCounts = {} + oldscores = [0.] + newscores = {} + for name in listNames: + newscores[name] = [0.] + + index = 0 + for percent in percentages[1:]: + oldscores.append(xreads * float(percent) / 100.) + index += 1 + listindex = 0 + for geneList in organizedList: + geneCount = len(geneList) + numOver = 0. + for gene in geneList: + finalVal = geneValues[gene][0] + currentVal = geneValues[gene][index] + if abs((currentVal - finalVal) / finalVal) > maxdev: + numOver += 1. + + fraction = 1. 
- numOver / geneCount + print "%s %s %d %.2f" % (percent, listNames[listindex], geneCount, fraction) + newscores[listNames[listindex]].append(fraction) + geneCounts[listNames[listindex]] = geneCount + listindex += 1 + + matplotlib.axes._process_plot_var_args.defaultColors = ["k", "y", "m", "c", "b", "g", "r"] + + oldscores.append(xreads) + index = 0 + plots = [] + plotsColors = [] + plotsLegend = [] + for name in listNames: + newscores[name].append(1.0) + plots.append(plot(oldscores, newscores[name], listColors[index], linewidth=2)) + plot(oldscores[1:-1], newscores[name][1:-1], listColors[index] + "^") + plotsColors.append(plots[-1][0]) + plotsLegend.append("%s n = %d" % (name, geneCounts[name])) + index += 1 + + legend(plotsColors, plotsLegend, loc=0) + xticks(oldscores) + locs, labels = xticks() + setp(labels, rotation="vertical") + ylim(0, 1.03) + xlim(-0.1, xreads + .1) + savefig(outfilename) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/plotprofile.py b/plotprofile.py new file mode 100755 index 0000000..854affa --- /dev/null +++ b/plotprofile.py @@ -0,0 +1,129 @@ +# +# plotprofile.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys +import optparse +from pylab import * +from math import * +import matplotlib + + +print "%prog: version 2.2" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s infile outfile.png [--scale] [--max weightMax] [--ymin bottom] [--ymax top] [--subtractEvens]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--scale", action="store_true", dest="doScale") + parser.add_option("--max", type="float", dest="weightMax") + parser.add_option("--ymin", type="float", dest="ymin") + parser.add_option("--ymax", type="float", dest="ymax") + parser.add_option("--subtractEvens", action="store_true", dest="subtractEvens") + parser.set_defaults(doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) + + infile = args[0] + pngfilename = args[1] + + plotprofile(infile, pngfilename, options.doScale, options.weightMax, options.ymin, options.ymax, options.subtractEvens) + + +def plotprofile(inFileName, pngfilename, doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False): + infile = open(inFileName) + limitYscale = False + if ymax is not None: + limitYscale = True + else: + ymax = 0. + + if ymin is not None: + limitYscale = True + else: + ymin = 0. 
+ + matplotlib.use("Agg") + + labelList = [] + dataList = [] + plotList = [] + xmin = 10**20 + xmax = -10**20 + + xcoordList = [] + datapointList = [] + weightList = [] + line = infile.readline() + fields = line.strip().split() + for data in fields[1:-1]: + datapoint = float(data) + if datapoint < xmin: + xmin = datapoint + + if datapoint > xmax: + xmax = datapoint + + xcoordList.append(datapoint) + + index = 1 + for line in infile: + fields = line.strip().split() + datapointList = [] + for data in fields[1:-1]: + datapointList.append(float(data)) + + if subtractEvens and index % 2 == 0: + for dataIndex in range(len(datapointList)): + dataList[-1][dataIndex] -= datapointList[dataIndex] + else: + dataList.append(datapointList) + + weight = float(fields[-1]) + if subtractEvens and index % 2 == 0: + pass + else: + labelList.append(fields[0]) + if weight > weightMax: + weightMax = weight + + weightList.append(weight) + + index += 1 + + for index in range(len(dataList)): + newList = [] + if doScale: + scale = weightList[index] / weightMax + print weightList[index], weightMax, scale + for val in dataList[index]: + newList.append(val * scale) + else: + newList = dataList[index] + + plotList.append(plot(xcoordList, newList, linewidth=3.0)) + + xticks(xcoordList, rotation="vertical") + xlim(xmin - 0.1, xmax + 0.1) + if limitYscale: + ylim(ymin, ymax) + + legend(plotList, labelList) + savefig(pngfilename) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/predictSpliceCount.py b/predictSpliceCount.py new file mode 100755 index 0000000..bab85db --- /dev/null +++ b/predictSpliceCount.py @@ -0,0 +1,84 @@ +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +import sys +from cistematic.genomes import Genome + + +def main(argv=None): + if not argv: + argv = sys.argv + + print '%s: version 1.1' % argv[0] + + if len(argv) < 6: + print 'usage: python %s genome maxBorder uniquecountfile splicecountfile outfile' % argv[0] + sys.exit(1) + + genome = argv[1] + # number of nucleotides at the end of each exon that is affected by splicing + splicelead = int(argv[2]) + uniquefilecount = argv[3] + splicefilecount = argv[4] + outfilename = argv[5] + + predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename) + + +def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename): + hg = Genome(genome) + + gidDict = {} + gidList = [] + uniqueCountDict = {} + spliceCountDict = {} + + uniquefile = open(uniquefilecount) + for line in uniquefile: + fields = line.strip().split() + gidDict[fields[0]] = fields[1] + gidList.append(fields[0]) + uniqueCountDict[fields[0]] = int(fields[2]) + + splicefile = open(splicefilecount) + for line in splicefile: + fields = line.strip().split() + spliceCountDict[fields[0]] = int(fields[2]) + + outfile = open(outfilename,'w') + + gidList.sort() + for gid in gidList: + symbol = gidDict[gid] + featureList = hg.getGeneFeatures((genome, gid)) + newfeatureList = [] + featuresizesum = 0 + for (ftype, chrom, start, stop, sense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + featuresizesum += stop - start + 1 + + if featuresizesum < 1: + featuresizesum = 1 + + splicearea = (len(newfeatureList) - 1) * splicelead + if splicearea < splicelead: + splicearea = 0 + + fractionCoverage = featuresizesum / float(splicearea + featuresizesum) + expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid] + + # this 
p-value is based on the observed unique count, not the expected total count + # nor the multi-read adjusted count + pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid]) + print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]) + outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])) + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/profilebins.py b/profilebins.py new file mode 100755 index 0000000..46274f5 --- /dev/null +++ b/profilebins.py @@ -0,0 +1,154 @@ +# +# profilebins.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +print "%prog: version 2.2" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog label infile1 [--upstream infile2] [--downstream infile3] [--uplength kb] [--downlength kb] [--gene geneName] [--genes genefile] [--append] outfile" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--upstream", dest="upfilename") + parser.add_option("--downstream", dest="downfilename") + parser.add_option("--uplength", type="float", dest="uplength") + parser.add_option("--downlength", type="int", dest="") + parser.add_option("--gene", dest="gene") + parser.add_option("--genes", dest="genefile") + parser.add_option("--append", action="store_true", dest="doAppend") + parser.set_defaults(upfilename=None, downfilename=None, uplength=0.0, downlength=0.0, + gene=None, genefile=None, doAppend=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + label = args[0] + infilename = args[1] + outfilename = args[2] + + profilebins(label, infilename, outfilename, options.upfilename, options.downfilename, + options.uplength, options.downlength, options.gene, options.genefile, + options.doAppend) + + +def profilebins(label, infilename, outfilename, upfilename=None, downfilename=None, + uplength=0.0, downlength=0.0, gene=None, genefile=None, doAppend=False): + + fileList = [infilename] + geneList = [] + restrictGenes = False + if gene is not None: + geneList.append(gene) + restrictGenes = True + + if genefile is not None: + for line in genefile: + fields = line.strip().split() + if len(fields) > 1: + geneList.append(fields[0]) + else: + geneList.append(line.strip()) + + restrictGenes = True + + if upfilename is not None: + fileList = [upfilename, infilename] + + if downfilename is not None: + fileList.append(downfilename) + + partLength = [10.] + partOffset = [0.] + + if uplength: + partLength = [uplength, 10.] + partOffset = [-1. * uplength, 0.] + + if downlength: + partLength.append(downlength) + partOffset.append(10.) + + totalWeight = 0. + totalBins = [] + for afile in fileList: + infile = open(afile) + + line = infile.readline() + fields = line.strip().split() + numBins = len(fields) - 4 + + geneName = fields[1] + weight = float(fields[2]) + if restrictGenes and geneName in geneList: + totalWeight += weight + + totalBins.append([]) + for myBin in fields[4:]: + if not restrictGenes or (restrictGenes and geneName in geneList): + totalBins[-1].append(weight * float(myBin)) + else: + totalBins[-1].append(0.) 
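+        # accumulate weight-scaled bin values for the remaining rows of this file;
+        # rows whose gene is not in geneList are skipped when --gene/--genes restricts the set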
+ + for line in infile: + fields = line.strip().split() + geneName = fields[1] + if restrictGenes and geneName not in geneList: + continue + + weight = float(fields[2]) + index = 0 + for myBin in fields[4:]: + totalBins[-1][index] += weight * float(myBin) + index += 1 + + totalWeight += weight + + sumWeight = 0. + totalPercent = 0. + if doAppend: + outfile = open(outfilename, "a") + else: + outfile = open(outfilename, "w") + outfile.write("x-axis") + partIndex = 0 + for partBins in totalBins: + partLen = partLength[partIndex] + numBins = len(partBins) + for binIndex in range(numBins): + outfile.write("\t%.2f" % (partOffset[partIndex] + (binIndex * partLen/numBins))) + + partIndex += 1 + + outfile.write("\tweight\n") + + outfile.write(label) + for partBins in totalBins: + for aBin in partBins: + percent = aBin / totalWeight + outfile.write("\t%.1f" % percent) + sumWeight += aBin + totalPercent += percent + + outfile.write("\t%.1f\n" % totalWeight) + outfile.close() + + print sumWeight + print totalPercent + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/ratio.py b/ratio.py new file mode 100755 index 0000000..ae14cfe --- /dev/null +++ b/ratio.py @@ -0,0 +1,83 @@ +import sys +import string +import optparse +import math + +print "%prog: version 2.3" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog denominatorField infile [--only fieldID] [--out outfile]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--only", type="int", dest="onlyField") + parser.add_option("--out", dest="outFileName") + parser.set_defaults(outFileName=None, onlyField=-1) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) + + field = int(args[0]) + if args[1] == "-": + inFileName = sys.stdin + else: + inFileName = args[1] + + ratio(field, inFileName, options.outFileName, options.onlyField) + +def ratio(field, inFileName, outFileName=None, onlyField=-1): + + if inFileName is not None: + infile = open(inFileName) + else: + infile = sys.stdin + + record = False + if outFileName is not None: + outfile = open(outFileName, "w") + record = True + + doOnly = False + if onlyField != -1: + doOnly = True + + line = infile.readline() + count = len(line.strip().split()) + if record: + outfile.write(line) + + for line in infile: + fields = line.strip().split() + outline = str(fields[0]) + outError = False + for index in range(1, count): + if field == index: + outline = string.join([outline, "0"], " ") + elif doOnly and index != onlyField: + outline = string.join([outline, str(fields[index])], " ") + else: + try: + ratioString = "%2.2f" % math.log((float(fields[index]) + 1)/(float(fields[field]) + 1), 2) + outline = string.join([outline, ratioString], " ") + except: + try: + outline = string.join([outline, "e%s" % fields[index]], " ") + except: + outError = True + + if outError: + continue + + if record: + outfile.write(outline + "\n") + else: + print outline + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rdsmetadata.py b/rdsmetadata.py new file mode 100755 index 0000000..1ac458b --- /dev/null +++ b/rdsmetadata.py @@ -0,0 +1,106 @@ +# +# rdsmetadata.py +# ENRAGE +# +try: + import psyco + psyco.full() +except: + pass + +import sys +import optparse +from commoncode import readDataset + +print "%prog: version 2.7" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog rdsfile [propertyName1::propertyValue1] ... 
[propertyNameN::propertyValueN] [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--defaultcache", type="int", dest="cacheVal") + parser.add_option("--index", action="store_true", dest="buildIndex") + parser.add_option("--dropindex", action="store_true", dest="dropIndex") + parser.add_option("--nocount", action="store_false", dest="doCount") + parser.add_option("--complexity", action="store_true", dest="doComplexity") + parser.add_option("--reset", action="store_true", dest="resetFlags") + parser.add_option("--initrna", action="store_true", dest="rnaDataType") + parser.add_option("--cache", type="int", dest="cachePages") + parser.set_defaults(cacheVal=0, buildIndex=False, dropIndex=False, doCount=True, + doComplexity=False, resetFlags=False, rnaDataType=False, + cachePages=-1) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 1: + print usage + print "where the optional metadata name::value pairs are added to the existing dataset" + sys.exit(1) + + datafile = args[0] + + propertyList=[] + for arg in args: + if "::" in arg: + (pname, pvalue) = arg.strip().split("::") + print "adding %s : %s" % (pname, pvalue) + propertyList.append((pname, pvalue)) + + rdsmetadata(datafile, propertyList, options.cacheVal, options.buildIndex, + options.dropIndex, options.doCount, options.doComplexity, + options.resetFlags, options.rnaDataType, options.cachePages) + + +def rdsmetadata(datafile, propertyList=[], cacheVal=0, buildIndex=False, + dropIndex=False, doCount=True, doComplexity=False, resetFlags=False, + rnaDataType=False, cachePages=-1): + + doCache = False + if cachePages != -1: + doCache = True + + if rnaDataType: + rds = readDataset(datafile, initialize=True, datasetType="RNA", verbose=True, cache=doCache) + else: + rds = readDataset(datafile, verbose=True, reportCount=doCount, cache=doCache) + + if cachePages > rds.getDefaultCacheSize(): + rds.setDBcache(cachePages) + + if cacheVal > 0: + rds.setDBcache(cacheVal, default=True) + print "set default cache size to %d pages" % cacheVal + + if resetFlags: + print "clearing read flags" + rds.resetFlags() + + if dropIndex: + try: + rds.dropIndex() + except: + print "could not drop index" + + if buildIndex: + print "building index...." + if cacheVal > 0: + rds.buildIndex(cacheVal) + else: + rds.buildIndex() + + if doComplexity: + print "calculating uniq read complexity..." + uniqs = rds.getUniqsCount(distinct=False) + distincts = rds.getUniqsCount(distinct=True) + print "%d distincts / %d uniqs = %.2f" % (distincts, uniqs, float(distincts) / uniqs) + + if len(propertyList) > 0: + rds.insertMetadata(propertyList) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/recordLog.py b/recordLog.py new file mode 100755 index 0000000..731d378 --- /dev/null +++ b/recordLog.py @@ -0,0 +1,31 @@ +# +# recordLog.py +# ENRAGE +# +# Created by Ali Mortazavi on 12/14/08. 
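+# Appends a single messenger/message entry to a log file via commoncode.writeLog.
+# Example invocation (hypothetical file names):
+#     python recordLog.py analysis.log findall.py "finished sample1"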
+# + +import sys +from commoncode import writeLog + + +def main(argv=None): + if not argv: + argv = sys.argv + + if "-verbose" in argv or len(argv) < 4: + print "%s: version 1.0" % sys.argv[0] + + if len(argv) < 4: + print "usage: python %s logFile messenger message [--verbose]" % argv[0] + sys.exit(1) + + logFile = argv[1] + messenger = argv[2] + message = argv[3] + + writeLog(logFile, messenger, message) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/regionBins.py b/regionBins.py new file mode 100755 index 0000000..2d1649b --- /dev/null +++ b/regionBins.py @@ -0,0 +1,89 @@ +# +# regionBins.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +import sys +print '%s: version 2.0' % sys.argv[0] + +if len(sys.argv) < 4: + print 'usage: python %s regionfile rdsfile outfilename [-bins numbins] [-field fieldNum] [-raw] [-padregion bp] [-mergeregion bp] [-cache]' % sys.argv[0] + sys.exit(1) + +from commoncode import * + +regionfilename = sys.argv[1] +hitfile = sys.argv[2] +outfilename = sys.argv[3] + +if '-raw' in sys.argv: + normalize = False + normalizeBins = False +else: + normalize = True + normalizeBins = True + +doCache = False +if '-cache' in sys.argv: + doCache = True + +cField = 1 +if '-field' in sys.argv: + fieldIndex = sys.argv.index('-field') + 1 + cField = int(sys.argv[fieldIndex]) + +padregion = 0 +if '-padregion' in sys.argv: + padField = sys.argv.index('-padregion') + 1 + padregion = int(sys.argv[padField]) + print 'padding %d bp on each side of a region' % padregion + +mergeregion = 0 +if '-mergeregion' in sys.argv: + mergeField = sys.argv.index('-mergeregion') + 1 + mergeregion = int(sys.argv[mergeField]) + print 'merging regions closer than %d bp' % mergeregion + +bins = 10 +if '-bins' in sys.argv: + binfield = sys.argv.index('-bins') + 1 + bins = int(sys.argv[binfield]) + +hitRDS = readDataset(hitfile, verbose = True, cache=doCache) +readlen = hitRDS.getReadSize() +normalizationFactor = 1.0 +if normalize: + totalCount = len(hitRDS) + normalizationFactor = totalCount / 1000000. + +chromList = hitRDS.getChromosomes(fullChrom=False) +chromList.sort() + +regionDict = getMergedRegions(regionfilename, maxDist = mergeregion, keepLabel = True, verbose = True, chromField = cField, pad=padregion) + +hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) + +(regionsBins, regionsLen) = computeRegionBins(regionsByChromDict, hitDict, bins, readlen, normalizationFactor) + +outfile = open(outfilename, 'w') +for regionID in regionsBins: + tagCount = 0. + for binAmount in regionsBins[regionID]: + tagCount += binAmount + outfile.write('%s\t%s\t%.1f\t%d' % (regionID, regionID, tagCount, Len[gid])) + for binAmount in gidBins[gid]: + if normalizeBins: + if tagCount == 0: + tagCount = 1 + outfile.write('\t%.1f' % (100. 
* binAmount / tagCount)) + else: + outfile.write('\t%.1f' % binAmount) + outfile.write('\n') + +outfile.close() \ No newline at end of file diff --git a/regionCounts.py b/regionCounts.py new file mode 100755 index 0000000..0104cc2 --- /dev/null +++ b/regionCounts.py @@ -0,0 +1,221 @@ +# +# regionCounts.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + print 'psyco not running' + +import sys, string, optparse +from commoncode import readDataset, getMergedRegions, findPeak, writeLog + +versionString = "%prog: version 3.9" +print versionString + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog regionfile rdsfile outfilename [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--markRDS", action="store_true", dest="flagRDS") + parser.add_option("--chromField", type="int", dest="cField") + parser.add_option("--fullchrom", action="store_true", dest="useFullchrom") + parser.add_option("--raw", action="store_false", dest="normalize") + parser.add_option("--padregion", type="int", dest="padregion") + parser.add_option("--mergeregion", type="int", dest="mergeregion") + parser.add_option("--nomerge", action="store_false", dest="merging") + parser.add_option("--noUniqs", action="store_false", dest="doUniqs") + parser.add_option("--noMulti", action="store_false", dest="doMulti") + parser.add_option("--splices", action="store_true", dest="doSplices") + parser.add_option("--peak", action="store_true", dest="usePeak") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--log", dest="logfilename") + parser.add_option("--rpkm", action="store_true", dest="doRPKM") + parser.add_option("--length", action="store_true", dest="doLength") + parser.add_option("--force", action="store_true", dest="forceRegion") + parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True, + padregion=0, mergeregion=0, merging=True, doUniqs=True, + doMulti=True, doSplices=False, usePeak=False, cachePages=-1, + logfilename="regionCounts.log", doRPKM=False, doLength=False, + forceRegion=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + regionfilename = args[0] + hitfile = args[1] + outfilename = args[2] + + regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField, + options.useFullchrom, options.normalize, options.padregion, + options.mergeregion, options.merging, options.doUniqs, options.doMulti, + options.doSplices, options.usePeak, options.cachePages, options.logfilename, + options.doRPKM, options.doLength, options.forceRegion) + + +def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1, + useFullchrom=False, normalize=True, padregion=0, mergeregion=0, + merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False, + cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False, + forceRegion=False): + + print "padding %d bp on each side of a region" % padregion + print "merging regions closer than %d bp" % mergeregion + print "will use peak values" + + if cachePages != -1: + doCache = True + else: + doCache = False + + normalize = True + doRPKM = False + if doRPKM == True: + normalize = True + + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + + regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, minHits=-1, keepLabel=True, + fullChrom=useFullchrom, verbose=True, chromField=cField, + doMerge=merging, pad=padregion) + + labelList = [] + 
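+    # per-label accumulators: regionCount collects the (optionally peak-based) tag count
+    # for each region label, and labeltoRegionDict maps labels back to (chrom, start, stop)
+    # for the output step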
labeltoRegionDict = {} + regionCount = {} + + hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + readlen = hitRDS.getReadSize() + if cachePages > hitRDS.getDefaultCacheSize(): + hitRDS.setDBcache(cachePages) + + totalCount = len(hitRDS) + if normalize: + normalizationFactor = totalCount / 1000000. + + chromList = hitRDS.getChromosomes(fullChrom=useFullchrom) + if len(chromList) == 0 and doSplices: + chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom) + + chromList.sort() + + if flagRDS: + hitRDS.setSynchronousPragma("OFF") + + for rchrom in regionDict: + if forceRegion and rchrom not in chromList: + print rchrom + for (label, start, stop, length) in regionDict[rchrom]: + regionCount[label] = 0 + labelList.append(label) + labeltoRegionDict[label] = (rchrom, start, stop) + + for rchrom in chromList: + regionList = [] + if rchrom not in regionDict: + continue + + print rchrom + if useFullchrom: + fullchrom = rchrom + else: + fullchrom = "chr%s" % rchrom + + if usePeak: + readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True) + rindex = 0 + dictLen = len(readDict[fullchrom]) + + for (label, start, stop, length) in regionDict[rchrom]: + regionCount[label] = 0 + labelList.append(label) + labeltoRegionDict[label] = (rchrom, start, stop) + + if useFullchrom: + fullchrom = rchrom + else: + fullchrom = "chr%s" % rchrom + + for (label, rstart, rstop, length) in regionDict[rchrom]: + regionList.append((label, fullchrom, rstart, rstop)) + if usePeak: + readList = [] + for localIndex in xrange(rindex, dictLen): + read = readDict[fullchrom][localIndex] + if read[0] < rstart: + rindex += 1 + elif rstart <= read[0] <= rstop: + readList.append(read) + else: + break + + if len(readList) < 1: + continue + + readList.sort() + (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True) + try: + topValue = smoothArray[topPos[0]] + except: + print "problem with %s %s" % (str(topPos), str(smoothArray)) + continue + + regionCount[label] += topValue + else: + regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices) + + if flagRDS: + hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices) + + if flagRDS: + hitRDS.setSynchronousPragma("ON") + + if normalize: + for label in regionCount: + regionCount[label] = float(regionCount[label]) / normalizationFactor + + outfile = open(outfilename, "w") + + if forceRegion: + labelList.sort() + + for label in labelList: + (chrom, start, stop) = labeltoRegionDict[label] + if useFullchrom: + fullchrom = chrom + else: + fullchrom = "chr%s" % chrom + + if normalize: + if doRPKM: + length = abs(stop - start) / 1000. + else: + length = 1. 
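+                # clamp tiny region lengths (possible with --rpkm) so the per-length
+                # normalization below never divides by a value near zero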
+ + if length < 0.001: + length = 0.001 + + outfile.write("%s\t%s\t%d\t%d\t%.2f" % (label, fullchrom, start, stop, regionCount[label]/length)) + if doLength: + outfile.write("\t%.1f" % length) + else: + outfile.write('%s\t%s\t%d\t%d\t%d' % (label, fullchrom, start, stop, regionCount[label])) + + outfile.write("\n") + + outfile.close() + if doCache and flagRDS: + hitRDS.saveCacheDB(hitfile) + + writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.)) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/regionintersects.py b/regionintersects.py new file mode 100755 index 0000000..340d2f8 --- /dev/null +++ b/regionintersects.py @@ -0,0 +1,203 @@ +# +# regionintersects.py +# ENRAGE +# +try: + import psyco + psyco.full() +except: + pass + +import sys, optparse +from commoncode import readDataset, getMergedRegions, findPeak + +print "%prog: version 3.0" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--reject1", dest="rejectOneName") + parser.add_option("--reject2", dest="rejectTwoName") + parser.add_option("--union", action="store_true", dest="trackReject") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--raw", action="store_false", dest="normalize") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False, + doCache=False, normalize=True, doVerbose=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 5: + print usage + sys.exit(1) + + readOneName = args[0] + regionOneName = args[1] + readTwoName = args[2] + regionTwoName = args[3] + outfilename = args[4] + + regionintersects(readOneName, regionOneName, readTwoName, regionTwoName, + outfilename, options.rejectOneName, options.rejectTwoName, + options.trackReject, options.doCache, options.normalize, + options.doVerbose) + + +def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName, + outfilename, rejectOneName=None, rejectTwoName=None, + trackReject=False, doCache=False, normalize=True, doVerbose=False): + + mergedist=0 + + outfile = open(outfilename, "w") + + doReject = False + if rejectOneName is not None: + trackReject = True + doReject = True + rejectOne = open(rejectOneName, "w") + + if rejectTwoName is not None: + trackReject = True + doReject = True + rejectTwo = open(rejectTwoName, "w") + + oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose) + twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose) + + oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache) + twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache) + + if normalize: + normalize1 = len(oneRDS) / 1000000. + normalize2 = len(twoRDS) / 1000000. + else: + normalize1 = 1. + normalize2 = 1. 
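+    # with --raw both factors stay at 1.0; otherwise hit counts and peak heights below are
+    # reported as reads per million, e.g. a 50-read peak in a 10M-read dataset becomes 5.0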
+ + commonRegions = 0 + oneRejectIndex = 0 + twoRejectIndex = 0 + + onePeaksDict = {} + oneFoundDict = {} + + numRegionsOne = 0 + numRegionsTwo = 0 + for rchrom in oneDict: + numRegionsOne += len(oneDict[rchrom]) + + for rchrom in twoDict: + numRegionsTwo += len(twoDict[rchrom]) + + outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName)) + + for rchrom in oneDict: + if rchrom not in twoDict: + continue + + print rchrom + rindex = 0 + rindex2 = 0 + fullchrom = "chr" + rchrom + oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True) + dictLen1 = len(oneReads[fullchrom]) + twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True) + dictLen2 = len(twoReads[fullchrom]) + chrom = rchrom + onePeaksDict[chrom] = [] + oneFoundDict[chrom] = [] + for (start, stop, length) in oneDict[chrom]: + readList = [] + for localIndex in xrange(rindex, dictLen1): + read = oneReads[fullchrom][localIndex] + if read[0] < start: + rindex += 1 + elif start <= read[0] <= stop: + readList.append(read) + else: + break + + if len(readList) < 1: + continue + + readList.sort() + + (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True) + onePeakScore = smoothArray[topPos[0]] + onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1)) + + for (start, stop, length) in twoDict[chrom]: + readList2 = [] + for localIndex in xrange(rindex2, dictLen2): + read = twoReads[fullchrom][localIndex] + if read[0] < start: + rindex2 += 1 + elif start <= read[0] <= stop: + readList2.append(read) + else: + break + + if len(readList2) < 1: + continue + + readList2.sort() + (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True) + numHits /= normalize2 + twoIsCommon = False + twoPeak = topPos[0] + start + twoRadius = length/2 + twoPeakScore = smoothArray[topPos[0]] / normalize2 + for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]: + if abs(twoPeak - onePeak) < (twoRadius + oneRadius): + if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict: + oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits)) + + twoIsCommon = True + commonRegions += 1 + outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore) + if doVerbose: + print outline + + outfile.write(outline + "\n") + + if trackReject and not twoIsCommon: + twoRejectIndex += 1 + outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore) + if doReject: + rejectTwo.write(outline + "\n") + else: + outfile.write(outline + "\n") + + if doVerbose: + print outline + + if trackReject: + for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]: + if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]: + oneRejectIndex += 1 + outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore) + if doReject: + rejectOne.write(outline + "\n") + else: + outfile.write(outline + "\n") + + if doVerbose: + print outline + + if trackReject: + print "common: %d one-only: %d two-only: %d" % (commonRegions, oneRejectIndex, twoRejectIndex) + outfile.write("#common: %d\tone-only: %d\ttwo-only: %d\n" % (commonRegions, oneRejectIndex, 
twoRejectIndex)) + else: + print "common: %d" % commonRegions + outfile.write("#common: %d\n" % commonRegions) + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/regiontobed.py b/regiontobed.py new file mode 100755 index 0000000..e6ce22a --- /dev/null +++ b/regiontobed.py @@ -0,0 +1,113 @@ +""" + usage: python regiontobed label regionfile outbedfile [--color r,g,b] [--score field] [--narrowPeak] [--broadPeak] [--itemRgb] [--nolabel] + where color is in comma-delimited RGB without space + and field is a column with a score (first column is 0, second is 1,...) + t-narrowPeak assumes that findall.py was run with -listPeak + t-broadPeak assumes that findall.py was *NOT* run with -listPeak +""" + +try: + import psyco + psyco.full() +except: + pass + +import sys, math, optparse + +print "%prog: version 3.1" + + +def usage(): + print __doc__ + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = __doc__ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--color", dest="color") + parser.add_option("--score", type="int", dest="scoreField") + parser.add_option("--narrowPeak", action="store_true", dest="doNarrow") + parser.add_option("--broadPeak", action="store_true", dest="doBroad") + parser.add_option("--itemRgb", action="store_true", dest="itemRGB") + parser.add_option("--nolabel", action="store_true", dest="noLabel") + parser.set_defaults(color="0,0,0", scoreField=None, doNarrow=False, + doBroad=False, itemRGB=False, noLabel=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + usage() + sys.exit(2) + + factorlabel = args[0] + regionfile = args[1] + outfile = args[2] + + regiontobed(factorlabel, regionfile, outfile, options.color, + options.scoreField, options.doNarrow, options.doBroad, + options.itemRGB, options.noLabel) + + +def regiontobed(factorlabel, regionFileName, outFileName, color="0,0,0", + scoreField=None, doNarrow=False, doBroad=False, itemRGB=False, + noLabel=False): + + regionfile = open(regionFileName) + outfile = open(outFileName, "w") + + if itemRGB: + print "assigning each item its color" + + if noLabel: + if itemRGB: + outfile.write('track name=%s visibility=4 itemRgb="on"\n' % factorlabel) + else: + outfile.write("track name=%s visibility=4 color=%s\n" % (factorlabel, color)) + + for line in regionfile: + if line[0] == "#": + continue + + fields = line.strip().split() + if doNarrow: + signalVal = float(fields[4]) + pval = float(fields[-1]) + if pval == 0.: + pValue = 350 + else: + pValue = -1. * math.log(pval, 10) + + peakPos = int(fields[9]) - int(fields[2]) + outfile.write("%s\t%s\t%s\t%s\t%d\t.\t%.4f\t%.4f\t-1\t%d" % (fields[1], fields[2], fields[3], fields[0], 0, signalVal, pValue, peakPos)) + elif doBroad: + signalVal = float(fields[4]) + pval = float(fields[-1]) + if pval == 0.: + pValue = 350 + else: + pValue = -1. 
* math.log(pval, 10) + + outfile.write("%s\t%s\t%s\t%s\t%d\t.\t%.4f\t%.4f\t-1" % (fields[1], fields[2], fields[3], fields[0], 0, signalVal, pValue)) + elif scoreField is not None: + score = int(float(fields[scoreField])) + if score > 1000: + score = 1000 + + outfile.write("%s\t%s\t%s\t%s\t%s" % (fields[1], fields[2], fields[3], fields[0], score)) + if itemRGB: + outfile.write("\t+\t-\t-\t%s" % color) + else: + outfile.write("%s\t%s\t%s\t%s" % (fields[1], fields[2], fields[3], fields[0])) + if itemRGB: + outfile.write("\t1000\t+\t-\t-\t%s" % color) + + outfile.write("\n") + + outfile.close() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnaAToIFilter.py b/rnaAToIFilter.py new file mode 100644 index 0000000..aefa78b --- /dev/null +++ b/rnaAToIFilter.py @@ -0,0 +1,39 @@ +import sys + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s infile outfile" % sys.argv[0] + + if len(argv) < 3: + print usage + sys.exit(1) + + infile = open(argv[1]) + outfile = open(argv[2], "w") + + lines = infile.readlines() + outputLines = rnaAToIFilter(lines) + + for line in outputLines: + outfile.write(line) + + outfile.close() + + +def rnaAToIFilter(snpPropertiesList): + outputLines = [] + for line in snpPropertiesList: + fields = line.split() + if fields[13] == "F" and fields[7] == "A-G": + outputLines.append(line) + elif fields[13] == "R" and fields[7] == "T-C": + outputLines.append(line) + + return outputLines + + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/rnaEditing.py b/rnaEditing.py new file mode 100644 index 0000000..30de5a3 --- /dev/null +++ b/rnaEditing.py @@ -0,0 +1,87 @@ +""" +Based on shell script provided by Ali. +""" + +import sys +import optparse +from Erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter +from Erange.commoncode import countDuplicatesInList + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog dbfile snpsfile genome rpkmfile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--goprefix", dest="prefix") + parser.add_option("--novelsnp", dest="novelsnpoutfilename") + parser.add_option("--bedfile", dest="bedoutfilename") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--snpDB", action="append", dest="snpDBList", + help="additional snp db files to check will be searched in order given") + parser.set_defaults(prefix=None, novelsnpoutfilename=None, bedoutfilename=None, cachePages=None, snpDBList=[]) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + dbfile = args[0] + hitfile = args[1] + genome = args[2] + rpkmfilename = args[3] + + if options.cachePages is not None: + doCache = True + else: + doCache = False + + # get the SNPs + snpList = getSNPs.getSNPs(hitfile, 3, 0.25, doCache, options.cachePages, forceChr=True) + + # check for existing SNPs + dbList = [dbfile] + for dbFileName in options.snpDBList: + dbList.append(dbFileName) + + snpPropertiesList = chksnp.chkSNP(dbList, snpList, options.cachePages) + + # get the neighboring genes + geneInfoList = getSNPGeneInfo.getSNPGeneInfo(genome, snpPropertiesList, rpkmfilename, doCache, flankBP=10000) + + # filter out for the A-to-I events in the same direction as the genes + filteredSNPs = rnaAToIFilter.rnaAToIFilter(geneInfoList) + + # count the number of different bases that have been called for each gene + # pick a set of genes with a 
high number of sites (here 5) + geneList = getGenesWithMultipleSNPs(filteredSNPs, minCount=5) + + if options.prefix is not None: + analyzego.analyzeGO(genome, geneList, options.prefix, translateGene=True, fieldID=1) + + if options.novelsnpoutfilename is not None: + getNovelSNPs.writeNovelSNPFile(genome, filteredSNPs, options.novelsnpoutfilename) + + if options.bedoutfilename is not None: + makeSNPtrack.writeSNPsBedfile(filteredSNPs, "rnaEdit_sample", options.bedoutfilename) + + +def getGenesWithMultipleSNPs(snpList, minCount=1): + geneList = [] + for snpEntry in snpList: + geneList.append(snpEntry[11]) + + duplicateCountList = countDuplicatesInList(geneList) + + geneList = [] + for (gene, count) in duplicateCountList: + if count >= minCount: + geneList.append(gene) + + return geneList + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnafarPairs.py b/rnafarPairs.py new file mode 100755 index 0000000..d1baebd --- /dev/null +++ b/rnafarPairs.py @@ -0,0 +1,180 @@ +# +# RNAFARpairs.py +# ENRAGE +# +# Created by Ali Mortazavi on 11/2/08. +# +""" usage: python rnafarpairs.py genome goodfile rdsfile outfile [options] + looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM +""" +try: + import psyco + psyco.full() +except: + pass + +import sys, time, optparse +from commoncode import readDataset +from cistematic.core.geneinfo import geneinfoDB +from cistematic.genomes import Genome + +def main(argv=None): + if not argv: + argv = sys.argv + + print "%prog: version 3.6" + usage = "usage: python %prog genome goodfile rdsfile outfile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--verbose", action="store_true", dest="doVerbose", + help="verbose output") + parser.add_option("--cache", action="store_true", dest="doCache", + help="use cache") + parser.add_option("--maxDist", type="int", dest="maxDist", + help="maximum distance") + parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(1) + + genome = args[0] + goodfilename = args[1] + rdsfile = args[2] + outfilename = args[3] + + rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist) + + +def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000): + goodDict = {} + goodfile = open(goodfilename) + for line in goodfile: + fields = line.split() + goodDict[fields[0]] = line + + RDS = readDataset(rdsfile, verbose = True, cache=doCache) + rdsChromList = RDS.getChromosomes() + + if doVerbose: + print time.ctime() + + distinct = 0 + total = 0 + outfile = open(outfilename,"w") + + idb = geneinfoDB() + if genome == "dmelanogaster": + geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus") + else: + geneinfoDict = idb.getallGeneInfo(genome) + + hg = Genome(genome) + geneannotDict = hg.allAnnotInfo() + + assigned = {} + farConnected = {} + for achrom in rdsChromList: + if achrom == "chrM": + continue + + print achrom + uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True) + if doVerbose: + print len(uniqDict), time.ctime() + + for readID in uniqDict: + readList = uniqDict[readID] + if len(readList) == 2: + total += 1 + (start1, flag1, pair1) = readList[0] + (start2, flag2, pair2) = readList[1] + + if flag1 != flag2: + dist = abs(start1 - start2) + if flag1 != "NM" and flag2 
!= "NM" and dist < maxDist: + geneID = "" + saw1 = False + saw2 = False + if flag1 in goodDict: + geneID = flag2 + farFlag = flag1 + saw1 = True + + if flag2 in goodDict: + geneID = flag1 + farFlag = flag2 + saw2 = True + + if saw1 or saw2: + total += 1 + + if saw1 and saw2: + if flag1 < flag2: + geneID = flag1 + farFlag = flag2 + else: + geneID = flag2 + farFlag = flag1 + + if geneID in farConnected: + farConnected[geneID].append(farFlag) + else: + farConnected[geneID] = [farFlag] + elif geneID != "": + try: + if genome == "dmelanogaster": + symbol = geneinfoDict["Dmel_" + geneID][0][0] + else: + symbol = geneinfoDict[geneID][0][0] + except: + try: + symbol = geneannotDict[(genome, geneID)][0] + except: + symbol = "LOC" + geneID + + symbol = symbol.strip() + symbol = symbol.replace(" ","|") + symbol = symbol.replace("\t","|") + if farFlag not in assigned: + assigned[farFlag] = (symbol, geneID) + print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip()) + outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag])) + distinct += 1 + + farIndex = 0 + for farFlag in farConnected: + geneID = "" + symbol = "" + idList = [farFlag] + farConnected[farFlag] + for oneID in idList: + if oneID in assigned: + (symbol, geneID) = assigned[oneID] + + if geneID == "": + farIndex += 1 + symbol = "FAR%d" % farIndex + geneID = -1 * farIndex + + for oneID in idList: + if oneID not in assigned: + print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip()) + outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID])) + distinct += 1 + assigned[oneID] = (symbol, geneID) + + for farFlag in goodDict: + if farFlag not in assigned: + farIndex += 1 + line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag]) + print line.strip() + outfile.write(line) + + outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total)) + outfile.close() + print "distinct: %d\ttotal: %d" % (distinct, total) + print time.ctime() + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnapath/.svn/entries b/rnapath/.svn/entries new file mode 100644 index 0000000..d37a6ac --- /dev/null +++ b/rnapath/.svn/entries @@ -0,0 +1,86 @@ +10 + +dir +23 +file:///Users/sau/svn/repos/erange/source/Erange/rnapath +file:///Users/sau/svn/repos + + + +2010-10-01T18:32:26.347691Z +22 +sau + + + + + + + + + + + + + + +d8abe7b5-3c2c-4fba-ae09-e6a8aa828af9 + +RNAPATH.py +file + + + + + +dbb616164849ddb57ad0880cf59ff36a +2010-10-01T18:32:26.347691Z +22 +sau + +__init__.py +file + + + + +2010-09-10T18:56:21.000000Z +d41d8cd98f00b204e9800998ecf8427e +2010-09-10T18:57:45.549780Z +20 +sau + + + + + + + + + + + + + + + + + + + + + +0 + +processvelvet.py +file + + + + + +c232f2e5338d3f018f259576a65ff49e +2010-10-01T18:32:26.347691Z +22 +sau + diff --git a/rnapath/.svn/text-base/RNAPATH.py.svn-base b/rnapath/.svn/text-base/RNAPATH.py.svn-base new file mode 100644 index 0000000..86f61cd --- /dev/null +++ b/rnapath/.svn/text-base/RNAPATH.py.svn-base @@ -0,0 +1,468 @@ +import sys +import optparse +import string +from numpy import zeros, int16 + +versionString = "%s: version 0.95" % sys.argv[0] +print versionString + + +def compNT(nt): + """ returns the complementary basepair to base nt + """ + compDict = { "A": "T", + "T": "A", + "G": "C", + "C": "G", + "S": "S", + "W": "W", + "R": "Y", + "Y": "R", + "M": "K", + "K": "M", + "H": "D", + "D": "H", + "B": "V", + "V": "B", + "N": "N", + "a": "t", + "t": "a", + "g": "c", + "c": "g", + "n": "n", + "z": "z" + } + + return compDict.get(nt, "N") + + +def complement(sequence, 
length=-1): + """ returns the complement of the sequence. + """ + newSeq = "" + + seqLength = len(sequence) + + if length == seqLength or length < 0: + seqList = list(sequence) + seqList.reverse() + return "".join(map(compNT, seqList)) + + #TODO: this seems to want to deal with case where length is more than + # sequence length except that a negative index on a sequence is fine + # index will only be overrun if length is negative but that case is + # handled above + for index in range(seqLength - 1,seqLength - length - 1, -1): + try: + newSeq += compNT(sequence[index]) + except: + newSeq += "N" + + return newSeq + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--prefix", dest="pathPrefix") + parser.add_option("--overlap", type="int", dest="overlap") + parser.set_defaults(pathPrefix="RNAPATH", overlap=30) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(0) + + incontigfilename = args[0] + distalPairsfile = args[1] + outpathfilename = args[2] + outcontigfilename = args[3] + + rnaPath(incontigfilename, distalPairsfile, outpathfilename, + outcontigfilename, options.pathPrefix, options.overlap) + + +def rnaPath(incontigfilename, distalPairsfile, outpathfilename, + outcontigfilename, pathPrefix="RNAPATH", overlap=30): + + outpathfile = open(outpathfilename, "w") + + outheader = "#settings: %s" % " ".join(sys.argv) + print outheader + print >> outpathfile, outheader + + contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename) + halfSize = calculateN50(origSize) + print "building the adjacency graph" + pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList) + + print "found %d paths" % len(pathList) + + newSizeList = [] + pathID = 0 + outcontigfile = open(outcontigfilename, "w") + for path in pathList: + pathID += 1 + outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path))) + vertexNameList = [] + for vertex in path: + vertexNameList.append(nameList[vertex]) + pathDescription = string.join(vertexNameList, ",") + + print >> outpathfile, pathDescription + currentVertex = path[0] + currentSense = "+" + assemblyList = currentVertex + sequence = contigDict[currentVertex] + for nextVertex in path[1:]: + if (currentVertex, nextVertex) in edgeSenseDict: + senseList = edgeSenseDict[currentVertex, nextVertex] + FR = senseList.count(("+", "-")) + RF = senseList.count(("-", "+")) + else: + senseList = edgeSenseDict[nextVertex, currentVertex] + # flip + FR = senseList.count(("-", "+")) + RF = senseList.count(("+", "-")) + + FF = senseList.count(("+", "+")) + RR = senseList.count(("-", "-")) + if currentSense == "-": + # we had flipped the upstream piece! 
Must flip again + temp1 = FR + temp2 = FF + FR = RR + FF = RF + RR = temp1 + RF = temp2 + + if FR >= FF and FR >= RR and FR >= RF: + # we have FR - leave alone + sense1 = "+" + sense2 = "-" + assemblyList = ((assemblyList, "+"), (nextVertex, "+")) + seqleft = sequence[-20:] + seqright = contigDict[nextVertex][:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset) + print outstring + print >> outpathfile, outstring + sequence += contigDict[nextVertex][offset:] + else: + sequence += "NN" + contigDict[nextVertex] + + currentSense = "+" + elif FF >= RR and FF >= RF: + # we have FF - flip seqright + sense1 = "+" + sense2 = "+" + assemblyList = ((assemblyList, "+"), (nextVertex, "-")) + seqleft = sequence[-20:] + seqright = complement(contigDict[nextVertex])[:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence += complement(contigDict[nextVertex])[offset:] + else: + sequence += "NN" + complement(contigDict[nextVertex]) + + currentSense = "-" + elif RR >= RF: + # we have RR - flip seqleft + sense1 = "-" + sense2 = "-" + assemblyList = ((assemblyList, "-"), (nextVertex, "+")) + seqleft = complement(sequence)[:20] + seqright = contigDict[nextVertex][:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence = complement(sequence) + contigDict[nextVertex][offset:] + else: + sequence = complement(sequence) + "NN" + contigDict[nextVertex] + + currentSense = "+" + else: + # we have RF - flip both + sense1 = "-" + sense2 = "+" + assemblyList = ((assemblyList, "-"), (nextVertex, "-")) + seqleft = complement(sequence)[-20:] + seqright = complement(contigDict[nextVertex])[:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:] + else: + sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex]) + + currentSense = "-" + + outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList)) + print outstring + print >> outpathfile, outstring + currentVertex = nextVertex + + outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence)) + newSizeList.append(len(sequence)) + + for vertex in contigDict: + if vertex in visitedDict: + continue + + newSizeList.append(len(contigDict[vertex])) + outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex])) + + calculateN50(newSizeList, referenceMean=halfSize) + + +def calculateN50(sizeList, referenceMean=None): + if referenceMean is None: + totalSize = sum(sizeList) + referenceMean = totalSize / 2 + + sizeList.sort() + sizeList.reverse() + currentTotalLength = 0 + for size in sizeList: + if currentTotalLength + size > referenceMean: + print "#contigs", len(sizeList) + print "N50", size + break + + currentTotalLength += size + + print sizeList[:50] + + return 
referenceMean + + +def getContigsFromFile(contigFileName): + nameList = [] + origSize = [] + contigNum = 0 + currentChrom = "" + seq = "" + contigDict = {} + + try: + incontigfile = open(contigFileName) + except IOError: + print "Error opening contig file: %s" % contigFileName + return contigNum, nameList, contigDict, origSize + + for line in incontigfile: + if ">" in line: + if currentChrom !="": + nameList.append(currentChrom) + contigDict[contigNum] = seq + origSize.append(len(seq)) + contigNum += 1 + + currentChrom = line.strip().split()[0][1:] + seq = "" + else: + seq += line.strip() + + incontigfile.close() + + return contigNum, nameList, contigDict, origSize + + +def getPath(contigNum, distalPairsfile, nameList): + edgeMatrix = EdgeMatrix(contigNum) + + print len(edgeMatrix.edgeArray) + try: + print len(edgeMatrix.edgeArray[50]) + except IndexError: + pass + + print "processing distal pairs" + verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList) + + willVisitList = verticesWithEdges.keys() + willVisitList.sort() + print "visiting %d vertices" % len(willVisitList) + + print "cleaning up graph of edges with weight 1" + verticesToDelete = [] + for rindex in willVisitList: + if rindex not in notSoloDict: + cindex = vertexEdges[rindex][0] + edgeMatrix.edgeArray[rindex][cindex] = 0 + edgeMatrix.edgeArray[cindex][rindex] = 0 + verticesToDelete.append(rindex) + + for vertex in verticesToDelete: + willVisitList.remove(vertex) + + print "%d 1-edges zeroed out" % len(verticesToDelete) + + zeroedEdge = 0 + print "visiting %d vertices" % len(willVisitList) + + leafList = [] + print "picking top 2 edges per vertex - zero out others" + for rindex in willVisitList: + vertices = vertexEdges[rindex] + rEdges = [] + for avertex in vertices: + if avertex in willVisitList: + rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex)) + + if len(rEdges) > 2: + rEdges.sort() + rEdges.reverse() + zeroedEdge += len(rEdges[2:]) + for (weight, cindex) in rEdges[2:]: + edgeMatrix.edgeArray[rindex][cindex] = 0 + edgeMatrix.edgeArray[cindex][rindex] = 0 + elif len(rEdges) == 1: + if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1: + leafList.append(rindex) + + print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge + pathList, visitedDict = traverseGraph(leafList, edgeMatrix) + + return pathList, edgeSenseDict, visitedDict + + +def traverseGraph(leafList, edgeMatrix): + pathList = [] + visitedDict = {} + leafList.sort() + print "traveling through the graph" + for rindex in leafList: + if visitedDict.has_key(rindex): + pass + else: + path = edgeMatrix.visitLink(rindex) + if len(path) > 1: + for vertex in path: + visitedDict[vertex] = "" + + print path + pathList.append(path) + + return pathList, visitedDict + + +def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList): + contigToRowLookup = {} + verticesWithEdges = {} + vertexEdges = {} + notSoloDict = {} + edgeSenseDict = {} + + distalPairs = open(distalPairsfilename) + for line in distalPairs: + if line[0] == "#": + continue + + fields = line.strip().split() + contA = "chr%s" % fields[1] + try: + contig1 = contigToRowLookup[contA] + except KeyError: + try: + contig1 = nameList.index(contA) + contigToRowLookup[contA] = contig1 + except ValueError: + print "problem with end1: ", line + continue + + sense1 = fields[3] + + contB = "chr%s" % fields[4] + try: + contig2 = contigToRowLookup[contB] + except KeyError: + try: + contig2 = nameList.index(contB) + 
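+                # cache the row index so later pairs on this contig avoid repeating the
+                # linear nameList.index search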
contigToRowLookup[contB] = contig2 + except ValueError: + print "problem with end2: ", line + continue + + sense2 = fields[6] + + edgeMatrix.edgeArray[contig1][contig2] += 1 + edgeMatrix.edgeArray[contig2][contig1] += 1 + verticesWithEdges[contig1] = "" + verticesWithEdges[contig2] = "" + if (contig1, contig2) in edgeSenseDict: + edgeSenseDict[contig1, contig2].append((sense1, sense2)) + elif (contig2, contig1) in edgeSenseDict: + edgeSenseDict[contig2, contig1].append((sense2, sense1)) + else: + edgeSenseDict[contig1, contig2] = [(sense1, sense2)] + + if contig1 in vertexEdges: + if contig2 not in vertexEdges[contig1]: + vertexEdges[contig1].append(contig2) + else: + vertexEdges[contig1] = [contig2] + + if contig2 in vertexEdges: + if contig1 not in vertexEdges[contig2]: + vertexEdges[contig2].append(contig1) + else: + vertexEdges[contig2] = [contig1] + + if edgeMatrix.edgeArray[contig1][contig2] > 1: + notSoloDict[contig1] = "" + notSoloDict[contig2] = "" + + distalPairs.close() + + return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict + + +class EdgeMatrix: + """ Describes a sparse matrix to hold edge data. + """ + + def __init__(self, dimension): + self.dimension = dimension + self.edgeArray = zeros((self.dimension, self.dimension), int16) + + + def visitLink(self, fromVertex, ignoreList=[]): + returnPath = [fromVertex] + toVertex = [] + for toindex in xrange(self.dimension): + if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList: + toVertex.append(toindex) + + for vertex in toVertex: + if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]: + self.edgeArray[fromVertex][vertex] = 0 + self.edgeArray[vertex][fromVertex] = 0 + return returnPath + [vertex] + else: + self.edgeArray[fromVertex][vertex] = 0 + try: + return returnPath + self.visitLink(vertex, returnPath) + except IOError: + return returnPath + [vertex] + return [] + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnapath/.svn/text-base/__init__.py.svn-base b/rnapath/.svn/text-base/__init__.py.svn-base new file mode 100644 index 0000000..e69de29 diff --git a/rnapath/.svn/text-base/processvelvet.py.svn-base b/rnapath/.svn/text-base/processvelvet.py.svn-base new file mode 100644 index 0000000..0af43d1 --- /dev/null +++ b/rnapath/.svn/text-base/processvelvet.py.svn-base @@ -0,0 +1,110 @@ +import sys +import optparse + +print "%prog: version 1.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--prefix", dest="contigPrefix") + parser.add_option("--filter", dest="filterFileName") + parser.add_option("--min", type="int", dest="minSize") + parser.add_option("--keepcov", action="store_true", dest="keepCoverage") + parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(2) + + infile = args[0] + outfile = args[1] + + processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage) + + +def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False): + infile = open(inFileName) + outfile = open(outFileName, "w") + filterList = getFilterList(filterFileName) + + node = {"contigPrefix": contigPrefix, + "completeID": "", + "currentSeq": "" + } + 
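+    # running totals (bp accepted, Ns, contigs kept, bp filtered) for the summary
+    # printed after the last node is written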
+ counts = {"acceptedSize": 0, + "nSize": 0, + "contigsAccepted": 0, + "filteredSize": 0 + } + + for line in infile: + if ">NODE" in line: + writeNode(outfile, node, filterList, counts, minSize, keepCoverage) + node["completeID"] = line.strip()[1:] + node["currentSeq"] = "" + else: + node["currentSeq"] += line + + writeNode(outfile, node, filterList, counts, minSize, keepCoverage) + + infile.close() + outfile.close() + + print "%d contigs accepted" % counts["contigsAccepted"] + print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"]) + print "%d bp accepted" % counts["acceptedSize"] + print "%d bp accepted N" % counts["nSize"] + print "%d bp filtered\n" % counts["filteredSize"] + + +def getFilterList(filterFileName=""): + filterList = [] + + if filterFileName: + try: + filterFile = open(filterFileName) + except IOError: + return filterList + + for line in filterFile: + if "NODE" in line: + fields = line.strip().split() + try: + exclude = fields[9] + except IndexError: + continue + + if exclude not in filterList: + filterList.append(exclude) + + filterFile.close() + + return filterList + + +def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False): + completeID = node["completeID"] + currentSeq = node["currentSeq"] + sequenceLength = len(currentSeq) - currentSeq.count("\n") + if len(completeID) > 5 and completeID not in filterList: + fields = completeID.split("_") + newID = fields[1] + if keepCoverage: + newID = fields[1] + "_" + fields[-1].strip() + + if sequenceLength >= minSize: + outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq)) + counts["acceptedSize"] += sequenceLength + counts["nSize"] += currentSeq.count("N") + counts["contigsAccepted"] += 1 + else: + counts["filteredSize"] += sequenceLength + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnapath/.svn/tmp/RNAPATH.py.tmp b/rnapath/.svn/tmp/RNAPATH.py.tmp new file mode 100644 index 0000000..86f61cd --- /dev/null +++ b/rnapath/.svn/tmp/RNAPATH.py.tmp @@ -0,0 +1,468 @@ +import sys +import optparse +import string +from numpy import zeros, int16 + +versionString = "%s: version 0.95" % sys.argv[0] +print versionString + + +def compNT(nt): + """ returns the complementary basepair to base nt + """ + compDict = { "A": "T", + "T": "A", + "G": "C", + "C": "G", + "S": "S", + "W": "W", + "R": "Y", + "Y": "R", + "M": "K", + "K": "M", + "H": "D", + "D": "H", + "B": "V", + "V": "B", + "N": "N", + "a": "t", + "t": "a", + "g": "c", + "c": "g", + "n": "n", + "z": "z" + } + + return compDict.get(nt, "N") + + +def complement(sequence, length=-1): + """ returns the complement of the sequence. 
+ """ + newSeq = "" + + seqLength = len(sequence) + + if length == seqLength or length < 0: + seqList = list(sequence) + seqList.reverse() + return "".join(map(compNT, seqList)) + + #TODO: this seems to want to deal with case where length is more than + # sequence length except that a negative index on a sequence is fine + # index will only be overrun if length is negative but that case is + # handled above + for index in range(seqLength - 1,seqLength - length - 1, -1): + try: + newSeq += compNT(sequence[index]) + except: + newSeq += "N" + + return newSeq + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--prefix", dest="pathPrefix") + parser.add_option("--overlap", type="int", dest="overlap") + parser.set_defaults(pathPrefix="RNAPATH", overlap=30) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(0) + + incontigfilename = args[0] + distalPairsfile = args[1] + outpathfilename = args[2] + outcontigfilename = args[3] + + rnaPath(incontigfilename, distalPairsfile, outpathfilename, + outcontigfilename, options.pathPrefix, options.overlap) + + +def rnaPath(incontigfilename, distalPairsfile, outpathfilename, + outcontigfilename, pathPrefix="RNAPATH", overlap=30): + + outpathfile = open(outpathfilename, "w") + + outheader = "#settings: %s" % " ".join(sys.argv) + print outheader + print >> outpathfile, outheader + + contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename) + halfSize = calculateN50(origSize) + print "building the adjacency graph" + pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList) + + print "found %d paths" % len(pathList) + + newSizeList = [] + pathID = 0 + outcontigfile = open(outcontigfilename, "w") + for path in pathList: + pathID += 1 + outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path))) + vertexNameList = [] + for vertex in path: + vertexNameList.append(nameList[vertex]) + pathDescription = string.join(vertexNameList, ",") + + print >> outpathfile, pathDescription + currentVertex = path[0] + currentSense = "+" + assemblyList = currentVertex + sequence = contigDict[currentVertex] + for nextVertex in path[1:]: + if (currentVertex, nextVertex) in edgeSenseDict: + senseList = edgeSenseDict[currentVertex, nextVertex] + FR = senseList.count(("+", "-")) + RF = senseList.count(("-", "+")) + else: + senseList = edgeSenseDict[nextVertex, currentVertex] + # flip + FR = senseList.count(("-", "+")) + RF = senseList.count(("+", "-")) + + FF = senseList.count(("+", "+")) + RR = senseList.count(("-", "-")) + if currentSense == "-": + # we had flipped the upstream piece! 
Must flip again + temp1 = FR + temp2 = FF + FR = RR + FF = RF + RR = temp1 + RF = temp2 + + if FR >= FF and FR >= RR and FR >= RF: + # we have FR - leave alone + sense1 = "+" + sense2 = "-" + assemblyList = ((assemblyList, "+"), (nextVertex, "+")) + seqleft = sequence[-20:] + seqright = contigDict[nextVertex][:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset) + print outstring + print >> outpathfile, outstring + sequence += contigDict[nextVertex][offset:] + else: + sequence += "NN" + contigDict[nextVertex] + + currentSense = "+" + elif FF >= RR and FF >= RF: + # we have FF - flip seqright + sense1 = "+" + sense2 = "+" + assemblyList = ((assemblyList, "+"), (nextVertex, "-")) + seqleft = sequence[-20:] + seqright = complement(contigDict[nextVertex])[:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence += complement(contigDict[nextVertex])[offset:] + else: + sequence += "NN" + complement(contigDict[nextVertex]) + + currentSense = "-" + elif RR >= RF: + # we have RR - flip seqleft + sense1 = "-" + sense2 = "-" + assemblyList = ((assemblyList, "-"), (nextVertex, "+")) + seqleft = complement(sequence)[:20] + seqright = contigDict[nextVertex][:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence = complement(sequence) + contigDict[nextVertex][offset:] + else: + sequence = complement(sequence) + "NN" + contigDict[nextVertex] + + currentSense = "+" + else: + # we have RF - flip both + sense1 = "-" + sense2 = "+" + assemblyList = ((assemblyList, "-"), (nextVertex, "-")) + seqleft = complement(sequence)[-20:] + seqright = complement(contigDict[nextVertex])[:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:] + else: + sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex]) + + currentSense = "-" + + outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList)) + print outstring + print >> outpathfile, outstring + currentVertex = nextVertex + + outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence)) + newSizeList.append(len(sequence)) + + for vertex in contigDict: + if vertex in visitedDict: + continue + + newSizeList.append(len(contigDict[vertex])) + outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex])) + + calculateN50(newSizeList, referenceMean=halfSize) + + +def calculateN50(sizeList, referenceMean=None): + if referenceMean is None: + totalSize = sum(sizeList) + referenceMean = totalSize / 2 + + sizeList.sort() + sizeList.reverse() + currentTotalLength = 0 + for size in sizeList: + if currentTotalLength + size > referenceMean: + print "#contigs", len(sizeList) + print "N50", size + break + + currentTotalLength += size + + print sizeList[:50] + + return 
referenceMean + + +def getContigsFromFile(contigFileName): + nameList = [] + origSize = [] + contigNum = 0 + currentChrom = "" + seq = "" + contigDict = {} + + try: + incontigfile = open(contigFileName) + except IOError: + print "Error opening contig file: %s" % contigFileName + return contigNum, nameList, contigDict, origSize + + for line in incontigfile: + if ">" in line: + if currentChrom !="": + nameList.append(currentChrom) + contigDict[contigNum] = seq + origSize.append(len(seq)) + contigNum += 1 + + currentChrom = line.strip().split()[0][1:] + seq = "" + else: + seq += line.strip() + + incontigfile.close() + + return contigNum, nameList, contigDict, origSize + + +def getPath(contigNum, distalPairsfile, nameList): + edgeMatrix = EdgeMatrix(contigNum) + + print len(edgeMatrix.edgeArray) + try: + print len(edgeMatrix.edgeArray[50]) + except IndexError: + pass + + print "processing distal pairs" + verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList) + + willVisitList = verticesWithEdges.keys() + willVisitList.sort() + print "visiting %d vertices" % len(willVisitList) + + print "cleaning up graph of edges with weight 1" + verticesToDelete = [] + for rindex in willVisitList: + if rindex not in notSoloDict: + cindex = vertexEdges[rindex][0] + edgeMatrix.edgeArray[rindex][cindex] = 0 + edgeMatrix.edgeArray[cindex][rindex] = 0 + verticesToDelete.append(rindex) + + for vertex in verticesToDelete: + willVisitList.remove(vertex) + + print "%d 1-edges zeroed out" % len(verticesToDelete) + + zeroedEdge = 0 + print "visiting %d vertices" % len(willVisitList) + + leafList = [] + print "picking top 2 edges per vertex - zero out others" + for rindex in willVisitList: + vertices = vertexEdges[rindex] + rEdges = [] + for avertex in vertices: + if avertex in willVisitList: + rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex)) + + if len(rEdges) > 2: + rEdges.sort() + rEdges.reverse() + zeroedEdge += len(rEdges[2:]) + for (weight, cindex) in rEdges[2:]: + edgeMatrix.edgeArray[rindex][cindex] = 0 + edgeMatrix.edgeArray[cindex][rindex] = 0 + elif len(rEdges) == 1: + if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1: + leafList.append(rindex) + + print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge + pathList, visitedDict = traverseGraph(leafList, edgeMatrix) + + return pathList, edgeSenseDict, visitedDict + + +def traverseGraph(leafList, edgeMatrix): + pathList = [] + visitedDict = {} + leafList.sort() + print "traveling through the graph" + for rindex in leafList: + if visitedDict.has_key(rindex): + pass + else: + path = edgeMatrix.visitLink(rindex) + if len(path) > 1: + for vertex in path: + visitedDict[vertex] = "" + + print path + pathList.append(path) + + return pathList, visitedDict + + +def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList): + contigToRowLookup = {} + verticesWithEdges = {} + vertexEdges = {} + notSoloDict = {} + edgeSenseDict = {} + + distalPairs = open(distalPairsfilename) + for line in distalPairs: + if line[0] == "#": + continue + + fields = line.strip().split() + contA = "chr%s" % fields[1] + try: + contig1 = contigToRowLookup[contA] + except KeyError: + try: + contig1 = nameList.index(contA) + contigToRowLookup[contA] = contig1 + except ValueError: + print "problem with end1: ", line + continue + + sense1 = fields[3] + + contB = "chr%s" % fields[4] + try: + contig2 = contigToRowLookup[contB] + except KeyError: + try: + contig2 = nameList.index(contB) + 
contigToRowLookup[contB] = contig2 + except ValueError: + print "problem with end2: ", line + continue + + sense2 = fields[6] + + edgeMatrix.edgeArray[contig1][contig2] += 1 + edgeMatrix.edgeArray[contig2][contig1] += 1 + verticesWithEdges[contig1] = "" + verticesWithEdges[contig2] = "" + if (contig1, contig2) in edgeSenseDict: + edgeSenseDict[contig1, contig2].append((sense1, sense2)) + elif (contig2, contig1) in edgeSenseDict: + edgeSenseDict[contig2, contig1].append((sense2, sense1)) + else: + edgeSenseDict[contig1, contig2] = [(sense1, sense2)] + + if contig1 in vertexEdges: + if contig2 not in vertexEdges[contig1]: + vertexEdges[contig1].append(contig2) + else: + vertexEdges[contig1] = [contig2] + + if contig2 in vertexEdges: + if contig1 not in vertexEdges[contig2]: + vertexEdges[contig2].append(contig1) + else: + vertexEdges[contig2] = [contig1] + + if edgeMatrix.edgeArray[contig1][contig2] > 1: + notSoloDict[contig1] = "" + notSoloDict[contig2] = "" + + distalPairs.close() + + return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict + + +class EdgeMatrix: + """ Describes a sparse matrix to hold edge data. + """ + + def __init__(self, dimension): + self.dimension = dimension + self.edgeArray = zeros((self.dimension, self.dimension), int16) + + + def visitLink(self, fromVertex, ignoreList=[]): + returnPath = [fromVertex] + toVertex = [] + for toindex in xrange(self.dimension): + if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList: + toVertex.append(toindex) + + for vertex in toVertex: + if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]: + self.edgeArray[fromVertex][vertex] = 0 + self.edgeArray[vertex][fromVertex] = 0 + return returnPath + [vertex] + else: + self.edgeArray[fromVertex][vertex] = 0 + try: + return returnPath + self.visitLink(vertex, returnPath) + except IOError: + return returnPath + [vertex] + return [] + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnapath/.svn/tmp/processvelvet.py.tmp b/rnapath/.svn/tmp/processvelvet.py.tmp new file mode 100644 index 0000000..0af43d1 --- /dev/null +++ b/rnapath/.svn/tmp/processvelvet.py.tmp @@ -0,0 +1,110 @@ +import sys +import optparse + +print "%prog: version 1.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--prefix", dest="contigPrefix") + parser.add_option("--filter", dest="filterFileName") + parser.add_option("--min", type="int", dest="minSize") + parser.add_option("--keepcov", action="store_true", dest="keepCoverage") + parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(2) + + infile = args[0] + outfile = args[1] + + processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage) + + +def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False): + infile = open(inFileName) + outfile = open(outFileName, "w") + filterList = getFilterList(filterFileName) + + node = {"contigPrefix": contigPrefix, + "completeID": "", + "currentSeq": "" + } + + counts = {"acceptedSize": 0, + "nSize": 0, + "contigsAccepted": 0, + "filteredSize": 0 + } + + for line in infile: + if ">NODE" in line: + writeNode(outfile, node, filterList, 
counts, minSize, keepCoverage) + node["completeID"] = line.strip()[1:] + node["currentSeq"] = "" + else: + node["currentSeq"] += line + + writeNode(outfile, node, filterList, counts, minSize, keepCoverage) + + infile.close() + outfile.close() + + print "%d contigs accepted" % counts["contigsAccepted"] + print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"]) + print "%d bp accepted" % counts["acceptedSize"] + print "%d bp accepted N" % counts["nSize"] + print "%d bp filtered\n" % counts["filteredSize"] + + +def getFilterList(filterFileName=""): + filterList = [] + + if filterFileName: + try: + filterFile = open(filterFileName) + except IOError: + return filterList + + for line in filterFile: + if "NODE" in line: + fields = line.strip().split() + try: + exclude = fields[9] + except IndexError: + continue + + if exclude not in filterList: + filterList.append(exclude) + + filterFile.close() + + return filterList + + +def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False): + completeID = node["completeID"] + currentSeq = node["currentSeq"] + sequenceLength = len(currentSeq) - currentSeq.count("\n") + if len(completeID) > 5 and completeID not in filterList: + fields = completeID.split("_") + newID = fields[1] + if keepCoverage: + newID = fields[1] + "_" + fields[-1].strip() + + if sequenceLength >= minSize: + outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq)) + counts["acceptedSize"] += sequenceLength + counts["nSize"] += currentSeq.count("N") + counts["contigsAccepted"] += 1 + else: + counts["filteredSize"] += sequenceLength + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnapath/RNAPATH.py b/rnapath/RNAPATH.py new file mode 100644 index 0000000..86f61cd --- /dev/null +++ b/rnapath/RNAPATH.py @@ -0,0 +1,468 @@ +import sys +import optparse +import string +from numpy import zeros, int16 + +versionString = "%s: version 0.95" % sys.argv[0] +print versionString + + +def compNT(nt): + """ returns the complementary basepair to base nt + """ + compDict = { "A": "T", + "T": "A", + "G": "C", + "C": "G", + "S": "S", + "W": "W", + "R": "Y", + "Y": "R", + "M": "K", + "K": "M", + "H": "D", + "D": "H", + "B": "V", + "V": "B", + "N": "N", + "a": "t", + "t": "a", + "g": "c", + "c": "g", + "n": "n", + "z": "z" + } + + return compDict.get(nt, "N") + + +def complement(sequence, length=-1): + """ returns the complement of the sequence. 
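+        Note: despite the name, the full-sequence case reverses the sequence
+        before complementing, so the value returned is the reverse complement.
+        When 0 <= length < len(sequence), only the last `length` bases are
+        reverse-complemented; bases not in the lookup table come back as "N".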
+ """ + newSeq = "" + + seqLength = len(sequence) + + if length == seqLength or length < 0: + seqList = list(sequence) + seqList.reverse() + return "".join(map(compNT, seqList)) + + #TODO: this seems to want to deal with case where length is more than + # sequence length except that a negative index on a sequence is fine + # index will only be overrun if length is negative but that case is + # handled above + for index in range(seqLength - 1,seqLength - length - 1, -1): + try: + newSeq += compNT(sequence[index]) + except: + newSeq += "N" + + return newSeq + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--prefix", dest="pathPrefix") + parser.add_option("--overlap", type="int", dest="overlap") + parser.set_defaults(pathPrefix="RNAPATH", overlap=30) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 4: + print usage + sys.exit(0) + + incontigfilename = args[0] + distalPairsfile = args[1] + outpathfilename = args[2] + outcontigfilename = args[3] + + rnaPath(incontigfilename, distalPairsfile, outpathfilename, + outcontigfilename, options.pathPrefix, options.overlap) + + +def rnaPath(incontigfilename, distalPairsfile, outpathfilename, + outcontigfilename, pathPrefix="RNAPATH", overlap=30): + + outpathfile = open(outpathfilename, "w") + + outheader = "#settings: %s" % " ".join(sys.argv) + print outheader + print >> outpathfile, outheader + + contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename) + halfSize = calculateN50(origSize) + print "building the adjacency graph" + pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList) + + print "found %d paths" % len(pathList) + + newSizeList = [] + pathID = 0 + outcontigfile = open(outcontigfilename, "w") + for path in pathList: + pathID += 1 + outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path))) + vertexNameList = [] + for vertex in path: + vertexNameList.append(nameList[vertex]) + pathDescription = string.join(vertexNameList, ",") + + print >> outpathfile, pathDescription + currentVertex = path[0] + currentSense = "+" + assemblyList = currentVertex + sequence = contigDict[currentVertex] + for nextVertex in path[1:]: + if (currentVertex, nextVertex) in edgeSenseDict: + senseList = edgeSenseDict[currentVertex, nextVertex] + FR = senseList.count(("+", "-")) + RF = senseList.count(("-", "+")) + else: + senseList = edgeSenseDict[nextVertex, currentVertex] + # flip + FR = senseList.count(("-", "+")) + RF = senseList.count(("+", "-")) + + FF = senseList.count(("+", "+")) + RR = senseList.count(("-", "-")) + if currentSense == "-": + # we had flipped the upstream piece! 
Must flip again + temp1 = FR + temp2 = FF + FR = RR + FF = RF + RR = temp1 + RF = temp2 + + if FR >= FF and FR >= RR and FR >= RF: + # we have FR - leave alone + sense1 = "+" + sense2 = "-" + assemblyList = ((assemblyList, "+"), (nextVertex, "+")) + seqleft = sequence[-20:] + seqright = contigDict[nextVertex][:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset) + print outstring + print >> outpathfile, outstring + sequence += contigDict[nextVertex][offset:] + else: + sequence += "NN" + contigDict[nextVertex] + + currentSense = "+" + elif FF >= RR and FF >= RF: + # we have FF - flip seqright + sense1 = "+" + sense2 = "+" + assemblyList = ((assemblyList, "+"), (nextVertex, "-")) + seqleft = sequence[-20:] + seqright = complement(contigDict[nextVertex])[:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence += complement(contigDict[nextVertex])[offset:] + else: + sequence += "NN" + complement(contigDict[nextVertex]) + + currentSense = "-" + elif RR >= RF: + # we have RR - flip seqleft + sense1 = "-" + sense2 = "-" + assemblyList = ((assemblyList, "-"), (nextVertex, "+")) + seqleft = complement(sequence)[:20] + seqright = contigDict[nextVertex][:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence = complement(sequence) + contigDict[nextVertex][offset:] + else: + sequence = complement(sequence) + "NN" + contigDict[nextVertex] + + currentSense = "+" + else: + # we have RF - flip both + sense1 = "-" + sense2 = "+" + assemblyList = ((assemblyList, "-"), (nextVertex, "-")) + seqleft = complement(sequence)[-20:] + seqright = complement(contigDict[nextVertex])[:overlap] + if seqleft in seqright: + pos = seqright.index(seqleft) + offset = pos + 20 + outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset) + print outstring + print >> outpathfile, outstring + sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:] + else: + sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex]) + + currentSense = "-" + + outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList)) + print outstring + print >> outpathfile, outstring + currentVertex = nextVertex + + outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence)) + newSizeList.append(len(sequence)) + + for vertex in contigDict: + if vertex in visitedDict: + continue + + newSizeList.append(len(contigDict[vertex])) + outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex])) + + calculateN50(newSizeList, referenceMean=halfSize) + + +def calculateN50(sizeList, referenceMean=None): + if referenceMean is None: + totalSize = sum(sizeList) + referenceMean = totalSize / 2 + + sizeList.sort() + sizeList.reverse() + currentTotalLength = 0 + for size in sizeList: + if currentTotalLength + size > referenceMean: + print "#contigs", len(sizeList) + print "N50", size + break + + currentTotalLength += size + + print sizeList[:50] + + return 
referenceMean + + +def getContigsFromFile(contigFileName): + nameList = [] + origSize = [] + contigNum = 0 + currentChrom = "" + seq = "" + contigDict = {} + + try: + incontigfile = open(contigFileName) + except IOError: + print "Error opening contig file: %s" % contigFileName + return contigNum, nameList, contigDict, origSize + + for line in incontigfile: + if ">" in line: + if currentChrom !="": + nameList.append(currentChrom) + contigDict[contigNum] = seq + origSize.append(len(seq)) + contigNum += 1 + + currentChrom = line.strip().split()[0][1:] + seq = "" + else: + seq += line.strip() + + incontigfile.close() + + return contigNum, nameList, contigDict, origSize + + +def getPath(contigNum, distalPairsfile, nameList): + edgeMatrix = EdgeMatrix(contigNum) + + print len(edgeMatrix.edgeArray) + try: + print len(edgeMatrix.edgeArray[50]) + except IndexError: + pass + + print "processing distal pairs" + verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList) + + willVisitList = verticesWithEdges.keys() + willVisitList.sort() + print "visiting %d vertices" % len(willVisitList) + + print "cleaning up graph of edges with weight 1" + verticesToDelete = [] + for rindex in willVisitList: + if rindex not in notSoloDict: + cindex = vertexEdges[rindex][0] + edgeMatrix.edgeArray[rindex][cindex] = 0 + edgeMatrix.edgeArray[cindex][rindex] = 0 + verticesToDelete.append(rindex) + + for vertex in verticesToDelete: + willVisitList.remove(vertex) + + print "%d 1-edges zeroed out" % len(verticesToDelete) + + zeroedEdge = 0 + print "visiting %d vertices" % len(willVisitList) + + leafList = [] + print "picking top 2 edges per vertex - zero out others" + for rindex in willVisitList: + vertices = vertexEdges[rindex] + rEdges = [] + for avertex in vertices: + if avertex in willVisitList: + rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex)) + + if len(rEdges) > 2: + rEdges.sort() + rEdges.reverse() + zeroedEdge += len(rEdges[2:]) + for (weight, cindex) in rEdges[2:]: + edgeMatrix.edgeArray[rindex][cindex] = 0 + edgeMatrix.edgeArray[cindex][rindex] = 0 + elif len(rEdges) == 1: + if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1: + leafList.append(rindex) + + print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge + pathList, visitedDict = traverseGraph(leafList, edgeMatrix) + + return pathList, edgeSenseDict, visitedDict + + +def traverseGraph(leafList, edgeMatrix): + pathList = [] + visitedDict = {} + leafList.sort() + print "traveling through the graph" + for rindex in leafList: + if visitedDict.has_key(rindex): + pass + else: + path = edgeMatrix.visitLink(rindex) + if len(path) > 1: + for vertex in path: + visitedDict[vertex] = "" + + print path + pathList.append(path) + + return pathList, visitedDict + + +def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList): + contigToRowLookup = {} + verticesWithEdges = {} + vertexEdges = {} + notSoloDict = {} + edgeSenseDict = {} + + distalPairs = open(distalPairsfilename) + for line in distalPairs: + if line[0] == "#": + continue + + fields = line.strip().split() + contA = "chr%s" % fields[1] + try: + contig1 = contigToRowLookup[contA] + except KeyError: + try: + contig1 = nameList.index(contA) + contigToRowLookup[contA] = contig1 + except ValueError: + print "problem with end1: ", line + continue + + sense1 = fields[3] + + contB = "chr%s" % fields[4] + try: + contig2 = contigToRowLookup[contB] + except KeyError: + try: + contig2 = nameList.index(contB) + 
contigToRowLookup[contB] = contig2 + except ValueError: + print "problem with end2: ", line + continue + + sense2 = fields[6] + + edgeMatrix.edgeArray[contig1][contig2] += 1 + edgeMatrix.edgeArray[contig2][contig1] += 1 + verticesWithEdges[contig1] = "" + verticesWithEdges[contig2] = "" + if (contig1, contig2) in edgeSenseDict: + edgeSenseDict[contig1, contig2].append((sense1, sense2)) + elif (contig2, contig1) in edgeSenseDict: + edgeSenseDict[contig2, contig1].append((sense2, sense1)) + else: + edgeSenseDict[contig1, contig2] = [(sense1, sense2)] + + if contig1 in vertexEdges: + if contig2 not in vertexEdges[contig1]: + vertexEdges[contig1].append(contig2) + else: + vertexEdges[contig1] = [contig2] + + if contig2 in vertexEdges: + if contig1 not in vertexEdges[contig2]: + vertexEdges[contig2].append(contig1) + else: + vertexEdges[contig2] = [contig1] + + if edgeMatrix.edgeArray[contig1][contig2] > 1: + notSoloDict[contig1] = "" + notSoloDict[contig2] = "" + + distalPairs.close() + + return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict + + +class EdgeMatrix: + """ Describes a sparse matrix to hold edge data. + """ + + def __init__(self, dimension): + self.dimension = dimension + self.edgeArray = zeros((self.dimension, self.dimension), int16) + + + def visitLink(self, fromVertex, ignoreList=[]): + returnPath = [fromVertex] + toVertex = [] + for toindex in xrange(self.dimension): + if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList: + toVertex.append(toindex) + + for vertex in toVertex: + if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]: + self.edgeArray[fromVertex][vertex] = 0 + self.edgeArray[vertex][fromVertex] = 0 + return returnPath + [vertex] + else: + self.edgeArray[fromVertex][vertex] = 0 + try: + return returnPath + self.visitLink(vertex, returnPath) + except IOError: + return returnPath + [vertex] + return [] + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/rnapath/__init__.py b/rnapath/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rnapath/processvelvet.py b/rnapath/processvelvet.py new file mode 100644 index 0000000..0af43d1 --- /dev/null +++ b/rnapath/processvelvet.py @@ -0,0 +1,110 @@ +import sys +import optparse + +print "%prog: version 1.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--prefix", dest="contigPrefix") + parser.add_option("--filter", dest="filterFileName") + parser.add_option("--min", type="int", dest="minSize") + parser.add_option("--keepcov", action="store_true", dest="keepCoverage") + parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(2) + + infile = args[0] + outfile = args[1] + + processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage) + + +def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False): + infile = open(inFileName) + outfile = open(outFileName, "w") + filterList = getFilterList(filterFileName) + + node = {"contigPrefix": contigPrefix, + "completeID": "", + "currentSeq": "" + } + + counts = {"acceptedSize": 0, + "nSize": 0, + "contigsAccepted": 0, + "filteredSize": 0 + } + + for line in infile: + 
if ">NODE" in line: + writeNode(outfile, node, filterList, counts, minSize, keepCoverage) + node["completeID"] = line.strip()[1:] + node["currentSeq"] = "" + else: + node["currentSeq"] += line + + writeNode(outfile, node, filterList, counts, minSize, keepCoverage) + + infile.close() + outfile.close() + + print "%d contigs accepted" % counts["contigsAccepted"] + print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"]) + print "%d bp accepted" % counts["acceptedSize"] + print "%d bp accepted N" % counts["nSize"] + print "%d bp filtered\n" % counts["filteredSize"] + + +def getFilterList(filterFileName=""): + filterList = [] + + if filterFileName: + try: + filterFile = open(filterFileName) + except IOError: + return filterList + + for line in filterFile: + if "NODE" in line: + fields = line.strip().split() + try: + exclude = fields[9] + except IndexError: + continue + + if exclude not in filterList: + filterList.append(exclude) + + filterFile.close() + + return filterList + + +def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False): + completeID = node["completeID"] + currentSeq = node["currentSeq"] + sequenceLength = len(currentSeq) - currentSeq.count("\n") + if len(completeID) > 5 and completeID not in filterList: + fields = completeID.split("_") + newID = fields[1] + if keepCoverage: + newID = fields[1] + "_" + fields[-1].strip() + + if sequenceLength >= minSize: + outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq)) + counts["acceptedSize"] += sequenceLength + counts["nSize"] += currentSeq.count("N") + counts["contigsAccepted"] += 1 + else: + counts["filteredSize"] += sequenceLength + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/scatterfields.py b/scatterfields.py new file mode 100755 index 0000000..60649ff --- /dev/null +++ b/scatterfields.py @@ -0,0 +1,297 @@ +""" + usage: python scatterfields.py infilename xaxisLabel xField yaxisLabel yField outImageName [--xmin xMin] [--ymin yMin] + [--xmax xMax] [--ymax yMax] [--doLogF1] [--doLogF2] [--arcsinh] [--order polyOrder] [--base logBase] + [--markGenes geneFile] [--markfold times] [--noregression] [--large] [--markdiag] [--title text] [--verbose] + + Do a scatter plot of 2 fields from an input file. + fields are counted from 0. 
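+    example (with placeholder file names):
+        python scatterfields.py expression.txt control 2 treated 3 scatter.png --doLogF1 --doLogF2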
+ use [-order polyOrder] to specify polynomial fits > 1 + Supports very rudimentary compound fields for X value + using python's lambda functions (omit the keyword lambda) +""" + +import matplotlib +matplotlib.use("Agg") + +from pylab import * +import math, cmath +import sys +import optparse + +alphaVal = 0.5 + +print "%prog: version 3.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = __doc__ + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--xmin", type="float", dest="forcexmin") + parser.add_option("--ymin", type="float", dest="forceymin") + parser.add_option("--xmax", type="float", dest="forcexmax") + parser.add_option("--ymax", type="float", dest="forceymax") + parser.add_option("--doLogF1", action="store_true", dest="doLogF1") + parser.add_option("--doLogF2", action="store_true", dest="doLogF2") + parser.add_option("--arcsinh", action="store_true", dest="doArcsinh") + parser.add_option("--order", type="int", dest="fitOrder") + parser.add_option("--base", type="int", dest="base") + parser.add_option("--markGenes", dest="markFile") + parser.add_option("--markfold", type="float", dest="foldChange") + parser.add_option("--noregression", action="store_false", dest="doRegression") + parser.add_option("--large", action="store_true", dest="plotLarge") + parser.add_option("--markdiag", action="store_true", dest="markDiag") + parser.add_option("--title", type="int", dest="figtitle") + parser.add_option("--verbose", action="store_true", dest="verbose") + parser.set_defaults(forcexmin=0.0, forceymin=0.0, forcexmax=-1, forceymax=-1, doLogF1=False, + doLogF2=False, doArcsinh=False, fitOrder=1, base=10, markFile=None, + foldChange=None, doRegression=True, plotLarge=False, markDiag=False, + figtitle="", verbose=False) + + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 6: + print usage + sys.exit(1) + + infile = open(args[0]) + xaxis = args[1] + xField = args[2] + yaxis = args[3] + yField = int(args[4]) + outfilename = args[5] + + scatterfields(infile, xaxis, xField, yaxis, yField, outfilename, options.forcexmin, + options.forceymin, options.forcexmax, options.forceymax, options.doLogF1, + options.doLogF2, options.doArcsinh, options.fitOrder, options.base, + options.markFile, options.foldChange, options.doRegression, options.plotLarge, + options.markDiag, options.figtitle, options.verbose) + + +def scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, forcexmin=0.0, forceymin=0.0, + forcexmax=-1, forceymax=-1, doLogF1=False, doLogF2=False, doArcsinh=False, fitOrder=1, + base=10, markFile=None, foldChange=None, doRegression=True, plotLarge=False, + markDiag=False, figtitle="", verbose=False): + + infile = open(infilename) + compoundField = False + try: + xField = int(xField) + except: + try: + compoundOp = "lambda %s" % xField + operator = eval(compoundOp) + compoundField = True + print "compound field %s" % xField + except: + pass + + if not compoundField: + print "expression %s not supported" % xField + sys.exit(1) + + markedGenes = [] + marking = False + if markFile is not None: + for line in markFile: + try: + markedGenes.append(line.strip().split()[0].upper()) + except: + markedGenes.append(line.strip().upper()) + + markFile.close() + marking = True + + markFold = False + if foldChange is not None: + markFold = True + + newscores = [] + oldscores = [] + + markednewscores = [] + markedoldscores = [] + + markedfoldnewscores = [] + markedfoldoldscores = [] + + ymax = 0. + xmax = 0. 
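+    # Parse one gene per line.  The x value comes from xField: either a plain
+    # 0-based column index, or a compound expression evaluated as a lambda over
+    # the split line, e.g. passing "f: float(f[2]) + float(f[3])" as xField
+    # (an illustrative expression; any lambda body over the fields works).
+    # Raw scores are first checked against the fold-change threshold, then
+    # arcsinh- or log-transformed, and binned into the plain, marked-gene, and
+    # over-fold lists that are plotted below.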
+ for line in infile: + fields = line.strip().split() + gene = fields[0] + try: + if compoundField: + score = operator(fields) + else: + score = float(fields[xField]) + + newscore = float(fields[yField]) + except: + continue + + foldMarkThisScore = False + if markFold: + tempscore = score + if tempscore == 0: + tempscore = 0.03 + + tempratio = newscore / tempscore + if tempratio == 0: + tempratio2 = tempscore / 0.03 + else: + tempratio2 = 1. / tempratio + + if tempratio > foldChange or tempratio2 > foldChange: + foldMarkThisScore = True + + if doArcsinh: + score = abs(cmath.asinh(score)) + elif doLogF1: + try: + score = math.log(score, base) + except: + score = forcexmin + + if score > xmax: + xmax = score + + if doArcsinh: + newscore = abs(cmath.asinh(newscore)) + elif doLogF2: + try: + newscore = math.log(newscore, base) + except: + newscore = forceymin + + if newscore > ymax: + ymax = newscore + + oldscores.append(score) + newscores.append(newscore) + if foldMarkThisScore: + markedfoldoldscores.append(score) + markedfoldnewscores.append(newscore) + if marking and gene.upper() not in markedGenes: + print gene, score, newscore, "unmarked" + + if gene.upper() in markedGenes: + print gene, score, newscore, "overfold" + + if verbose: + print len(markedfoldoldscores), line.strip() + + if gene.upper() in markedGenes: + if not foldMarkThisScore: + print gene, score, newscore + + markedoldscores.append(score) + markednewscores.append(newscore) + + print score, newscore + print fields + + if plotLarge and markFold: + plot(oldscores, newscores, "^", markersize=10., color="0.75", alpha=alphaVal) + elif plotLarge: + plot(oldscores, newscores, "b^", markersize=10., alpha=alphaVal) + elif markFold: + plot(oldscores, newscores, ",", color="0.75", alpha=alphaVal) + else: + plot(oldscores, newscores, "b,", alpha=alphaVal) + + if len(markedfoldoldscores) > 0: + if plotLarge: + plot(markedfoldoldscores, markedfoldnewscores, "b^", markersize=10., alpha=alphaVal) + else: + plot(markedfoldoldscores, markedfoldnewscores, "b,", alpha=alphaVal) + + if len(markedoldscores) > 0: + if plotLarge: + plot(markedoldscores, markednewscores, "r^", color="red", markersize=10., alpha=alphaVal) + else: + plot(markedoldscores, markednewscores, ".", color="red", markersize=4., alpha=alphaVal) + + fitvalues = polyfit(oldscores, newscores, fitOrder) + print fitvalues + print len(oldscores) + + meanObserved = float(sum(newscores)) / len(newscores) + if len(fitvalues) == 2: + predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores] + else: + predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores] + + SSt = 0. + SSe = 0. + + for index in range(len(newscores)): + SSt += (newscores[index] - meanObserved) ** 2 + SSe += (newscores[index] - predicted[index]) ** 2 + + rSquared = 1. 
- SSe / SSt + print "R**2 = %f" % rSquared + + oldscores.sort() + if len(fitvalues) == 2: + predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores] + else: + predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores] + + if doRegression: + plot(oldscores, predicted, "-k", linewidth=2) + + if figtitle == "": + figtitle = "%s vs %s (R^2: %.2f)" % (yaxis, xaxis, rSquared) + + title(figtitle) + + if markDiag: + min = forcexmin + if forceymin < min: + min = forceymin + + max = xmax + if ymax > max: + max = ymax + + if forcexmax > max: + max = forcexmax + + if forceymax > max: + max = forceymax + + plot([min,max], [min,max], "-g", linewidth=2) + + print forcexmin, forceymin + + if doLogF2: + ylabel("log%s(%s)" % (str(base), yaxis)) + else: + ylabel(yaxis) + + if doLogF1: + xlabel("log%s(%s)" % (str(base), xaxis)) + else: + xlabel(xaxis) + + if xmax > 0: + xlim(forcexmin - 0.05, xmax) + + if ymax > 0: + ylim(forceymin - 0.05, ymax) + + if forcexmax > 0 and forceymax > 0: + xlim(forcexmin - 0.05, forcexmax) + ylim(forceymin - 0.05, forceymax) + + gca().get_xaxis().tick_bottom() + gca().get_yaxis().tick_left() + + savefig(outfilename, dpi=100) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/siteintersects.py b/siteintersects.py new file mode 100755 index 0000000..ba0f1cd --- /dev/null +++ b/siteintersects.py @@ -0,0 +1,147 @@ +# +# siteintersects.py +# ENRAGE +# + +import sys + +print "%s: version 2.0" % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 4: + print "usage: python %s sitefile1 sitefile2 outfile [--reject rejectfile1 rejectfile2] [--expanded]" % argv[0] + sys.exit(1) + + sitefilename1 = argv[1] + sitefilename2 = argv[2] + outfilename = argv[3] + + doReject = False + if "--reject" in sys.argv: + reject1file = open(sys.argv[sys.argv.index("-reject") + 1], "w") + reject2file = open(sys.argv[sys.argv.index("-reject") + 2], "w") + doReject = True + + doExpanded = False + if "--expanded" in sys.argv: + doExpanded = True + + siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded) + + +def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False): + + siteDict = {} + file1Dict = {} + + infile1count = 0 + infile = open(sitefilename1) + infile.readline() + for line in infile: + if line[0] == "#": + continue + + infile1count += 1 + fields = line.strip().split() + if doExpanded: + chrom = fields[1][3:] + start = int(fields[2]) + stop = int(fields[3]) + rest = fields[4:] + else: + (chrom, pos) = fields[0].split(":") + chrom = chrom[3:] + (start, stop) = pos.split("-") + start = int(start) + stop = int(stop) + rest = fields[1:] + + try: + siteDict[chrom].append((start, stop, rest)) + except: + siteDict[chrom] = [(start, stop, rest)] + + if doReject: + file1Dict[str((chrom, start, stop, rest))] = line + + infile.close() + + print "file1: %d" % infile1count + + infile2count = 0 + infile = open(sitefilename2) + infile.readline() + + commonSites = 0 + unique2List = [] + outfile = open(outfilename, "w") + for line in infile: + if line[0] == "#": + continue + + infile2count += 1 + fields = line.strip().split() + if doExpanded: + chrom = fields[1][3:] + start = int(fields[2]) + stop = int(fields[3]) + rest = fields[4:] + else: + (chrom, pos) = fields[0].split(":") + chrom = chrom[3:] + (start, stop) = pos.split("-") + rest = str(fields[1:]) + + start = 
int(start) + stop = int(stop) + mid = start + abs(stop - start)/2 + if chrom not in siteDict: + if doReject: + unique2List.append(line) + continue + + twoNotCommon = True + for (rstart, rstop, rline) in siteDict[chrom]: + rsize = abs(rstart - rstop) /2 + rmid = rstart + abs(rstop - rstart)/2 + if abs(mid - rmid) < rsize: + commonSites += 1 + if twoNotCommon: + outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest)) + twoNotCommon = False + + try: + if doReject: + del file1Dict[str((chrom, rstart, rstop, rline))] + except: + pass + + if doReject and twoNotCommon: + unique2List.append(line) + + outfile.close() + + print "file2: %d" % infile2count + + if doReject: + reject1file = open(reject1filename, "w") + reject2file = open(reject2filename, "w") + + for key in file1Dict: + reject1file.write(file1Dict[key]) + + for line in unique2List: + reject2file.write(line) + + reject1file.close() + reject2file.close() + + print commonSites + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/stallCategory.py b/stallCategory.py new file mode 100755 index 0000000..92cd519 --- /dev/null +++ b/stallCategory.py @@ -0,0 +1,165 @@ +# +# stallCategory.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys +import optparse + +print "%prog: version 1.1" + + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog stalledPercentFile1 stalledPercentFile2 transcriptFile [--out oufile] [--statout statoutfile] [--expression level]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--out", dest="outFileName") + parser.add_option("--statout", dest="statOutFileName") + parser.add_option("--expression", type="float", dest="expressionLevel") + parser.set_defaults(outFileName=None, statOutFileName=None, expressionLevel=0.9) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + infile1 = args[1] + infile2 = args[2] + transcriptFile = args[2] + + stallCategory(infile1, infile2, transcriptFile, options.outFileName, options.statOutFileName, options.expressionLevel) + + +def stallCategory(inFile1Name, inFile2Name, transcriptFileName, outFileName=None, statOutFileName=None, expressionLevel=0.9): + + infile1 = open(inFile1Name) + infile2 = open(inFile2Name) + transcriptFile = open(transcriptFileName) + + writeOut = False + if outFileName is not None: + outfile = open(outFileName, "w") + outfile.write("gene\texpression\tratio1\tpromAmount1\ttotal1\trestRPKM1\tratio2\tpromAmount2\ttotal2\trestRPKM2\n") + writeOut = True + + statWriteOut = False + if statOutFileName is not None: + statoutfile = open(statOutFileName, "w") + statoutfile.write("ExpressionR1R2Stalled1Stalled2\tCount\n") + statWriteOut = True + + dictOne = {} + dictTwo = {} + expressionDict = {} + + for line in infile1: + if "short" in line: + continue + + fields = line.strip().split() + promAmount = float(fields[4]) + float(fields[5]) + genelen = float(fields[3])/100 + total = float(fields[2]) + if total < 0.1: + total = 0.1 + + restRPKM = (total * (1. 
- promAmount/100.))/ (genelen - 0.6) + ratio = float(fields[-1]) + dictOne[fields[1]] = (ratio, promAmount, total, restRPKM) + + for line in infile2: + if "short" in line: + continue + + fields = line.strip().split() + promAmount = float(fields[4]) + float(fields[5]) + genelen = float(fields[3])/100 + if promAmount == 0.: + promAmount = 0.1 + + total = float(fields[2]) + if total < 0.1: + total = 0.1 + + restRPKM = (total * (1. - promAmount/100.))/ (genelen - 0.6) + ratio = float(fields[-1]) + dictTwo[fields[1]] = (ratio, promAmount, total, restRPKM) + + for line in transcriptFile: + (gene, transc, transcpercell) = line.strip().split() + expressionDict[gene] = float(transcpercell) + + categoryList = [] + categoryDict = {} + for atype in ["HH", "HL", "LH", "LL"]: + for expression in ["E", "N"]: + for cat1 in ["Y", "N"]: + for cat2 in ["Y", "N"]: + category = expression + cat1 + cat2 + atype + categoryList.append(category) + categoryDict[category] = [] + + for gene in dictOne: + if gene not in expressionDict: + if writeOut: + print "%s is not in expressionDict - skipping" % gene + + continue + + expression = expressionDict[gene] + (ratio1, promAmount1, total1, restRPKM1) = dictOne[gene] + (ratio2, promAmount2, total2, restRPKM2) = dictTwo[gene] + + if expression > expressionLevel: + category = "E" + else: + category = "N" + + if total1 > 5.0: + category += "Y" + else: + category += "N" + + if total2 > 5.0: + category += "Y" + else: + category += "N" + + if ratio1 > 15: + category += "H" + else: + category += "L" + + if ratio2 > 15: + category += "H" + else: + category += "L" + + categoryDict[category].append(gene) + if writeOut: + outfile.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category) +) + else: + print "%s %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %s" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category) + + if writeOut: + outfile.close() + + for category in categoryList: + if statWriteOut: + statoutfile.write("%s\t%d\n" % (category, len(categoryDict[category]))) + else: + print "%s %d" % (category, len(categoryDict[category])) + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/test/testAnalyzeGO.py b/test/testAnalyzeGO.py new file mode 100644 index 0000000..dacf4a2 --- /dev/null +++ b/test/testAnalyzeGO.py @@ -0,0 +1,84 @@ +''' +Created on Aug 26, 2010 + +@author: sau +''' +import unittest +import os +from Erange import analyzego + + +class TestAnalyzeGO(unittest.TestCase): + genome = "celegans" + prefix = "testGO" + inFileName = "testAnayzeGOInput.txt" + + def setUp(self): + infile = open(self.inFileName, "w") + infile.close() + + + def tearDown(self): + try: + os.remove(self.inFileName) + except OSError: + pass + + try: + os.remove("%s.gostat" % self.prefix) + except OSError: + pass + + try: + os.remove("%s.gozscore" % self.prefix) + except OSError: + pass + + try: + os.remove("%s.gosig" % self.prefix) + except OSError: + pass + + + #TODO: write more tests + def testAnalyzeGO(self): + geneInfoList = [] + analyzego.analyzeGO(self.genome, geneInfoList, self.prefix) + self.assertRaises(IOError, open, "%s.gostat" % self.prefix, "r") + self.assertRaises(IOError, open, "%s.gozscore" % self.prefix, "r") + self.assertRaises(IOError, open, "%s.gosig" % self.prefix, "r") + + geneInfoList = ["worm\tgeneID"] + analyzego.analyzeGO(self.genome, geneInfoList, self.prefix) 
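+        # With a non-empty geneInfoList, analyzeGO is expected to have written
+        # the three output files (<prefix>.gostat, .gozscore, and .gosig);
+        # read them back here and rely on tearDown() to remove them.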
+ statfile = open("%s.gostat" % self.prefix, "r") + stats = statfile.readlines() + print len(stats) + + statfile.close() + scorefile = open("%s.gozscore" % self.prefix, "r") + scores = scorefile.readlines() + print len(scores) + + scorefile.close() + sigfile = open("%s.gosig" % self.prefix, "r") + sigs = sigfile.readlines() + print len(sigs) + + sigfile.close() + + + def testMain(self): + argv = ["analyzego", self.genome, self.inFileName, self.prefix] + analyzego.main(argv) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestAnalyzeGO)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testChkSNP_input.txt b/test/testChkSNP_input.txt new file mode 100644 index 0000000..f9f36de --- /dev/null +++ b/test/testChkSNP_input.txt @@ -0,0 +1,3 @@ +# header line +foo foo chr1 691 +foo2 foo2 chr1 81752 diff --git a/test/testChksnp.py b/test/testChksnp.py new file mode 100644 index 0000000..3f2e8ae --- /dev/null +++ b/test/testChksnp.py @@ -0,0 +1,199 @@ +''' +Created on Aug 25, 2010 + +@author: sau +''' +import unittest +import string +import os +from Erange import chksnp + +dbPath = "/Users/sau/work/snpdb/hg18" + +class TestChksnp(unittest.TestCase): + """ First entries from snpDB using select func, name, start, stop from snp where chrom="1" limit 4; + unknown|rs10218492|690|691 + unknown|rs10218493|766|767 + unknown|rs10218527|789|790 + unknown|rs28853987|800|801 + + Entry from altSnpDB not in sndDB + unknown|rs17160650|81751|81752 + """ + + snpDB = "%s/dbSNP128.db" % dbPath + altSnpDB = "%s/snp129cDNA.db" % dbPath + + def setUp(self): + pass + + + def tearDown(self): + pass + + + def testChkSNPFile(self): + inputFileName = "testChkSNP_input.txt" + infile = open(inputFileName, "w") + infile.write("# header line\n") + snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t") + infile.write("%s\n" % snpEntry) + snpEntry = string.join(["foo2", "foo2", "chr1", "81752"], "\t") + infile.write("%s\n" % snpEntry) + infile.close() + + outputFileName = "testChkSNP_output.txt" + + chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName) + outfile = open(outputFileName, "r") + line = outfile.readline() + result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n" + self.assertEquals(result, line) + result = "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A\n" + line = outfile.readline() + self.assertEquals(result, line) + outfile.close() + os.remove(outputFileName) + + chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB]) + outfile = open(outputFileName, "r") + line = outfile.readline() + result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n" + self.assertEquals(result, line) + result = "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown\n" + line = outfile.readline() + self.assertEquals(result, line) + outfile.close() + + os.remove(inputFileName) + os.remove(outputFileName) + + + def testMain(self): + inputFileName = "testChkSNP_input.txt" + infile = open(inputFileName, "w") + infile.write("# header line\n") + snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t") + infile.write("%s\n" % snpEntry) + snpEntry = string.join(["foo2", "foo2", "chr1", "81752"], "\t") + infile.write("%s\n" % snpEntry) + infile.close() + + outputFileName = "testChkSNP_output.txt" + + argv = ["chksnp", self.snpDB, inputFileName, outputFileName] + chksnp.main(argv) + outfile = open(outputFileName, "r") + line = outfile.readline() + result = 
"foo\tfoo\tchr1\t691\trs10218492\tunknown\n" + self.assertEquals(result, line) + result = "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A\n" + line = outfile.readline() + self.assertEquals(result, line) + outfile.close() + os.remove(outputFileName) + + def testChkSNP(self): + snpPropertiesList = [] + dbList = [self.snpDB] + self.assertEquals({}, chksnp.chkSNP(dbList, snpPropertiesList)) + + snpPropertiesList = ["# header line"] + snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t") + snpPropertiesList.append(snpEntry) + snpEntry = string.join(["foo2", "foo2", "chr1", "81752"], "\t") + snpPropertiesList.append(snpEntry) + dbList = [self.snpDB, self.altSnpDB] + result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown", + ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"} + self.assertEquals(result, chksnp.chkSNP(dbList, snpPropertiesList)) + + + def testGetSNPLocationInfo(self): + snpPropertiesList = [] + snpEntry = string.join(["foo", "foo", "chr1", "20"], "\t") + snpPropertiesList.append(snpEntry) + snpLocationList, snpDict = chksnp.getSNPLocationInfo(snpPropertiesList) + self.assertEquals([("1", 20)], snpLocationList) + self.assertEquals({("1", 20): "foo\tfoo\tchr1\t20"}, snpDict) + + snpPropertiesList = ["# header line"] + snpEntry = string.join(["foo", "foo", "chr1", "20"], "\t") + snpPropertiesList.append(snpEntry) + snpLocationList, snpDict = chksnp.getSNPLocationInfo(snpPropertiesList) + self.assertEquals([("1", 20)], snpLocationList) + self.assertEquals({("1", 20): "foo\tfoo\tchr1\t20"}, snpDict) + + + def testDoNotProcessLine(self): + self.assertTrue(chksnp.doNotProcessLine("#anything")) + self.assertFalse(chksnp.doNotProcessLine("line to process")) + + + def testAnnotateSNPFromDB(self): + snpLocationList = [("1", 691), ("1", 81752)] + snpDict = {("1", 691): "foo\tfoo\tchr1\t691", + ("1", 81752): "foo2\tfoo2\tchr1\t81752"} + result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown", + ("1", 81752): "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A"} + self.assertEquals(result, chksnp.annotateSNPFromDB(snpLocationList, snpDict, self.snpDB)) + + snpLocationList = [("1", 691), ("1", 81752)] + snpDict = {("1", 691): "foo\tfoo\tchr1\t691", + ("1", 81752): "foo2\tfoo2\tchr1\t81752"} + result = {("1", 691): "foo\tfoo\tchr1\t691\tN\\A\tN\\A", + ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"} + self.assertEquals(result, chksnp.annotateSNPFromDB(snpLocationList, snpDict, self.altSnpDB)) + + + def testAnnotateSNPFromDBList(self): + snpLocationList = [] + snpDict = {} + dbList = [self.snpDB] + self.assertEquals({}, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList)) + + snpLocationList = [("1", 21)] + snpDict = {("1", 21): "foo\tfoo\tchr1\t21"} + dbList = [self.snpDB] + result = {("1", 21): "foo\tfoo\tchr1\t21\tN\\A\tN\\A"} + self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList)) + + snpLocationList = [("1", 21)] + snpDict = {("1", 21): "foo\tfoo\tchr1\t21"} + dbList = [self.snpDB] + result = {("1", 21): "foo\tfoo\tchr1\t21\tN\\A\tN\\A"} + self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=10000)) + + snpLocationList = [("1", 691)] + snpDict = {("1", 691): "foo\tfoo\tchr1\t691"} + dbList = [self.snpDB] + result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown"} + self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList)) + + snpLocationList = [("1", 691), ("1", 81752)] + snpDict = {("1", 691): "foo\tfoo\tchr1\t691", + ("1", 81752): 
"foo2\tfoo2\tchr1\t81752"} + dbList = [self.snpDB] + result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown", + ("1", 81752): "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A"} + self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList)) + + snpLocationList = [("1", 691), ("1", 81752)] + snpDict = {("1", 691): "foo\tfoo\tchr1\t691", + ("1", 81752): "foo2\tfoo2\tchr1\t81752"} + dbList = [self.snpDB, self.altSnpDB] + result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown", + ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"} + self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList)) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestChksnp)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testCommoncode.py b/test/testCommoncode.py new file mode 100644 index 0000000..1ea4f80 --- /dev/null +++ b/test/testCommoncode.py @@ -0,0 +1,555 @@ +''' +Created on Aug 30, 2010 + +@author: sau +''' +import unittest +import os +import string +from array import array +from Erange import commoncode +from cistematic.genomes import Genome + + +class TestCommoncode(unittest.TestCase): + logFile = "testLogFile" + celegansChroms = ["I", "II", "III", "IV", "V", "X", "MtDNA"] + genome = Genome("celegans") + + def setUp(self): + pass + + + def tearDown(self): + try: + os.remove(self.logFile) + except OSError: + pass + + + def testGetReverseComplement(self): + self.assertEquals("T", commoncode.getReverseComplement("A")) + self.assertEquals("A", commoncode.getReverseComplement("T")) + self.assertEquals("C", commoncode.getReverseComplement("G")) + self.assertEquals("G", commoncode.getReverseComplement("C")) + self.assertEquals("N", commoncode.getReverseComplement("N")) + self.assertRaises(KeyError, commoncode.getReverseComplement, "") + self.assertRaises(KeyError, commoncode.getReverseComplement, "B") + + + def testCountDuplicatesInList(self): + testList = [] + self.assertEquals([], commoncode.countDuplicatesInList(testList)) + + testList = [0, 1] + result = [(0, 1), (1, 1)] + self.assertEquals(result, commoncode.countDuplicatesInList(testList)) + + testList = [0, 1, 1] + result = [(0, 1), (1, 2)] + self.assertEquals(result, commoncode.countDuplicatesInList(testList)) + + testList = [0, 1, 2, 1] + result = [(0, 1), (1, 2), (2, 1)] + self.assertEquals(result, commoncode.countDuplicatesInList(testList)) + + + def testWriteLog(self): + messenger = "testMessenger" + message = "testMessage" + + commoncode.writeLog(self.logFile, messenger, message) + file = open(self.logFile) + line = file.readline() + fields = line.split() + self.assertEquals(fields[2], "[%s]" % messenger) + self.assertEquals(fields[3], message) + line = file.readline() + self.assertEquals("", line) + + messenger2 = "testMessenger2" + message2 = "testMessage2" + + commoncode.writeLog(self.logFile, messenger2, message2) + file = open(self.logFile) + line = file.readline() + fields = line.split() + self.assertEquals(fields[2], "[%s]" % messenger) + self.assertEquals(fields[3], message) + line = file.readline() + fields = line.split() + self.assertEquals(fields[2], "[%s]" % messenger2) + self.assertEquals(fields[3], message2) + line = file.readline() + self.assertEquals("", line) + + os.remove(self.logFile) + + commoncode.writeLog(self.logFile, messenger, message) + file = open(self.logFile) + line = file.readline() + fields = 
line.split() + self.assertEquals(fields[2], "[%s]" % messenger) + self.assertEquals(fields[3], message) + line = file.readline() + self.assertEquals("", line) + + os.remove(self.logFile) + + commoncode.writeLog(self.logFile, "", message) + file = open(self.logFile) + line = file.readline() + fields = line.split() + self.assertEquals(fields[2], "[]") + self.assertEquals(fields[3], message) + line = file.readline() + self.assertEquals("", line) + + os.remove(self.logFile) + + commoncode.writeLog(self.logFile, "", "") + file = open(self.logFile) + line = file.readline() + fields = line.split() + self.assertEquals(fields[2], "[]") + self.assertEquals(3, len(fields)) + line = file.readline() + self.assertEquals("", line) + + + def testGetMergedRegions(self): + testfile = open("regionTestFile", "w") + regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") + testfile.write(regionEntry) + testfile.close() + result = {"1": [(10, 20, 10)]} + self.assertEquals(result, commoncode.getMergedRegions("regionTestFile")) + os.remove("regionTestFile") + + + def testGetMergedRegionsFromList(self): + self.assertEquals({}, commoncode.getMergedRegionsFromList([])) + + regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") + regionList = [regionEntry] + result = {"1": [(10, 20, 10)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList)) + result = {"1": [(5, 25, 20)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5)) + result = {"1": [(12, 18, 6)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2)) + result = {"chr1": [(10, 20, 10)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True)) + + regionEntry = string.join(["1", "chr1:10-20", "5"], "\t") + regionList = [regionEntry] + result = {"1": [(10, 20, 10)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2)) + + regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") + regionList = [regionEntry] + regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t") + regionList.append(regionEntry) + result = {"1": [(10, 40, 30)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList)) + result = {"1": [(10, 20, 10), (15, 40, 25)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False)) + result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True)) + + regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t") + regionList = [regionEntry] + regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t") + regionList.append(regionEntry) + result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2)) + + regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t") + regionList = [regionEntry] + regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t") + regionList.append(regionEntry) + result = {"1": [(10, 20, 10), (2030, 2040, 10)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList)) + result = {"1": [(10, 2040, 2030)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000)) + result = {"1": [(10, 20, 10), (2030, 2040, 10)]} + self.assertEquals(result, 
commoncode.getMergedRegionsFromList(regionList, minHits=5)) + result = {"1": [(2030, 2040, 10)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12)) + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1)) + + regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t") + regionList = [regionEntry] + regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t") + regionList.append(regionEntry) + result = {"2": [(15, 40, 25)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12)) + self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1) + self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6)) + self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1)) + + regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t") + regionList = [regionEntry] + result = {"1": [(10, 20, 10)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList)) + result = {"1": [(10, 20, 10, 3, 40)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) + result = {"1": [("1", 10, 20, 10, 3, 40)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)) + regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t") + regionList.append(regionEntry) + result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)) + regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t") + regionList.append(regionEntry) + result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)) + regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t") + regionList.append(regionEntry) + result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)) + result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)], + "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)] + } + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False)) + + regionList = ["# comment"] + regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t") + regionList.append(regionEntry) + result = {"1": [(10, 20, 10, 3, 40)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) + regionList = ["# pvalue"] + regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t") + regionList.append(regionEntry) + result = {"1": [(10, 20, 10, 3, 40)]} + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)) + regionList = ["# readShift"] + regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t") + regionList.append(regionEntry) + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)) 
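+        # Editor's note (illustrative only; it restates behavior already pinned down by the
+        # assertions above rather than documenting anything new): a region line is a
+        # tab-delimited record of label, chrom, start, stop, score and, when keepPeak=True
+        # is used, peak position and peak height, e.g.
+        #     "1\tchr1\t10\t20\t5\t3\t40"  ->  {"1": [(10, 20, 10, 3, 40)]}
+        # Header lines beginning with "#" (such as "# pvalue" or "# readShift") tell the
+        # parser that extra columns follow the peak fields, as the surrounding
+        # "# pvalue" / "# readShift" cases exercise.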
+ regionList = ["# pvalue readShift"] + regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t") + regionList.append(regionEntry) + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True)) + self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)) + #Test fails - the header line is required if there are fields after the peak which isn't so good + #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True)) + + + def testRegionsOverlap(self): + self.assertTrue(commoncode.regionsOverlap(100, 200, 1, 300)) + self.assertTrue(commoncode.regionsOverlap(100, 200, 150, 300)) + self.assertTrue(commoncode.regionsOverlap(100, 500, 1, 300)) + self.assertTrue(commoncode.regionsOverlap(100, 200, 110, 160)) + + self.assertFalse(commoncode.regionsOverlap(100, 200, 250, 300)) + self.assertFalse(commoncode.regionsOverlap(100, 200, 1, 60)) + + self.assertFalse(commoncode.regionsOverlap(-200, -100, 1, 300)) + self.assertFalse(commoncode.regionsOverlap(100, 200, -300, -1)) + + self.assertTrue(commoncode.regionsOverlap(-200, -100, -300, -1)) + + self.assertTrue(commoncode.regionsOverlap(-100, -200, -300, -1)) + self.assertTrue(commoncode.regionsOverlap(-200, -100, -1, -300)) + self.assertTrue(commoncode.regionsOverlap(-100, -200, -1, -300)) + + + def testRegionsAreWithinDistance(self): + self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 40, 50, 30)) + self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 1, 5, 5)) + self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 25, 50, 10)) + self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 1, 5, 5)) + + self.assertFalse(commoncode.regionsAreWithinDistance(10, 20, 100, 150, 5)) + self.assertFalse(commoncode.regionsAreWithinDistance(100, 200, 10, 15, 5)) + + self.assertTrue(commoncode.regionsAreWithinDistance(20, 10, 30, 150, 10)) + self.assertFalse(commoncode.regionsAreWithinDistance(20, 10, 100, 150, 5)) + self.assertFalse(commoncode.regionsAreWithinDistance(10, 20, 150, 100, 5)) + + + #TODO: write test + def testFindPeak(self): + hitList = [] + result = ([], 0.0, array("f"), 0.0) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 0)) + + hitList= [[4, "+", 0.5]] + result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 10)) + result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True)) + result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto")) + result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True)) + + hitList= [[4, "+", 0.5]] + result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True)) + + hitList= [[4, "+", 0.5]] + result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 
0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True)) + result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3) + self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True)) + + + #TODO: write test + def testGetBestShiftForRegion(self): + hitList = [[14, "-", 1.0], [16, "-", 1.0], [24, "+", 1.0], [26, "+", 10.0]] + self.assertEquals(74, commoncode.getBestShiftForRegion(hitList, 0, 100)) + self.assertEquals(16, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=30)) + self.assertEquals(0, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=10)) + + + #TODO: write test + def testGetFeaturesByChromDict(self): + firstFeatures = {"I": (4123, 4219, "Y74C9A.3", "R", "3UTR"), + "II": (1866, 1910, "2L52.1", "F", "CDS"), + "III": (1270, 1506, "cTel54X.1", "R", "CDS"), + "IV": (694, 1064, "Y38C1AB.4", "F", "CDS"), + "V": (1479, 1578, "cTel3X.1", "F", "CDS"), + "X": (3622, 4099, "CE7X_3.1", "F", "CDS"), + "MtDNA": (112, 543, "MTCE.3", "F", "CDS") + } + featureDict = commoncode.getFeaturesByChromDict(self.genome) + for chrom in featureDict.keys(): + self.assertTrue(chrom in self.celegansChroms) + self.assertEquals(firstFeatures[chrom], featureDict[chrom][0]) + + restrictList = ["almost certainly not a value feature"] + featureDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList) + self.assertEquals({}, featureDict) + + restrictList = ["Y74C9A.3"] + featureDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList) + self.assertEquals(["I"], featureDict.keys()) + featureDict, complementDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList, regionComplement=True) + result = {"I": [(0, 4123, "nonExon1", "F", "nonExon"), + (4219, 4220, "nonExon2", "F", "nonExon"), + (4357, 5194, "nonExon3", "F", "nonExon"), + (5295, 6036, "nonExon4", "F", "nonExon"), + (6326, 9726, "nonExon5", "F", "nonExon"), + (9845, 10094, "nonExon6", "F", "nonExon"), + (10147, 10148, "nonExon7", "F", "nonExon"), + (10231, 250000000, "nonExon8", "F", "nonExon")] + } + self.assertEquals(result, complementDict) + + regionDict = {"I": [("new feature", 100, 150, 50)]} + featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict) + result = (100, 150, "new feature", "+", "custom") + self.assertEquals(result, featureDict["I"][0]) + + + def testGetLocusByChromDict(self): + firstLoci = {"I": (4123, 10231, "Y74C9A.3", 6108), + "II": (1866, 4662, "2L52.1", 2796), + "III": (1270, 2916, "cTel54X.1", 1646), + "IV": (694, 14925, "Y38C1AB.4", 14231), + "V": (1479, 3038, "cTel3X.1", 1559), + "X": (3622, 7153, "CE7X_3.1", 3531), + "MtDNA": (112, 548, "MTCE.3", 436) + } + + self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, useCDS=False)) + self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=1, downstream=1, useCDS=False)) + self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=-1, downstream=-1, useCDS=False, lengthCDS=1)) + self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstreamSpanTSS=True, lengthCDS=1)) + self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, downstream=1, lengthCDS=1)) + self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=1, lengthCDS=-1)) + + locusDict = 
commoncode.getLocusByChromDict(self.genome) + for chrom in locusDict.keys(): + self.assertTrue(chrom in self.celegansChroms) + self.assertEquals(firstLoci[chrom], locusDict[chrom][0]) + + regionDict = {"I": [("new region", 100, 150, 50)]} + locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict) + self.assertEquals((100, 150, "new region", 50), locusDict["I"][0]) + locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True) + self.assertEquals((100, 150, "new region", 50, "+"), locusDict["I"][0]) + + # Long Test + #locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, useCDS=False, upstream=100) + #self.assertEquals((150, 250, "new region", 100), locusDict["I"][0]) + + # Long Test + #locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, useCDS=False, downstream=10) + #self.assertEquals((90, 100, "new region", 10), locusDict["I"][0]) + + + def testComputeRegionBins(self): + regionsByChromDict = {} + hitDict = {} + bins = 4 + readlen = 10 + result = ({}, {}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + result = ({"regionID": [0.0, 0.0, 0.0, 0.0]}, {}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[1, "+", 1.0]]} + result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")], + "2": [("regionID2", 1, 1000, 1000, "F")] + } + hitDict = {"1": [[1, "+", 1.0]], + "2": [[1, "+", 1.0]] + } + result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5]]} + result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [15, "+", 1.0]]} + result = ({"regionID": [2.0, 0.0, 0.0, 0.5]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [200, "+", 2.0]]} + result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[1, "+", 1.0]]} + regionList = ["regionID"] + result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[1, "+", 1.0]]} + regionList = ["empty region"] + result = ({"empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, 
commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")], + "2": [("regionID2", 1, 1000, 1000, "F")] + } + hitDict = {"1": [[1, "+", 1.0]], + "2": [[1, "+", 1.0]] + } + regionList = ["regionID", "regionID2"] + result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")], + "2": [("regionID2", 1, 1000, 1000, "F")] + } + hitDict = {"1": [[1, "+", 1.0]], + "2": [[1, "+", 1.0]] + } + regionList = ["empty region", "regionID2"] + result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")], + "2": [("regionID2", 1, 1000, 1000, "F")] + } + hitDict = {"1": [[1, "+", 1.0]], + "2": [[1, "+", 1.0]] + } + regionList = ["regionID2"] + result = ({"regionID2": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[1, "+", 1.0]]} + result = ({"regionID": [2.0, 0.0, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, normalizedTag=2.0)) + + regionsByChromDict = {"1": [(1, 100, "regionID", 100, "F")]} + hitDict = {"1": [[1, "+", 1.0]]} + result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, defaultRegionFormat=False)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0]]} + fixedFirstBin = 20 + result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0]]} + fixedFirstBin = 5 + result = ({"regionID": [0.0, 1.0, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]} + fixedFirstBin = 20 + result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[80, "+", 1.0], [85, "+", 0.5]]} + fixedFirstBin = 5 + result = ({"regionID": [0.0, 1.5, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]} + binLength = 25 + result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100}) + self.assertEquals(result, 
commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]} + binLength = 50 + result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]} + binLength = 15 + result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]} + hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]} + binLength = 15 + result = ({"regionID": [1.0, 0.0, 0.7, 0.5]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength)) + + regionsByChromDict = {"1": [("regionID", 1, 100, 100, "R")]} + hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]} + result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen)) + result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100}) + fixedFirstBin = 10 + result = ({"regionID": [0.0, 2.2, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin)) + fixedFirstBin = 20 + result = ({"regionID": [0.5, 1.7, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin)) + binLength = 50 + result = ({"regionID": [0.5, 1.7, 0.0, 0.0]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength)) + binLength = 10 + result = ({"regionID": [0.0, 0.5, 0.0, 1.7]}, {"regionID": 100}) + self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength)) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestCommoncode)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testErange.py b/test/testErange.py new file mode 100644 index 0000000..d9392ff --- /dev/null +++ b/test/testErange.py @@ -0,0 +1,66 @@ +''' +Runs all unit test for Erange. 
+Functionality will eventually be incorporated into unittest in Python 2.7+ +Uses test suites until then + +Created on Sep 8, 2010 + +@author: sau +''' + +import sys +import unittest +import testAnalyzeGO +import testChksnp +import testCommoncode +import testGeneMrnaCounts +#import testGetFasta +import testGetNovelSNPs +import testGetSNPGeneInfo +import testGetSNPs +import testMakeBamFromRds +import testmakebedfromrds +#import testMakeGraphs +import testMakeRdsFromBam +import testMakeSNPTrack +import testMarkLinkers +import testPeaksToRegion +import testProcessVelvet +import testReadDataset +import testRnaAToIFilter +import testRnaEditing +import testRNAPATH +import testTranscripts + + +def main(argv=None): + if not argv: + argv = sys.argv + + suite = unittest.TestSuite() + suite.addTest(testAnalyzeGO.suite()) + suite.addTest(testChksnp.suite()) + suite.addTest(testCommoncode.suite()) + suite.addTest(testGeneMrnaCounts.suite()) + #suite.addTest(testGetFasta.suite()) + suite.addTest(testGetNovelSNPs.suite()) + suite.addTest(testGetSNPGeneInfo.suite()) + suite.addTest(testGetSNPs.suite()) + suite.addTest(testMakeBamFromRds.suite()) + suite.addTest(testmakebedfromrds.suite()) + #suite.addTest(testMakeGraphs.suite()) + suite.addTest(testMakeRdsFromBam.suite()) + suite.addTest(testMakeSNPTrack.suite()) + suite.addTest(testMarkLinkers.suite()) + suite.addTest(testPeaksToRegion.suite()) + suite.addTest(testProcessVelvet.suite()) + suite.addTest(testReadDataset.suite()) + suite.addTest(testRnaAToIFilter.suite()) + suite.addTest(testRnaEditing.suite()) + suite.addTest(testRNAPATH.suite()) + #suite.addTest(testTranscripts.suite()) + + unittest.TextTestRunner(verbosity=2).run(suite) + +if __name__ == '__main__': + main(sys.argv) \ No newline at end of file diff --git a/test/testGeneMrnaCounts.py b/test/testGeneMrnaCounts.py new file mode 100644 index 0000000..62f1649 --- /dev/null +++ b/test/testGeneMrnaCounts.py @@ -0,0 +1,220 @@ +''' +Created on Aug 19, 2010 + +@author: sau + +Located feature 728439 by: + from Erange.commoncode import getFeaturesByChromDict + genome = Genome(self.genomeName) + featuresByChromDict = getFeaturesByChromDict(genome) + print featuresByChromDict["1"][:3] + +''' +import unittest +import os +from Erange import geneMrnaCounts +from cistematic.core.geneinfo import geneinfoDB +from cistematic.genomes import Genome +from Erange.commoncode import readDataset + + +class TestGeneMrnaCounts(unittest.TestCase): + idb = geneinfoDB(cache=True) + testDBName = "testRDS.rds" + genomeName = "hsapiens" + outfilename = "testGeneMrnaCounts.txt" + + def setUp(self): + self.rds = readDataset(self.testDBName, initialize=True, datasetType="RNA", verbose=False) + + + def tearDown(self): + del(self.rds) + os.remove(self.testDBName) + + + def testGeneMrnaCounts(self): + geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename) + outfile = open(self.outfilename, "r") + for line in outfile: + fields = line.split("\t") + self.assertEquals("0\n", fields[2]) + + outfile.close() + os.remove(self.outfilename) + + rdsEntryList = [("testRead", "chr1", 18700, 18800, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename) + possibleCounts = ["0\n", "1\n"] + outfile = open(self.outfilename, "r") + for line in outfile: + fields = line.split("\t") + self.assertTrue(fields[2] in possibleCounts) + + outfile.close() + os.remove(self.outfilename) + + geneMrnaCounts.geneMrnaCounts(self.genomeName, 
self.testDBName, self.outfilename, + markGID=True, trackStrand=True) + + possibleCounts = ["0\n", "1\n"] + outfile = open(self.outfilename, "r") + for line in outfile: + fields = line.split("\t") + self.assertTrue(fields[2] in possibleCounts) + + outfile.close() + os.remove(self.outfilename) + reads = self.rds.getReadsDict(withFlag=True, entryDict=True) + self.assertEquals("728439", reads["1"][0]["flag"]) + + geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename, + countFeats=True, markGID=True, cachePages=150000) + + possibleCounts = ["0\n", "1\n"] + outfile = open(self.outfilename, "r") + for line in outfile: + fields = line.split("\t") + self.assertTrue(fields[2] in possibleCounts) + + outfile.close() + os.remove(self.outfilename) + reads = self.rds.getReadsDict(withFlag=True, entryDict=True) + self.assertEquals("728439", reads["1"][0]["flag"]) + + + def testCountFeatures(self): + testDict = {} + self.assertEquals(0, geneMrnaCounts.countFeatures(testDict)) + + testDict = {"chr1": []} + self.assertEquals(0, geneMrnaCounts.countFeatures(testDict)) + + #TODO: This is likely not the result we want + testDict = {"chr1": "not a list"} + self.assertEquals(10, geneMrnaCounts.countFeatures(testDict)) + + testDict = {"chr1": 10} + self.assertEquals(0, geneMrnaCounts.countFeatures(testDict)) + + testDict = {"chr1": 10, + "chr2": ["f1"]} + self.assertEquals(1, geneMrnaCounts.countFeatures(testDict)) + + testDict = {"chr1": ["f1", "f2"]} + self.assertEquals(2, geneMrnaCounts.countFeatures(testDict)) + + testDict = {"chr1": ["f1", "f2"], + "chr2": []} + self.assertEquals(2, geneMrnaCounts.countFeatures(testDict)) + + testDict = {"chr1": ["f1", "f2"], + "chr2": ["f1"]} + self.assertEquals(3, geneMrnaCounts.countFeatures(testDict)) + + + def testGetGeneSymbol(self): + # Case: Null/None inputs + gid = "" + searchGID = False + geneInfoDict = {} + idb = None + genomeName = "" + geneAnnotDict = {} + self.assertEquals("LOC", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)) + + # Case: symbol is in geneInfoDict + gid = "1" + searchGID = False + geneInfoDict = {"1": [["gene1", "wrong name"], ["wrong name 2"]]} + idb = None + genomeName = "test" + geneAnnotDict = {("test", "1"): ["wrong name 3"]} + self.assertEquals("gene1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)) + + # Case: symbol not in geneInfoDict, is in geneAnnotDict + gid = "1" + searchGID = False + geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]} + idb = None + genomeName = "test" + geneAnnotDict = {("test", "1"): ["gene1"]} + self.assertEquals("gene1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)) + + # Case: symbol not in geneInfoDict or geneAnnotDict - non-null/None inputs + gid = "1" + searchGID = False + geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]} + idb = None + genomeName = "test" + geneAnnotDict = {("test", "0"): ["wrong name 3"]} + self.assertEquals("LOC1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)) + + # Case: using search, gid not in idb + gid = "almostCertainlyNotInTheIDB" + searchGID = True + geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]} + idb = self.idb + genomeName = "human" + geneAnnotDict = {("human", "0"): ["wrong name 3"]} + self.assertEquals("LOCalmostCertainlyNotInTheIDB", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)) + + # Case: using search 
+ # sql to get gid: select gID from gene_info where genome="human" and locustag !="-" and locustag != symbol limit 5; + gid = "RP11-177A2.3" + searchGID = True + geneInfoDict = {"27": [["correct"], ["wrong name 2"]]} + idb = self.idb + genomeName = "human" + geneAnnotDict = {("human", "0"): ["wrong name 3"]} + self.assertEquals("correct", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)) + + + def testWriteOutputFile(self): + genome = Genome(self.genomeName) + gidList = ["RP11-177A2.3"] + gidCount = {"RP11-177A2.3": 1} + geneMrnaCounts.writeOutputFile(self.outfilename, genome, gidList, gidCount, searchGID=False) + + outfile = open(self.outfilename, "r") + line = outfile.readline() + result = "RP11-177A2.3\tLOCRP11-177A2.3\t1\n" + self.assertEquals(result, line) + outfile.close() + os.remove(self.outfilename) + + genome = Genome("hsapiens") + gidList = ["RP11-177A2.3"] + gidCount = {"something else": 1} + geneMrnaCounts.writeOutputFile(self.outfilename, genome, gidList, gidCount, searchGID=False) + + outfile = open(self.outfilename, "r") + line = outfile.readline() + result = "RP11-177A2.3\tLOCRP11-177A2.3\t0\n" + self.assertEquals(result, line) + outfile.close() + os.remove(self.outfilename) + + def testMain(self): + argv = ["geneMRNACounts", self.genomeName, self.testDBName, self.outfilename] + geneMrnaCounts.main(argv) + outfile = open(self.outfilename, "r") + for line in outfile: + fields = line.split("\t") + self.assertEquals("0\n", fields[2]) + + outfile.close() + os.remove(self.outfilename) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestGeneMrnaCounts)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testGetFasta.py b/test/testGetFasta.py new file mode 100644 index 0000000..cb71685 --- /dev/null +++ b/test/testGetFasta.py @@ -0,0 +1,231 @@ +''' +Created on Aug 27, 2010 + +@author: sau +''' +import unittest +import os +from Erange import getfasta +#from Erange import ReadDataset +from Erange.commoncode import readDataset + +testDBName = "testRDS.rds" + + +class TestGetFasta(unittest.TestCase): + + + def setUp(self): + self.regionDict = {} + self.minHitThresh = -1 + self.maxsize = 3000 + self.outfilename = "testFileForTestGetFasta.fa" + + + def tearDown(self): + try: + os.remove(self.outfilename) + except OSError: + print "fasta file does not exist" + + try: + os.remove(testDBName) + except OSError: + print "RDS file does not exist" + + + def testGetDefaultRegion(self): + self.assertEquals({}, getfasta.getDefaultRegion(self.regionDict, self.maxsize)) + + regionDict = {"1": [], + "2": [] + } + result = {"2": [], + "1": [] + } + self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize)) + + regionDict = {"1": [(10, 20, 10)], + "2": [] + } + result = {"2": [], + "1": [{"start": 10, "length": 10, "topPos": [-1]}] + } + self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize)) + + regionDict = {"1": [(10, 20, 10)], + "2": [(11, 21, 11)] + } + result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}], + "1": [{"start": 10, "length": 10, "topPos": [-1]}] + } + self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize)) + + regionDict = {"1": [(10, 20, 10), (100, 4000, 3900)], + "2": [(11, 21, 11)] + } + result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}], + "1": [{"start": 10, "length": 10, "topPos": [-1]}] + } + 
self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize)) + + regionDict = {"1": [(10, 20, 10), (100, 4000, 3900), (50, 60, 10)], + "2": [(11, 21, 11)] + } + result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}], + "1": [{"start": 10, "length": 10, "topPos": [-1]}, + {"start": 50, "length": 10, "topPos": [-1]}] + } + self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize)) + + + def testGetRegionUsingPeaks(self): + self.assertEquals({}, getfasta.getRegionUsingPeaks(self.regionDict, self.minHitThresh, self.maxsize)) + + regionDict = {"1": [], + "2": [] + } + result = {"2": [], + "1": [] + } + self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize)) + + regionDict = {"1": [(10, 20, 10, 15, 1)], + "2": [] + } + result = {"2": [], + "1": [{"start": 10, "length": 10, "topPos": [5]}] + } + self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize)) + + result = {"2": [], + "1": [] + } + self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, 3, self.maxsize)) + + regionDict = {"1": [(10, 20, 10, 15, 1)], + "2": [(11, 21, 11, 18, 1)] + } + result = {"2": [{"start": 11, "length": 11, "topPos": [7]}], + "1": [{"start": 10, "length": 10, "topPos": [5]}] + } + self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize)) + + regionDict = {"1": [(10, 20, 10, 15, 1), (100, 4000, 3900, 111, 1)], + "2": [(11, 21, 11, 18, 1)] + } + result = {"2": [{"start": 11, "length": 11, "topPos": [7]}], + "1": [{"start": 10, "length": 10, "topPos": [5]}] + } + self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize)) + + regionDict = {"1": [(10, 20, 10, 15, 1), (100, 4000, 3900, 111, 1), (50, 60, 10, 59, 1)], + "2": [(11, 21, 11, 18, 1)] + } + result = {"2": [{"start": 11, "length": 11, "topPos": [7]}], + "1": [{"start": 10, "length": 10, "topPos": [5]}, + {"start": 50, "length": 10, "topPos": [9]}] + } + self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize)) + + + #TODO: write test. This seems to not make sense. We are always returning a "topPos" of range(rlen). + # need to check to see if the issue might be with commoncode.findPeak as there is a lot of questionable + # logic in that one + def testGetRegionUsingRDS(self): + rds = readDataset(testDBName, initialize=True, datasetType="DNA", verbose=False) + rds.insertMetadata([("readsize", "100")]) + rdsEntryList = [("testRead", "chr1", 10, 100, "+", 1.0, "", "")] + rds.insertUniqs(rdsEntryList) + self.assertEquals({}, getfasta.getRegionUsingRDS(self.regionDict, rds, self.minHitThresh, self.maxsize)) + + regionDict = {"1": [], + "2": [] + } + result = {"2": [], + "1": [] + } + self.assertEquals(result, getfasta.getRegionUsingRDS(regionDict, rds, self.minHitThresh, self.maxsize)) + + # Ack with a capital ACK. 
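+        # Editor's note on the concern flagged above: for the region (1, 600, 5) below, the
+        # returned entry is {"start": 1, "length": 5, "topPos": [0, 1, 2, 3, 4]}, i.e. "topPos"
+        # spans every offset of the reported length instead of naming a single peak position,
+        # which is the behavior the earlier TODO suggests may originate in commoncode.findPeak.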
+ regionDict = {"1": [(1, 600, 5)], + "2": [] + } + result = {"1": [{"start": 1, "length": 5, "topPos": [0, 1, 2, 3, 4]}], + "2": [] + } + self.assertEquals(result, getfasta.getRegionUsingRDS(regionDict, rds, self.minHitThresh, self.maxsize)) + + del(rds) + + + def testWriteFastaFile(self): + ncregions = {} + getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename) + for line in open(self.outfilename): + self.assertEquals("", line) + + ncregions = {"1": [], + "2": [] + } + getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename) + for line in open(self.outfilename): + self.assertEquals("", line) + + ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]}], + "2": [] + } + getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename) + fastaFile = open(self.outfilename) + self.assertEquals(">chr1:11956-12057\n", fastaFile.readline()) + self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline()) + self.assertEquals("", fastaFile.readline()) + + ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]}], + "2": [{"start": 18000, "length": 50, "topPos": [30]}] + } + getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename) + fastaFile = open(self.outfilename) + self.assertEquals(">chr1:11956-12057\n", fastaFile.readline()) + self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline()) + self.assertEquals(">chr2:17980-18081\n", fastaFile.readline()) + self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline()) + self.assertEquals("", fastaFile.readline()) + + ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6, 20]}], + "2": [{"start": 18000, "length": 50, "topPos": [30]}] + } + getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename) + fastaFile = open(self.outfilename) + self.assertEquals(">chr1:11956-12057\n", fastaFile.readline()) + self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline()) + self.assertEquals(">chr2:17980-18081\n", fastaFile.readline()) + self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline()) + self.assertEquals("", fastaFile.readline()) + + ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]}, + {"start": 15000, "length": 50, "topPos": [2]} + ], + "2": [{"start": 18000, "length": 50, "topPos": [30]}] + } + getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename) + fastaFile = open(self.outfilename) + self.assertEquals(">chr1:11956-12057\n", fastaFile.readline()) + self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline()) + self.assertEquals(">chr1:14952-15053\n", fastaFile.readline()) + self.assertEquals("AGTGAATGAGGGAAAGGGCAGGGCCCGGGACTGGGGAATCTGTAGGGTCAATGGAGGAGTTCAGAGAAGGTGCAACATTTCTGACCCCCTACAAGGTGCTT\n", fastaFile.readline()) + self.assertEquals(">chr2:17980-18081\n", fastaFile.readline()) + self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline()) + self.assertEquals("", fastaFile.readline()) + + +def suite(): + suite = unittest.TestSuite() + 
suite.addTest(unittest.makeSuite(TestGetFasta)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testGetNovelSNPs.py b/test/testGetNovelSNPs.py new file mode 100644 index 0000000..93865d5 --- /dev/null +++ b/test/testGetNovelSNPs.py @@ -0,0 +1,33 @@ +''' +Created on Aug 26, 2010 + +@author: sau +''' +import unittest + + +class TestGetNovelSNPs(unittest.TestCase): + + + def setUp(self): + pass + + + def tearDown(self): + pass + + + def testName(self): + pass + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestGetNovelSNPs)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testGetSNPGeneInfo.py b/test/testGetSNPGeneInfo.py new file mode 100644 index 0000000..ed33674 --- /dev/null +++ b/test/testGetSNPGeneInfo.py @@ -0,0 +1,131 @@ +''' +Created on Aug 26, 2010 + +@author: sau +''' +import unittest +from Erange import getSNPGeneInfo + + +class TestGetSNPGeneInfo(unittest.TestCase): + + + def setUp(self): + self.geneDict = {} + self.snpDict = {} + self.rpkmDict = {} + self.withSense = False + + + def tearDown(self): + pass + + + def testDoNotProcessLine(self): + self.assertTrue(getSNPGeneInfo.doNotProcessLine("#anything")) + self.assertFalse(getSNPGeneInfo.doNotProcessLine("line to process")) + + + def testGetSNPGeneInfoList(self): + geneInfoList = getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense) + self.assertEquals([], geneInfoList) + + badGeneDict = {"badEntry": "foo"} + self.assertRaises(ValueError, getSNPGeneInfo.getSNPGeneInfoList, badGeneDict, self.snpDict, self.rpkmDict, self.withSense) + + self.geneDict[("gene1", "ID1")] = {"position": [("1", 1)], "sense": "+"} + self.assertRaises(KeyError, getSNPGeneInfo.getSNPGeneInfoList, self.geneDict, self.snpDict, self.rpkmDict, self.withSense) + + self.snpDict[("1", 1)] = "chr1\tpos 1\n" + result = [{"symbol": "gene1", + "rpkm": "N\\A", + "geneID": "ID1", + "snpDescription": "chr1\tpos 1" } + ] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + self.rpkmDict["ID1"] = 300 + result = [{"symbol": "gene1", + "rpkm": "300", + "geneID": "ID1", + "snpDescription": "chr1\tpos 1" } + ] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 1)], "sense": "+"} + self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + result = [{"symbol": "gene1", + "sense": "+", + "rpkm": "300", + "geneID": "ID1", + "snpDescription": "chr1\tpos 1" } + ] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, True)) + + self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 10)], "sense": "+"} + self.snpDict[("1", 10)] = "chr1\tpos 10\n" + result = [{"symbol": "gene1", + "rpkm": "300", + "geneID": "ID1", + "snpDescription": "chr1\tpos 10" }, + {"symbol": "gene1", + "rpkm": "300", + "geneID": "ID1", + "snpDescription": "chr1\tpos 1" } + ] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + + #TODO: write test + def testGetSNPGeneInfo(self): + pass + + + def 
testGetSNPGeneOutputList(self): + geneOutputList = getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense) + self.assertEquals([], geneOutputList) + + badGeneDict = {"badEntry": "foo"} + self.assertRaises(ValueError, getSNPGeneInfo.getSNPGeneOutputList, badGeneDict, self.snpDict, self.rpkmDict, self.withSense) + + self.geneDict[("gene1", "ID1")] = {"position": [("1", 1)], "sense": "+"} + self.assertRaises(KeyError, getSNPGeneInfo.getSNPGeneOutputList, self.geneDict, self.snpDict, self.rpkmDict, self.withSense) + + self.snpDict[("1", 1)] = "chr1\tpos 1\n" + result = ["chr1\tpos 1\tgene1\tID1\tN\\A"] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + self.rpkmDict["ID1"] = 300 + result = ["chr1\tpos 1\tgene1\tID1\t300"] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 1)], "sense": "+"} + self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + result = ["chr1\tpos 1\tgene1\tID1\t300\t+"] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, True)) + + self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 10)], "sense": "+"} + self.snpDict[("1", 10)] = "chr1\tpos 10\n" + result = ["chr1\tpos 10\tgene1\tID1\t300", + "chr1\tpos 1\tgene1\tID1\t300" + ] + self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)) + + + #TODO: write test + def testWriteSNPGeneInfo(self): + pass + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestGetSNPGeneInfo)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testGetSNPs.py b/test/testGetSNPs.py new file mode 100644 index 0000000..68ef8c0 --- /dev/null +++ b/test/testGetSNPs.py @@ -0,0 +1,84 @@ +''' +Created on Jun 4, 2010 + +@author: sau +''' +import os, unittest +from Erange.commoncode import readDataset +from Erange import getSNPs + + +class TestGetSNPs(unittest.TestCase): + + def setUp(self): + self.rdsDNA = readDataset("testDNARDSForUnitTests.rds", True, "DNA", verbose=True) + + uniqueInsertList = [("uniqueID1", "chr1", 10, 20, "+", 1.0, "", ""), + ("uniqueID2", "chr1", 100, 200, "+", 1.0, "", ""), + ("uniqueID3", "chr1", 1000, 2000, "+", 1.0, "", "G10A")] + + multiInsertList = [("multiID1", "chr2", 1010, 1020, "+", 0.5, "", ""), + ("multiID1", "chr2", 1010, 1020, "+", 0.5, "", ""), + ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""), + ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""), + ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""), + ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", "")] + + self.rdsDNA.insertUniqs(uniqueInsertList) + self.rdsDNA.insertMulti(multiInsertList) + + + def tearDown(self): + os.remove("./testDNARDSForUnitTests.rds") + self.rdsDNA = None + + + def testGetMatchDict(self): + uniqueTestDict = getSNPs.getMatchDict(self.rdsDNA, "chr1", withSplices=False) + + self.assertEqual(uniqueTestDict[10][0], 20, "incorrect result for unique chr position 10") + self.assertEqual(uniqueTestDict[100][0], 200, "incorrect result for unique chr position 100") + self.assertEqual(uniqueTestDict[1000][0], 2000, "incorrect result for unique 
chr position 1000") + + self.assertRaises(KeyError, getSNPs.getMatchDict, self.rdsDNA, "chr2", withSplices=False) + + + def testGetMismatchDict(self): + mismatchDict = getSNPs.getMismatchDict(self.rdsDNA, "chr1") + result = {1009: {"totalBaseDict": {"A-G": 1}, + "uniqueReadCount": 1, + "uniqBaseDict": {"A-G": 1}, + "back": "1000:A-G", "totalCount": 1 + } + } + self.assertEquals(result, mismatchDict) + + + #TODO: write unit test + def testGetSNPs(self): + pass + + + #TODO: write unit test + def testWriteSNPsToFile(self): + pass + + + def testDoNotProcessChromosome(self): + self.assertFalse(getSNPs.doNotProcessChromosome(True, "chr1")) + self.assertFalse(getSNPs.doNotProcessChromosome(False, "chr1")) + self.assertFalse(getSNPs.doNotProcessChromosome(False, "badName")) + self.assertTrue(getSNPs.doNotProcessChromosome(True, "badName")) + self.assertTrue(getSNPs.doNotProcessChromosome(True, "")) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestGetSNPs)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testMakeBamFromRds.py b/test/testMakeBamFromRds.py new file mode 100644 index 0000000..8c0df53 --- /dev/null +++ b/test/testMakeBamFromRds.py @@ -0,0 +1,38 @@ +''' +Created on Jun 4, 2010 + +@author: sau +''' +import unittest +from Erange import MakeBamFromRds + + +class TestMakeBamFromRds(unittest.TestCase): + + + def setUp(self): + pass + + + def tearDown(self): + pass + + + def testGetMismatches(self): + mismatchString = "3A10T" + self.assertEqual(mismatchString, MakeBamFromRds.getMismatches("A3G, T10A")) + + mismatchString = "" + self.assertEqual(mismatchString, MakeBamFromRds.getMismatches("")) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestMakeBamFromRds)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testMakeGraphs.py b/test/testMakeGraphs.py new file mode 100644 index 0000000..567e5d2 --- /dev/null +++ b/test/testMakeGraphs.py @@ -0,0 +1,83 @@ +''' +Created on Jul 28, 2010 + +@author: sau +''' + +import os, unittest +from Erange import makeGraphs + +testFileName = "/tmp/testEdgeFileForUnitTests.txt" + +class TestMakeGraphs(unittest.TestCase): + + def setUp(self): + pass + + + def tearDown(self): + pass + + + def testGetEdges(self): + nodeList = [] + self.assertEquals({}, makeGraphs.getEdges(nodeList)) + + nodeEntry = "ex_node1\tex_node2\t1" + nodeList.append(nodeEntry) + result = {"ex_node1": [("ex_node2", 1)], + "ex_node2": [("ex_node1", 1)]} + self.assertEquals(result, makeGraphs.getEdges(nodeList)) + + nodeEntry = "ex_node1\tex_node3\t2" + nodeList.append(nodeEntry) + result = {"ex_node1": [("ex_node2", 1), ("ex_node3", 2)], + "ex_node2": [("ex_node1", 1)], + "ex_node3": [("ex_node1", 2)] + } + self.assertEquals(result, makeGraphs.getEdges(nodeList)) + + result = {"node1": [("node2", 1), ("node3", 2)], + "node2": [("node1", 1)], + "node3": [("node1", 2)] + } + self.assertEquals(result, makeGraphs.getEdges(nodeList, shorten=True)) + + nodeEntry = "ex:node1\tex:node2\t1" + nodeList = [nodeEntry] + result = {"ex:node1": [("ex:node2", 1)], + "ex:node2": [("ex:node1", 1)]} + self.assertEquals(result, makeGraphs.getEdges(nodeList, shorten=True)) + + nodeEntry = "badLine" + nodeList = [nodeEntry] + self.assertEquals({}, makeGraphs.getEdges(nodeList)) + nodeEntry = "node1\tnode2\t1" + 
nodeList.append(nodeEntry) + result = {"node1": [("node2", 1)], + "node2": [("node1", 1)]} + self.assertEquals(result, makeGraphs.getEdges(nodeList)) + + + def testGetEdgesFromFile(self): + self.edgeFile = open(testFileName, "w") + self.edgeFile.write("node1\tnode2\t1") + self.edgeFile.close() + + result = {"node1": [("node2", 1)], + "node2": [("node1", 1)]} + self.assertEquals(result, makeGraphs.getEdgesFromFile(testFileName)) + + os.remove(testFileName) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestMakeGraphs)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testMakeRdsFromBam.py b/test/testMakeRdsFromBam.py new file mode 100644 index 0000000..4b4da1a --- /dev/null +++ b/test/testMakeRdsFromBam.py @@ -0,0 +1,66 @@ +''' +Created on Jun 10, 2010 + +@author: sau +''' +import unittest +from Erange import MakeRdsFromBam + +class TestMakeRdsFromBam(unittest.TestCase): + + + def testGetSpliceBounds(self): + start, startR, stopL, stopR = MakeRdsFromBam.getSpliceBounds(0, 10, [(1,2), (3,6), (1,2)]) + + self.assertEqual(start, 0, "incorrect start position for 262") + self.assertEqual(startR, 8, "incorrect right start position for 262") + self.assertEqual(stopL, 2, "incorrect left stop position for 262") + self.assertEqual(stopR, 10, "incorrect right stop position for 262") + + + def testGetMismatches(self): + querySequence = "GATTACA" + + resultString = "A3T" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", querySequence, "+")) + resultString = "T3A" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", querySequence, "-")) + resultString = "T7A" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("6T", querySequence, "+")) + + resultString = "A3T,T7A" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A3T0", querySequence, "+")) + + resultString = "" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^T", querySequence, "+")) + + resultString = "T5A" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^TT2T", querySequence, "+")) + resultString = "A5T" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^TT2T", querySequence, "-")) + + resultString = "A3N" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", "", "+")) + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A")) + + resultString = "" + self.assertEquals(resultString, MakeRdsFromBam.getMismatches("badMismatchTagData", querySequence, "+")) + + + def testIsSpliceEntry(self): + self.assertTrue(MakeRdsFromBam.isSpliceEntry([(1,6), (3, 4), (1, 2)])) + self.assertFalse(MakeRdsFromBam.isSpliceEntry([(1,6), (2, 4), (1, 2)])) + self.assertFalse(MakeRdsFromBam.isSpliceEntry([])) + self.assertFalse(MakeRdsFromBam.isSpliceEntry("")) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestMakeRdsFromBam)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testMakeSNPTrack.py b/test/testMakeSNPTrack.py new file mode 100644 index 0000000..b52b546 --- /dev/null +++ b/test/testMakeSNPTrack.py @@ -0,0 +1,85 @@ +''' +Created on Aug 25, 2010 + +@author: sau +''' +import unittest +from Erange import makeSNPtrack + + +class TestMakeSNPTrack(unittest.TestCase): + + baseColor = {"A": "200, 0, 255", + "T": "200, 0, 255", + "C": "200, 0, 255", + 
"G": "200, 0, 255" + } + + specialColors = {"A-G": "255, 0, 0", + "T-C": "0, 0, 255" + } + + + def setUp(self): + pass + + + def tearDown(self): + pass + + + def testGetHeader(self): + track = "test track" + header = "track name=%s description=%s visibility=2 itemRgb=\"On\"\n" % (track, track) + self.assertEquals(header, makeSNPtrack.getHeader(track)) + + + def testDoNotProcessLine(self): + self.assertTrue(makeSNPtrack.doNotProcessLine("#anything")) + self.assertFalse(makeSNPtrack.doNotProcessLine("line to process")) + + + def testGetBedOutputLine(self): + chromosome = "chr1" + readStart = 10 + readStop = 11 + readName = "A" + score = "0" + sense = "+" + color = self.baseColor[readName] + snpPropertiesList = ["0", "1", chromosome, 11, "4", "5", "6", readName] + outline = "%s\t%d\t%d\t%s\t%s\t%s\t-\t-\t\t%s\n" % (chromosome, readStart, readStop, readName, score, sense, color) + self.assertEquals(outline, makeSNPtrack.getBedOutputLine(snpPropertiesList)) + + snpPropertiesList = ["0", "1", chromosome, 11, "4", "5", "6"] + self.assertRaises(IndexError, makeSNPtrack.getBedOutputLine, snpPropertiesList) + + snpPropertiesList = [] + self.assertRaises(IndexError, makeSNPtrack.getBedOutputLine, snpPropertiesList) + + snpPropertiesList = ["0", "1", chromosome, "some string", "4", "5", "6", readName] + self.assertRaises(ValueError, makeSNPtrack.getBedOutputLine, snpPropertiesList) + + + def testGetSNPColor(self): + for base in self.baseColor.keys(): + self.assertEquals(self.baseColor[base], makeSNPtrack.getSNPColor(base)) + + for base in self.specialColors.keys(): + self.assertEquals(self.specialColors[base], makeSNPtrack.getSNPColor(base)) + + defaultColor = "200, 0, 255" + self.assertEquals(defaultColor, makeSNPtrack.getSNPColor("")) + self.assertEquals(defaultColor, makeSNPtrack.getSNPColor("V")) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestMakeSNPTrack)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testMarkLinkers.py b/test/testMarkLinkers.py new file mode 100644 index 0000000..775b2e0 --- /dev/null +++ b/test/testMarkLinkers.py @@ -0,0 +1,142 @@ +''' +Created on Sep 15, 2010 + +@author: sau +''' +import unittest +import os +from Erange.chiapet import markLinkers + + +class TestMarkLinkers(unittest.TestCase): + linkerFileName = "/Users/sau/Eclipse/erange/source/Erange/chiapet/linkers.fa" + inFileName = "linkerTestIn.txt" + outFileName = "linkerTestOut.txt" + + def setUp(self): + infile = open(self.inFileName, "w") + infile.close() + + + def tearDown(self): + try: + os.remove(self.inFileName) + except OSError: + pass + + try: + os.remove(self.outFileName) + except OSError: + pass + + + def testMarkLinkers(self): + markLinkers.markLinkers(self.linkerFileName, self.inFileName, self.outFileName) + output = open(self.outFileName) + for line in output: + self.assertEquals("", line) + + output.close() + os.remove(self.outFileName) + + infile = open(self.inFileName, "w") + print >> infile, "" + print >> infile, "@Linker1" + print >> infile, "........................GTTGGATAAGATATCGCGG....." + print >> infile, "@NoLinker" + print >> infile, "GATTACA.GATTACA.GATTACA.GATTACA.GATTACA.GATTACA." + print >> infile, "@Linker2" + print >> infile, "........................GTTGGAATGTATATCGCGG....." + print >> infile, "@Linker1Short" + print >> infile, "..............GTTGGAATGTATATCGCGG..............." 
+ print >> infile, "@Linker2Short" + print >> infile, "..............GTTGGAATGTATATCGCGG..............." + infile.close() + + markLinkers.markLinkers(self.linkerFileName, self.inFileName, self.outFileName) + output = open(self.outFileName) + self.assertEquals(">L1_Linker1\n", output.readline()) + self.assertEquals("....................\n", output.readline()) + self.assertEquals(">NA_NoLinker\n", output.readline()) + self.assertEquals("GATTACA.GATTACA.GATT\n", output.readline()) + self.assertEquals(">NA_NoLinker\n", output.readline()) + self.assertEquals("GATTACA.GATTACA.GATT\n", output.readline()) + self.assertEquals(">NA_Linker2\n", output.readline()) + self.assertEquals("....................\n", output.readline()) + self.assertEquals(">L2_Linker2\n", output.readline()) + self.assertEquals("....................\n", output.readline()) + self.assertEquals(">NA_Linker1Short\n", output.readline()) + self.assertEquals("..............GTTGGA\n", output.readline()) + self.assertEquals(">NA_Linker1Short\n", output.readline()) + self.assertEquals("..............GTTGGA\n", output.readline()) + self.assertEquals(">NA_Linker2Short\n", output.readline()) + self.assertEquals("..............GTTGGA\n", output.readline()) + self.assertEquals(">NA_Linker2Short\n", output.readline()) + self.assertEquals("..............GTTGGA\n", output.readline()) + + output.close() + #TODO: Check that we really do want to output the same line + #multiple times in the case where neither linker is detected. + #See if downstream there is a real reason for doing it this way + #or if it was handled as a bug introduced at this stage of the + #analysis. + + + def testGetLinkerInformation(self): + linkerDict, linkerList = markLinkers.getLinkerInformation([]) + resultDict = {} + resultList = [] + self.assertEquals(resultDict, linkerDict) + self.assertEquals(resultList, linkerList) + + linkerData = [">linker_b.1", + "GTTGGATAAGATATCGCGG", + ">linker_b.2", + "GTTGGAATGTATATCGCGG" + ] + linkerDict, linkerList = markLinkers.getLinkerInformation(linkerData) + resultDict = {"linker_b.1": "GTTGGATAAG", + "linker_b.2": "GTTGGAATGT" + } + resultList = ["linker_b.1", "linker_b.2"] + self.assertEquals(resultDict, linkerDict) + self.assertEquals(resultList, linkerList) + + + def testGetLinkerInformationFromFile(self): + linkerDict, linkerList = markLinkers.getLinkerInformationFromFile("bad file name") + resultDict = {} + resultList = [] + self.assertEquals(resultDict, linkerDict) + self.assertEquals(resultList, linkerList) + + linkerDict, linkerList = markLinkers.getLinkerInformationFromFile(self.linkerFileName) + resultDict = {"linker_b.1": "GTTGGATAAG", + "linker_b.2": "GTTGGAATGT" + } + resultList = ["linker_b.1", "linker_b.2"] + self.assertEquals(resultDict, linkerDict) + self.assertEquals(resultList, linkerList) + + + def testMain(self): + argv = ["markLinkers", self.linkerFileName, self.inFileName, self.outFileName] + markLinkers.main(argv) + output = open(self.outFileName) + for line in output: + self.assertEquals("", line) + + output.close() + os.remove(self.outFileName) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestMarkLinkers)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testPeaksToRegion.py b/test/testPeaksToRegion.py new file mode 100644 index 0000000..dd16d86 --- /dev/null +++ b/test/testPeaksToRegion.py @@ -0,0 +1,79 @@ +''' +Created on Oct 4, 2010 + +@author: sau +''' 
+import unittest +import os +from Erange import peakstoregion + +inFileName = "testPeaksToRegionInFile.txt" +outFileName = "testPeaksToRegionOutFile.txt" + + +class TestPeaksToRegion(unittest.TestCase): + + + def setUp(self): + self.inFile = open(inFileName, "w") + self.inFile.write("stuff\tpeak1\tchr1\t1000\t1.3\n") + self.inFile.write("stuff\tpeak2\tchr1\t800\t9.7\n") + self.inFile.write("stuff\tpeak3\tchr2\t1000\t3.0\n") + self.inFile.close() + + + def tearDown(self): + try: + os.remove(outFileName) + except OSError: + pass + + try: + os.remove(inFileName) + except OSError: + pass + + + def testPeaksToRegion(self): + peakstoregion.peakstoregion(inFileName, outFileName) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("peak1\tchr1\t500\t1500\t1.3\n", results[0]) + self.assertEquals("peak2\tchr1\t300\t1300\t9.7\n", results[1]) + self.assertEquals("peak3\tchr2\t500\t1500\t3.0\n", results[2]) + + + def testMain(self): + argv = ["peakstoregion", inFileName, outFileName] + peakstoregion.main(argv) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("peak1\tchr1\t500\t1500\t1.3\n", results[0]) + self.assertEquals("peak2\tchr1\t300\t1300\t9.7\n", results[1]) + self.assertEquals("peak3\tchr2\t500\t1500\t3.0\n", results[2]) + + argv = ["peakstoregion", inFileName, outFileName, 600, 2, 3, 1, -1] + peakstoregion.main(argv) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("peak1\tchr1\t400\t1600\t1.3\n", results[0]) + self.assertEquals("peak2\tchr1\t200\t1400\t9.7\n", results[1]) + self.assertEquals("peak3\tchr2\t400\t1600\t3.0\n", results[2]) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestPeaksToRegion)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testProcessVelvet.py b/test/testProcessVelvet.py new file mode 100644 index 0000000..99ef5a3 --- /dev/null +++ b/test/testProcessVelvet.py @@ -0,0 +1,236 @@ +''' +Created on Sep 15, 2010 + +@author: sau +''' +import unittest +import os +from Erange.rnapath import processvelvet + + +class TestProcessVelvet(unittest.TestCase): + inFileName = "testProcessVelvetIn.txt" + filterFileName = "testProcessVelvetFilter.txt" + outFileName = "testProcessVelvetOut.txt" + + + def setUp(self): + infile = open(self.inFileName, "w") + infile.close() + filter = open(self.filterFileName, "w") + filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODE1-1_0\n") + filter.close() + + + def tearDown(self): + try: + os.remove(self.inFileName) + except OSError: + pass + + try: + os.remove(self.filterFileName) + except OSError: + pass + + try: + os.remove(self.outFileName) + except OSError: + pass + + + def testProcessVelvet(self): + processvelvet.processvelvet(self.inFileName, self.outFileName) + outfile = open(self.outFileName) + for line in outfile: + self.assertEquals("", line) + + os.remove(self.outFileName) + + infile = open(self.inFileName, "w") + print >> infile, ">NODE1-1_0" + print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + infile.close() + + processvelvet.processvelvet(self.inFileName, self.outFileName) + outfile = open(self.outFileName) + self.assertEquals(">chr0\n", outfile.readline()) + self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline()) 
+ self.assertEquals("", outfile.readline()) + outfile.close() + os.remove(self.outFileName) + processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName) + outfile = open(self.outFileName) + self.assertEquals("", outfile.readline()) + outfile.close() + os.remove(self.outFileName) + + infile = open(self.inFileName, "w") + print >> infile, ">NODE1-1_1" + print >> infile, "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" + print >> infile, ">NODE1-1_0" + print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + infile.close() + + processvelvet.processvelvet(self.inFileName, self.outFileName) + outfile = open(self.outFileName) + self.assertEquals(">chr1\n", outfile.readline()) + self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline()) + self.assertEquals(">chr0\n", outfile.readline()) + self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline()) + self.assertEquals("", outfile.readline()) + outfile.close() + os.remove(self.outFileName) + processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName) + outfile = open(self.outFileName) + self.assertEquals(">chr1\n", outfile.readline()) + self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline()) + self.assertEquals("", outfile.readline()) + outfile.close() + os.remove(self.outFileName) + + infile = open(self.inFileName, "w") + print >> infile, ">NODE1-1_1" + print >> infile, "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" + print >> infile, ">NODE1-1_0" + print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + print >> infile, ">NODE1-1_2" + print >> infile, "GATTACA" + infile.close() + + processvelvet.processvelvet(self.inFileName, self.outFileName) + outfile = open(self.outFileName) + self.assertEquals(">chr1\n", outfile.readline()) + self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline()) + self.assertEquals(">chr0\n", outfile.readline()) + self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline()) + self.assertEquals(">chr2\n", outfile.readline()) + self.assertEquals("GATTACA\n", outfile.readline()) + self.assertEquals("", outfile.readline()) + outfile.close() + os.remove(self.outFileName) + processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName) + outfile = open(self.outFileName) + self.assertEquals(">chr1\n", outfile.readline()) + self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline()) + self.assertEquals(">chr2\n", outfile.readline()) + self.assertEquals("GATTACA\n", outfile.readline()) + self.assertEquals("", outfile.readline()) + outfile.close() + os.remove(self.outFileName) + processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName, minSize=10) + outfile = open(self.outFileName) + self.assertEquals(">chr1\n", outfile.readline()) + self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline()) + self.assertEquals("", outfile.readline()) + outfile.close() + + + def testGetFilterList(self): + self.assertEquals([], processvelvet.getFilterList()) + self.assertEquals(["NODE1-1_0"], processvelvet.getFilterList(self.filterFileName)) + self.assertEquals([], processvelvet.getFilterList("whatfile?")) + + filter = open(self.filterFileName, "a") + filter.write("some fields without the key trigger string\n") + filter.close() + self.assertEquals(["NODE1-1_0"], processvelvet.getFilterList(self.filterFileName)) + + filter = 
open(self.filterFileName, "a") + filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODEAnything\n") + filter.close() + self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName)) + + filter = open(self.filterFileName, "a") + filter.write("0\tNODEWrongField\n") + filter.close() + self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName)) + + filter = open(self.filterFileName, "a") + filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODEAnything\n") + filter.close() + self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName)) + + + def testWriteNode(self): + node = {"contigPrefix": "chr", + "completeID": "", + "currentSeq": "" + } + + counts = {"acceptedSize": 0, + "nSize": 0, + "contigsAccepted": 0, + "filteredSize": 0 + } + + filterList = [] + + outfile = open(self.outFileName, "w") + processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False) + outfile.close() + results = open(self.outFileName) + self.assertEquals("", results.readline()) + results.close() + os.remove(self.outFileName) + + node["completeID"] = "<5" + node["currentSeq"] = "GATTACA\n" + outfile = open(self.outFileName, "w") + processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False) + self.assertEquals(counts["filteredSize"], 7) + counts["filteredSize"] = 0 + outfile.close() + results = open(self.outFileName) + self.assertEquals("", results.readline()) + results.close() + os.remove(self.outFileName) + + node["completeID"] = "NODE1_1" + node["currentSeq"] = "GATTACA\n" + filterList = ["NODE1_1"] + outfile = open(self.outFileName, "w") + processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False) + self.assertEquals(counts["filteredSize"], 7) + counts["filteredSize"] = 0 + outfile.close() + results = open(self.outFileName) + self.assertEquals("", results.readline()) + results.close() + os.remove(self.outFileName) + + node["completeID"] = "NODE1_1" + node["currentSeq"] = "GATTACA\n" + filterList = [] + outfile = open(self.outFileName, "w") + processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False) + self.assertEquals(counts["acceptedSize"], 7) + outfile.close() + results = open(self.outFileName) + self.assertEquals(">chr1\n", results.readline()) + self.assertEquals("GATTACA\n", results.readline()) + self.assertEquals("", results.readline()) + results.close() + os.remove(self.outFileName) + + + def testMain(self): + argv = ["processVelvet", self.inFileName, self.outFileName] + processvelvet.main(argv) + outfile = open(self.outFileName) + for line in outfile: + self.assertEquals("", line) + + os.remove(self.outFileName) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestProcessVelvet)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testRNAPATH.py b/test/testRNAPATH.py new file mode 100644 index 0000000..e217ff1 --- /dev/null +++ b/test/testRNAPATH.py @@ -0,0 +1,210 @@ +''' +Created on Sep 10, 2010 + +@author: sau +''' +import unittest +import os +from Erange.rnapath import RNAPATH + +compDict = {"A": "T", + "T": "A", + "G": "C", + "C": "G", + "S": "S", + "W": "W", + "R": "Y", + "Y": "R", + "M": "K", + "K": "M", + "H": "D", + "D": "H", + "B": "V", + "V": "B", + "N": "N", + "a": "t", + "t": "a", + "g": "c", + "c": "g", + "n": "n", + "z": "z" +} + + +class 
TestRNAPATH(unittest.TestCase): + incontigfilename = "contigIn.txt" + distalPairsfile = "distalPair.txt" + outpathfilename = "rnapathOut.txt" + outcontigfilename = "contigOut.txt" + + def setUp(self): + inContigs = open(self.incontigfilename, "w") + inContigs.close() + + distal = open(self.distalPairsfile, "w") + distal.close() + + + def tearDown(self): + try: + os.remove(self.incontigfilename) + except OSError: + pass + + try: + os.remove(self.distalPairsfile) + except OSError: + pass + + try: + os.remove(self.outpathfilename) + except OSError: + pass + + try: + os.remove(self.outcontigfilename) + except OSError: + pass + + + def testCompNT(self): + for nt in compDict.keys(): + self.assertEquals(compDict[nt], RNAPATH.compNT(nt)) + + self.assertEquals("N", RNAPATH.compNT("5")) + self.assertEquals("N", RNAPATH.compNT("anything")) + + + def testComplement(self): + self.assertEquals("", RNAPATH.complement("")) + for nt in compDict.keys(): + self.assertEquals(compDict[nt], RNAPATH.complement(nt)) + + self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA")) + self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA", 7)) + self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA", -75632)) + self.assertEquals("TGTA", RNAPATH.complement("GATTACA", 4)) + + #TODO: do we want to return when length > seqlength? This is + # the current return and it seems very wrong we only N fill + # after going more then seqlength in negative direction + self.assertEquals("TGTAATCTG", RNAPATH.complement("GATTACA", 9)) + self.assertEquals("TGTAATCTGTAATCNNNNN", RNAPATH.complement("GATTACA", 19)) + + #TODO: write test + def testRnaPath(self): + RNAPATH.rnaPath(self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename) + outfile = open(self.outpathfilename) + self.assertTrue("#settings:" in outfile.readline()) + self.assertEquals("", outfile.readline()) + outfile.close() + outcontig = open(self.outcontigfilename) + self.assertEquals(0, len(outcontig.readlines())) + outcontig.close() + + #infile = open(self.incontigfilename, "w") + #infile.write(">chr1 stuff\n") + #infile.write("GATTACA\n") + #infile.close() + #RNAPATH.rnaPath(self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename) + #outfile = open(self.outpathfilename) + #self.assertTrue("#settings:" in outfile.readline()) + #self.assertEquals("", outfile.readline()) + #outfile.close() + + + #TODO: write test + def testGetPath(self): + pass + + + #TODO: write test + def testTraverseGraph(self): + leafList = [] + edgeMatrix = RNAPATH.EdgeMatrix(0) + pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix) + self.assertEquals([], pathList) + self.assertEquals({}, visitedDict) + + leafList = [1] + edgeMatrix = RNAPATH.EdgeMatrix(3) + edgeMatrix.edgeArray[2][1] = 3 + edgeMatrix.edgeArray[1][2] = 3 + pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix) + self.assertEquals([ [1, 2] ], pathList) + self.assertEquals({1: "", 2: ""}, visitedDict) + + leafList = [1, 2] + edgeMatrix = RNAPATH.EdgeMatrix(3) + edgeMatrix.edgeArray[2][1] = 3 + edgeMatrix.edgeArray[1][2] = 3 + pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix) + self.assertEquals([ [1, 2] ], pathList) + self.assertEquals({1: "", 2: ""}, visitedDict) + + + #TODO: write test + def testGetContigsFromFile(self): + contigNum, nameList, contigDict, origSize = RNAPATH.getContigsFromFile(self.incontigfilename) + self.assertEquals(0, contigNum) + self.assertEquals([], nameList) + self.assertEquals({}, contigDict) 
+ self.assertEquals([], origSize) + + + #TODO: check for boundary condition and special cases + def testEdgeMatrix(self): + edgeMatrix = RNAPATH.EdgeMatrix(0) + result = "[]" + self.assertEquals(result, str(edgeMatrix.edgeArray)) + + edgeMatrix = RNAPATH.EdgeMatrix(3) + result = "[[0 0 0]\n [0 0 0]\n [0 0 0]]" + self.assertEquals(result, str(edgeMatrix.edgeArray)) + self.assertEquals([], edgeMatrix.visitLink(0)) + + edgeMatrix.edgeArray[0][1] = 1 + self.assertEquals([], edgeMatrix.visitLink(0)) + + edgeMatrix.edgeArray[0][1] = 2 + result = [0] + self.assertEquals(result, edgeMatrix.visitLink(0)) + + edgeMatrix.edgeArray[2][1] = 2 + result = [] + self.assertEquals(result, edgeMatrix.visitLink(0)) + edgeMatrix.edgeArray[2][1] = 2 + result = [] + self.assertEquals(result, edgeMatrix.visitLink(1)) + edgeMatrix.edgeArray[2][1] = 2 + result = [2] + self.assertEquals(result, edgeMatrix.visitLink(2)) + + edgeMatrix.edgeArray[2][1] = 3 + edgeMatrix.edgeArray[1][2] = 3 + result = [1, 2] + self.assertEquals(result, edgeMatrix.visitLink(1)) + + + def testMain(self): + argv = ["RNAPATH", self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename] + RNAPATH.main(argv) + outfile = open(self.outpathfilename) + self.assertTrue("#settings:" in outfile.readline()) + self.assertEquals("", outfile.readline()) + outfile.close() + outcontig = open(self.outcontigfilename) + self.assertEquals(0, len(outcontig.readlines())) + outcontig.close() + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestRNAPATH)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testReadDataset.py b/test/testReadDataset.py new file mode 100644 index 0000000..3ac8f54 --- /dev/null +++ b/test/testReadDataset.py @@ -0,0 +1,935 @@ +''' +Created on Jul 21, 2010 + +@author: sau +''' +import unittest +import os +import sqlite3 as sqlite +from Erange import ReadDataset + +testDBName = "testRDS.rds" +rnaTestDBName = "testRDSRNA.rds" + +class TestReadDataset(unittest.TestCase): + + + def setUp(self): + self.rds = ReadDataset.ReadDataset(testDBName, initialize=True, datasetType="DNA", verbose=False) + self.rnaRds = ReadDataset.ReadDataset(rnaTestDBName, initialize=True, datasetType="RNA", verbose=False) + + + def tearDown(self): + del(self.rds) + os.remove(testDBName) + del(self.rnaRds) + os.remove(rnaTestDBName) + + + #TODO: rename and integrate + def testZeeNewStuff(self): + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", ""), + ("dup start", "chr1", 1, 150, "+", 1.0, "", ""), + ("new read", "chr1", 80, 100, "+", 1.0, "", ""), + ("testRead", "chr2", 201, 400, "+", 1.0, "", ""), + ("dup start", "chr2", 201, 450, "+", 1.0, "", ""), + ("new read", "chr2", 280, 400, "+", 1.0, "", ""), + ("three up", "chr3", 1, 80, "+", 1.0, "", ""), + ("three two", "chr3", 201, 230, "+", 1.0, "", "") + ] + self.rds.insertUniqs(rdsEntryList) + dbcon = sqlite.connect(testDBName) + sql = dbcon.cursor() + sql.execute("select chrom,start from uniqs group by chrom,start having ( count(start) > 1 and count(chrom) > 1)") + result = [("chr1", 1), ("chr2", 201)] + for eachEntry in sql.fetchall(): + self.assertTrue(eachEntry in result) + + sql.execute("select chrom,start from uniqs group by chrom,start having ( count(start) = 1 and count(chrom) = 1)") + result = [("chr1", 80), ("chr2", 280), ("chr3", 1), ("chr3", 201)] + for eachEntry in sql.fetchall(): + self.assertTrue(eachEntry in result) + + 
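+ # The two GROUP BY/HAVING queries above split the (chrom, start) groups into
+ # duplicated starts (count > 1) and singletons (count = 1). The union query
+ # below then selects one representative row per (chrom, start) group, which
+ # looks like a prototype for collapsing reads that share a start position;
+ # SQLite returns the non-aggregated columns from one (effectively arbitrary)
+ # row of each group, and the expected ids (2 and 5) assume the later
+ # duplicate is the one that survives.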
sql.execute("select * from uniqs group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union select * from uniqs group by chrom,start having ( count(start) = 1 and count(chrom) = 1)") + result = [(2, "dup start", "chr1", 1, 150, "+", 1.0, "", ""), + (3, "new read", "chr1", 80, 100, "+", 1.0, "", ""), + (5, "dup start", "chr2", 201, 450, "+", 1.0, "", ""), + (6, "new read", "chr2", 280, 400, "+", 1.0, "", ""), + (7, "three up", "chr3", 1, 80, "+", 1.0, "", ""), + (8, "three two", "chr3", 201, 230, "+", 1.0, "", "") + ] + for eachEntry in sql.fetchall(): + self.assertTrue(eachEntry in result) + + sql.execute("select chrom,start from uniqs where start > 100 group by chrom,start having ( count(start) > 1 and count(chrom) > 1) order by chrom,start") + result = [("chr2", 201)] + for eachEntry in sql.fetchall(): + self.assertTrue(eachEntry in result) + + + rdsEntryList = [("testMultiRead", "chr1", 1, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 1, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr2", 80, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr2", 1, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr2", 5000, 25000, "+", 0.5, "", ""), + ("testMultiRead", "chr3", 1, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr3", 70, 500, "+", 0.5, "", "") + ] + self.rds.insertMulti(rdsEntryList) + sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) > 1 and count(chrom) > 1)") + result = [("chr1", 1), ("chr2", 201), ("chr3", 1)] + for eachEntry in sql.fetchall(): + self.assertTrue(eachEntry in result) + + sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) = 1 and count(chrom) = 1)") + result = [("chr1", 80), + ("chr2", 1), ("chr2", 80), ("chr2", 280), ("chr2", 5000), + ("chr3", 70), ("chr3", 201) + ] + for eachEntry in sql.fetchall(): + self.assertTrue(eachEntry in result) + + sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) = 1 and count(chrom) = 1)") + result = sql.fetchall() + result = [("chr1", 1), ("chr1", 80), + ("chr2", 1), ("chr2", 80), ("chr2", 201), ("chr2", 280), ("chr2", 5000), + ("chr3", 1), ("chr3", 70), ("chr3", 201) + ] + for eachEntry in sql.fetchall(): + self.assertTrue(eachEntry in result) + + result = {"1": [{"start": 1, "sense": "+"}, {"start": 80, "sense": "+"}], + "3": [{"start": 1, "sense": "+"}, {"start": 70, "sense": "+"}, {"start": 201, "sense": "+"}], + "2": [{"start": 1, "sense": "+"}, {"start": 80, "sense": "+"}, {"start": 201, "sense": "+"}, {"start": 280, "sense": "+"}, {"start": 5000, "sense": "+"}] + } + self.assertEquals(result, self.rds.getReadsDict(combine5p=True, doMulti=True)) + + print self.rds.getReadsDict(combine5p=True, doMulti=True, withWeight=True) + + def testReadDatasetBuiltIns(self): + # Initialize an existing rds file + self.assertRaises(sqlite.OperationalError, ReadDataset.ReadDataset, testDBName, initialize=True, datasetType="DNA", verbose=True) + self.assertEquals(0, len(self.rds)) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(1, len(self.rds)) + + rdsEntryList = [("testMultiRead", "chr1", 
101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + self.assertEquals(2, len(self.rds)) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + self.assertRaises(sqlite.OperationalError, self.rds.insertSplices, rdsEntryList) + self.rnaRds.insertSplices(rdsEntryList) + self.assertEquals(2, len(self.rds)) + self.assertEquals(1, len(self.rnaRds)) + + + def testInsertUniqs(self): + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(1, len(self.rds)) + + rdsEntryList = [("testRead2", "chr1", 200, 300, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(2, len(self.rds)) + + + def testInsertMulti(self): + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + self.assertEquals(1, len(self.rds)) + + + def testInsertSplices(self): + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + self.assertRaises(sqlite.OperationalError, self.rds.insertSplices, rdsEntryList) + self.rnaRds.insertSplices(rdsEntryList) + self.assertEquals(0, len(self.rds)) + self.assertEquals(1, len(self.rnaRds)) + + + def testGetChromosomes(self): + result = [] + self.assertEqual(result, self.rds.getChromosomes(table="uniqs", fullChrom=True)) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + result = ["chr1"] + self.assertEqual(result, self.rds.getChromosomes(table="uniqs", fullChrom=True)) + + self.assertRaises(sqlite.OperationalError, self.rds.getChromosomes, table="badTableName") + + + #TODO: write unit test + def testAttachDB(self): + pass + + + #TODO: write unit test + def testDetachDB(self): + pass + + + #TODO: write unit test + def testImportFromDB(self): + pass + + + def testGetTables(self): + result = ["metadata", "uniqs", "multi"] + self.assertEquals(result, self.rds.getTables()) + + result = ["metadata", "uniqs", "multi", "splices"] + self.assertEquals(result, self.rnaRds.getTables()) + + + def testHasIndex(self): + self.assertFalse(self.rds.hasIndex()) + self.rds.buildIndex() + self.assertTrue(self.rds.hasIndex()) + + + def testGetMetadata(self): + returnDict = self.rds.getMetadata() + self.assertTrue(returnDict.has_key("rdsVersion")) + self.assertEquals(returnDict["dataType"], "DNA") + + result = {"dataType": "RNA"} + self.assertEquals(result, self.rnaRds.getMetadata("dataType")) + + result = {} + self.assertEquals(result, self.rds.getMetadata("badMetaDataName")) + + + def testGetReadSize(self): + self.assertRaises(ReadDataset.ReadDatasetError, self.rds.getReadSize) + + self.rds.insertMetadata([("readsize", "100")]) + self.assertEquals(100, self.rds.getReadSize()) + + self.rds.updateMetadata("readsize", 100) + self.assertEquals(100, self.rds.getReadSize()) + + self.rds.updateMetadata("readsize", "100 import") + self.assertEquals(100, self.rds.getReadSize()) + + self.rds.updateMetadata("readsize", "badReadSize") + self.assertRaises(ValueError, self.rds.getReadSize) + + + def testGetDefaultCacheSize(self): + self.assertEquals(100000, self.rds.getDefaultCacheSize()) + + + def testGetMaxCoordinate(self): + self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr1")) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + self.assertEquals(1, 
self.rnaRds.getMaxCoordinate("chr1")) + self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr2")) + self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr1", doUniqs=False)) + + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rnaRds.insertMulti(rdsEntryList) + self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1")) + self.assertEquals(101, self.rnaRds.getMaxCoordinate("chr1", doMulti=True)) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + self.rnaRds.insertSplices(rdsEntryList) + self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1")) + self.assertEquals(101, self.rnaRds.getMaxCoordinate("chr1", doMulti=True)) + self.assertEquals(1150, self.rnaRds.getMaxCoordinate("chr1", doSplices=True)) + + + def testGetReadsDict(self): + self.assertEquals({}, self.rds.getReadsDict()) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + reads = self.rds.getReadsDict() + self.assertEquals(1, len(reads)) + self.assertEquals(1, len(reads["1"])) + read = reads["1"][0] + self.assertEquals(["start", "sense"], read.keys()) + self.assertEquals(1, read["start"]) + self.assertEquals("+", read["sense"]) + + reads = self.rds.getReadsDict(bothEnds=True, noSense=False, fullChrom=True, + withWeight=True, withFlag=True, withMismatch=True, withID=True, + withChrom=True, readIDDict=True) + self.assertEquals(1, len(reads)) + self.assertEquals(1, len(reads["testRead"])) + read = reads["testRead"][0] + self.assertEquals(["readID", "weight", "stop", "mismatch","start", "flag","sense", "chrom"], read.keys()) + self.assertEquals("testRead", read["readID"]) + self.assertEquals(1.0, read["weight"]) + self.assertEquals(100, read["stop"]) + self.assertEquals("", read["mismatch"]) + self.assertEquals(1, read["start"]) + self.assertEquals("", read["flag"]) + self.assertEquals("+", read["sense"]) + self.assertEquals("chr1", read["chrom"]) + + self.assertEquals({}, self.rds.getReadsDict(hasMismatch=True)) + self.assertEquals({}, self.rds.getReadsDict(strand="-")) + self.assertEquals(1, len(self.rds.getReadsDict(strand="+"))) + + rdsEntryList = [("testRead2", "chr1", 201, 300, "-", 1.0, "A", "G22A")] + self.rds.insertUniqs(rdsEntryList) + reads = self.rds.getReadsDict() + self.assertEquals(1, len(reads)) + reads = self.rds.getReadsDict() + self.assertEquals(2, len(reads["1"])) + read = reads["1"][1] + self.assertEquals(201, read["start"]) + reads = self.rds.getReadsDict(strand="+") + self.assertEquals(1, len(reads)) + read = reads["1"][0] + self.assertEquals("+", read["sense"]) + reads = self.rds.getReadsDict(strand="-") + self.assertEquals(1, len(reads)) + reads = self.rds.getReadsDict(start=199) + self.assertEquals(1, len(reads["1"])) + reads = self.rds.getReadsDict(hasMismatch=True) + self.assertEquals(1, len(reads["1"])) + + rdsEntryList = [("testMultiRead", "chr2", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr2", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + reads = self.rds.getReadsDict() + self.assertEquals(1, len(reads)) + reads = self.rds.getReadsDict(doMulti=True) + self.assertEquals(2, len(reads)) + reads = self.rds.getReadsDict(doUniqs=False, doMulti=True) + self.assertFalse(reads.has_key("1")) + + + def testGetSplicesDict(self): + self.assertRaises(sqlite.OperationalError, self.rds.getSplicesDict) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + 
self.rnaRds.insertSplices(rdsEntryList) + reads = self.rnaRds.getSplicesDict() + self.assertEquals(1, len(reads)) + self.assertEquals(1, len(reads["1"])) + read = reads["1"][0] + result = ["startR", "stopL", "sense", "startL", "stopR"] + self.assertEquals(result, read.keys()) + self.assertEquals(1000, read["startL"]) + self.assertEquals("+", read["sense"]) + reads = self.rnaRds.getSplicesDict(splitRead=True) + self.assertEquals(2, len(reads["1"])) + self.assertEquals(1000, reads["1"][0]["startL"]) + self.assertFalse(reads["1"][0].has_key("startR")) + self.assertFalse(reads["1"][0].has_key("stopR")) + self.assertEquals(1150, reads["1"][1]["startR"]) + self.assertFalse(reads["1"][1].has_key("startL")) + self.assertFalse(reads["1"][1].has_key("stopL")) + self.assertEquals(reads["1"][0]["sense"], reads["1"][1]["sense"]) + + reads = self.rnaRds.getSplicesDict(noSense=False, fullChrom=True, + withWeight=True, withFlag=True, withMismatch=True, withID=True, + withChrom=True, readIDDict=True) + self.assertEquals(1, len(reads)) + self.assertEquals(1, len(reads["testSpliceRead"])) + read = reads["testSpliceRead"][0] + result = ["readID", "weight", "startR", "mismatch","stopR", "stopL", "flag", "startL", "sense", "chrom"] + self.assertEquals(result, read.keys()) + self.assertEquals("testSpliceRead", read["readID"]) + self.assertEquals(1.0, read["weight"]) + self.assertEquals(1150, read["startR"]) + self.assertEquals("", read["mismatch"]) + self.assertEquals(1200, read["stopR"]) + self.assertEquals(1100, read["stopL"]) + self.assertEquals("", read["flag"]) + self.assertEquals(1000, read["startL"]) + self.assertEquals("+", read["sense"]) + self.assertEquals("chr1", read["chrom"]) + + self.assertEquals({}, self.rnaRds.getSplicesDict(hasMismatch=True)) + self.assertEquals({}, self.rnaRds.getSplicesDict(strand="-")) + self.assertEquals(1, len(self.rnaRds.getSplicesDict(strand="+"))) + + rdsEntryList = [("testSpliceRead2", "chr1", 2000, 2100, 2150, 2200, "-", 1.0, "A", "G20T")] + self.rnaRds.insertSplices(rdsEntryList) + reads = self.rnaRds.getSplicesDict() + self.assertEquals(1, len(reads)) + reads = self.rnaRds.getSplicesDict() + self.assertEquals(2, len(reads["1"])) + read = reads["1"][1] + self.assertEquals(2000, read["startL"]) + reads = self.rnaRds.getSplicesDict(strand="+") + self.assertEquals(1, len(reads)) + read = reads["1"][0] + self.assertEquals("+", read["sense"]) + reads = self.rnaRds.getSplicesDict(strand="-") + self.assertEquals(1, len(reads)) + reads = self.rnaRds.getSplicesDict(start=1199) + self.assertEquals(1, len(reads["1"])) + reads = self.rnaRds.getSplicesDict(hasMismatch=True) + self.assertEquals(1, len(reads["1"])) + + rdsEntryList = [("testSpliceRead3", "chr2", 2000, 2100, 2150, 2200, "-", 1.0, "A", "G20T")] + self.rnaRds.insertSplices(rdsEntryList) + reads = self.rnaRds.getSplicesDict() + self.assertEquals(2, len(reads)) + self.assertEquals(2, len(reads["1"])) + self.assertEquals(1, len(reads["2"])) + reads = self.rnaRds.getSplicesDict(withID=True, chrom="chr2") + self.assertFalse(reads.has_key("1")) + self.assertEquals("testSpliceRead3", reads["2"][0]["readID"]) + + + def testGetCounts(self): + self.assertEquals(0, self.rds.getCounts()) + self.assertEquals((0, 0, 0), self.rds.getCounts(multi=True, reportCombined=False)) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(1, self.rds.getCounts()) + self.assertEquals((1, 0, 0), self.rds.getCounts(multi=True, reportCombined=False)) + + rdsEntryList = 
[("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + self.assertEquals(2, self.rds.getCounts(multi=True)) + self.assertEquals((1, 1, 0), self.rds.getCounts(multi=True, reportCombined=False)) + + self.assertEquals(1, self.rds.getCounts(chrom="chr1")) + self.assertEquals(0, self.rds.getCounts(chrom="chr2")) + self.assertEquals(1, self.rds.getCounts(rmin=1)) + self.assertEquals(1, self.rds.getCounts(rmin=1, rmax=1000)) + self.assertEquals(1, self.rds.getCounts(rmax=1000)) + self.assertEquals(0, self.rds.getCounts(rmin=1000)) + self.assertEquals(0, self.rds.getCounts(rmax=0)) + self.assertEquals(1, self.rds.getCounts(sense="+")) + self.assertEquals(0, self.rds.getCounts(sense="-")) + + self.assertEquals(0, self.rnaRds.getCounts()) + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + self.rnaRds.insertSplices(rdsEntryList) + self.assertEquals(1, self.rnaRds.getCounts(splices=True)) + + + def testGetTotalCounts(self): + self.assertEquals(0, self.rds.getTotalCounts()) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(1, self.rds.getTotalCounts()) + + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + + self.assertEquals(2, self.rds.getTotalCounts()) + self.assertEquals(2, self.rds.getTotalCounts(chrom="chr1")) + self.assertEquals(0, self.rds.getTotalCounts(chrom="chr2")) + self.assertEquals(2, self.rds.getTotalCounts(rmin=1)) + self.assertEquals(2, self.rds.getTotalCounts(rmax=1000)) + self.assertEquals(1, self.rds.getTotalCounts(rmin=101, rmax=1000)) + self.assertEquals(1, self.rds.getTotalCounts(rmin=1, rmax=100)) + self.assertEquals(0, self.rds.getTotalCounts(rmin=1000)) + self.assertEquals(0, self.rds.getTotalCounts(rmax=0)) + + + def testGetTableEntryCount(self): + table = "uniqs" + self.assertEquals(0, self.rds.getTableEntryCount(table)) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(1, self.rds.getTableEntryCount(table)) + + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + + self.assertEquals(1, self.rds.getTableEntryCount(table)) + self.assertEquals(1, self.rds.getTableEntryCount(table, chrom="chr1")) + self.assertEquals(0, self.rds.getTableEntryCount(table, chrom="chr2")) + self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=1)) + self.assertEquals(1, self.rds.getTableEntryCount(table, rmax=1000)) + self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=101, rmax=1000)) + self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=1000)) + self.assertEquals(0, self.rds.getTableEntryCount(table, rmax=0)) + self.assertEquals(1, self.rds.getTableEntryCount(table, restrict=" sense ='+' ")) + self.assertEquals(0, self.rds.getTableEntryCount(table, restrict=" sense ='-' ")) + self.assertEquals(1, self.rds.getTableEntryCount(table, distinct=True)) + + table="multi" + self.assertEquals(1, self.rds.getTableEntryCount(table)) + self.assertEquals(1, self.rds.getTableEntryCount(table, chrom="chr1")) + self.assertEquals(0, self.rds.getTableEntryCount(table, chrom="chr2")) + self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=1)) + self.assertEquals(1, 
self.rds.getTableEntryCount(table, rmax=1000)) + self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=101, rmax=1000)) + self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=1000)) + self.assertEquals(0, self.rds.getTableEntryCount(table, rmax=0)) + self.assertEquals(1, self.rds.getTableEntryCount(table, restrict=" sense ='+' ")) + self.assertEquals(0, self.rds.getTableEntryCount(table, restrict=" sense ='-' ")) + self.assertEquals(1, self.rds.getTableEntryCount(table, distinct=True)) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + self.rnaRds.insertSplices(rdsEntryList) + table="splices" + self.assertEquals(1, self.rnaRds.getTableEntryCount(table)) + self.assertEquals(1, self.rnaRds.getTableEntryCount(table, chrom="chr1")) + self.assertEquals(0, self.rnaRds.getTableEntryCount(table, chrom="chr2")) + self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmin=1, startField="startL")) + self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, rmin=1) + self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmax=2000, startField="startL")) + self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, rmax=2000) + self.assertEquals(0, self.rnaRds.getTableEntryCount(table, rmax=999, startField="startL")) + self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmin=1000, startField="startL")) + self.assertEquals(0, self.rnaRds.getTableEntryCount(table, rmax=0, startField="startL")) + self.assertEquals(1, self.rnaRds.getTableEntryCount(table, restrict=" sense ='+' ")) + self.assertEquals(0, self.rnaRds.getTableEntryCount(table, restrict=" sense ='-' ")) + self.assertEquals(1, self.rnaRds.getTableEntryCount(table, distinct=True, startField="startL")) + self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, distinct=True) + + + def testGetUniqsCount(self): + self.assertEquals(0, self.rds.getUniqsCount()) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(1, self.rds.getUniqsCount()) + + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + + self.assertEquals(1, self.rds.getUniqsCount()) + self.assertEquals(1, self.rds.getUniqsCount(chrom="chr1")) + self.assertEquals(0, self.rds.getUniqsCount(chrom="chr2")) + self.assertEquals(1, self.rds.getUniqsCount(rmin=1)) + self.assertEquals(1, self.rds.getUniqsCount(rmax=1000)) + self.assertEquals(0, self.rds.getUniqsCount(rmin=101, rmax=1000)) + self.assertEquals(0, self.rds.getUniqsCount(rmin=1000)) + self.assertEquals(0, self.rds.getUniqsCount(rmax=0)) + self.assertEquals(1, self.rds.getUniqsCount(restrict=" sense ='+' ")) + self.assertEquals(0, self.rds.getUniqsCount(restrict=" sense ='-' ")) + self.assertEquals(1, self.rds.getUniqsCount(distinct=True)) + + + def testGetSplicesCount(self): + self.assertEquals(0, self.rnaRds.getSplicesCount()) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + self.assertEquals(0, self.rnaRds.getSplicesCount()) + + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rnaRds.insertMulti(rdsEntryList) + self.assertEquals(0, self.rnaRds.getSplicesCount()) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + 
self.rnaRds.insertSplices(rdsEntryList) + + self.assertEquals(1, self.rnaRds.getSplicesCount()) + self.assertEquals(1, self.rnaRds.getSplicesCount(chrom="chr1")) + self.assertEquals(0, self.rnaRds.getSplicesCount(chrom="chr2")) + self.assertEquals(1, self.rnaRds.getSplicesCount(rmin=1)) + self.assertEquals(1, self.rnaRds.getSplicesCount(rmax=2000)) + self.assertEquals(0, self.rnaRds.getSplicesCount(rmax=999)) + self.assertEquals(1, self.rnaRds.getSplicesCount(rmin=1000)) + self.assertEquals(0, self.rnaRds.getSplicesCount(rmax=0)) + self.assertEquals(1, self.rnaRds.getSplicesCount(restrict=" sense ='+' ")) + self.assertEquals(0, self.rnaRds.getSplicesCount(restrict=" sense ='-' ")) + self.assertEquals(1, self.rnaRds.getSplicesCount(distinct=True)) + + + def testGetMultiCount(self): + self.assertEquals(0, self.rds.getMultiCount()) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + self.assertEquals(0, self.rds.getMultiCount()) + + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rds.insertMulti(rdsEntryList) + + self.assertEquals(1, self.rds.getMultiCount()) + self.assertEquals(1, self.rds.getMultiCount(chrom="chr1")) + self.assertEquals(0, self.rds.getMultiCount(chrom="chr2")) + self.assertEquals(1, self.rds.getMultiCount(rmin=1)) + self.assertEquals(1, self.rds.getMultiCount(rmax=1000)) + self.assertEquals(0, self.rds.getMultiCount(rmin=1, rmax=100)) + self.assertEquals(0, self.rds.getMultiCount(rmin=1000)) + self.assertEquals(0, self.rds.getMultiCount(rmax=0)) + self.assertEquals(1, self.rds.getMultiCount(restrict=" sense ='+' ")) + self.assertEquals(0, self.rds.getMultiCount(restrict=" sense ='-' ")) + self.assertEquals(1, self.rds.getMultiCount(distinct=True)) + + + def testGetReadIDs(self): + self.assertEquals([], self.rnaRds.getReadIDs()) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + result = ["testRead"] + self.assertEquals(result, self.rnaRds.getReadIDs()) + + rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")] + self.rnaRds.insertMulti(rdsEntryList) + result = ["testRead"] + self.assertEquals(result, self.rnaRds.getReadIDs()) + result = ["testMultiRead", "testRead"] + self.assertEquals(result, self.rnaRds.getReadIDs(multi=True)) + + rdsEntryList = [("testRead2", "chr1", 201, 300, "+", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + result = ["testRead", "testRead2"] + self.assertEquals(result, self.rnaRds.getReadIDs()) + result = ["testRead"] + self.assertEquals(result, self.rnaRds.getReadIDs(limit=1)) + result = ["testMultiRead"] + self.assertEquals(result, self.rnaRds.getReadIDs(multi=True, limit=1)) + + rdsEntryList = [("testPair/1", "chr1", 301, 400, "+", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + result = ["testPair", "testRead", "testRead2"] + self.assertEquals(result, self.rnaRds.getReadIDs(paired=True)) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + self.rnaRds.insertSplices(rdsEntryList) + result = ["testSpliceRead"] + self.assertEquals(result, self.rnaRds.getReadIDs(uniqs=False, splices=True)) + result = ["testPair/1", "testRead", "testRead2", "testSpliceRead"] + self.assertEquals(result, self.rnaRds.getReadIDs(splices=True)) + + + def testGetMismatches(self): + self.assertRaises(ReadDataset.ReadDatasetError, 
self.rds.getMismatches) + self.rds.insertMetadata([("readsize", "5")]) + + rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + result = {"chr1": []} + self.assertEquals(result, self.rds.getMismatches()) + + rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "C3T")] + self.rds.insertUniqs(rdsEntryList) + result = {"chr1": [[1, 3, "T", "C"]]} + self.assertEquals(result, self.rds.getMismatches()) + result = {"chr2": []} + self.assertEquals(result, self.rds.getMismatches(mischrom="chr2")) + + rdsEntryList = [("testRead", "chr1", 10, 15, "+", 1.0, "", "C3T")] + self.rds.insertUniqs(rdsEntryList) + result = {"chr1": [[1, 3, "T", "C"], [10, 12, "T", "C"]]} + self.assertEquals(result, self.rds.getMismatches()) + + rdsEntryList = [("testRead", "chr2", 10, 15, "+", 1.0, "", "C3T")] + self.rds.insertUniqs(rdsEntryList) + result = {"chr1": [[1, 3, "T", "C"], [10, 12, "T", "C"]], + "chr2": [[10, 12, "T", "C"]]} + self.assertEquals(result, self.rds.getMismatches()) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "C41T")] + self.rnaRds.insertSplices(rdsEntryList) + self.rnaRds.insertMetadata([("readsize", "150")]) + result = {"chr1": [[1000, 1040, "T", "C"]]} + #TODO: This test case fails. If there are only splice entries for a chromosome it shouldn't + # be necessary to specify the chromosome. + #self.assertEquals(result, self.rnaRds.getMismatches()) + self.assertEquals(result, self.rnaRds.getMismatches(mischrom="chr1")) + + + #TODO: needs fixing up + def testGetChromProfile(self): + chromProfile = self.rds.getChromProfile("chr1") + result = [] + self.assertEquals(result, chromProfile.tolist()) + + rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + chromProfile = self.rds.getChromProfile("chr1") + result = [] + self.assertEquals(result, chromProfile.tolist()) + + self.rds.insertMetadata([("readsize", "5")]) + chromProfile = self.rds.getChromProfile("chr1") + result = [0.0, 1.0, 1.0, 1.0, 1.0] + self.assertEquals(result, chromProfile.tolist()) + + rdsEntryList = [("testRead2", "chr1", 7, 11, "+", 1.0, "", "")] + self.rds.insertUniqs(rdsEntryList) + # This doesn't seem to make sense the default behavior is to only get the first readlen bases + chromProfile = self.rds.getChromProfile("chr1") + result = [0.0, 1.0, 1.0, 1.0, 1.0] + self.assertEquals(result, chromProfile.tolist()) + + # as it stands this doesn't see right either. Getting an indexError at currentpos 5. 
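+ # The first chr1 profile above had length 5 (positions 0..4, from the 5 bp
+ # read size), so an IndexError at position 5 is consistent with the profile
+ # array still being allocated at that read-derived length even when cstop=11
+ # is requested below. That reading is an inference from the failure position
+ # noted above, not a confirmed trace through getChromProfile.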
+ chromProfile = self.rds.getChromProfile("chr1", cstop=11) + result = [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + self.assertEquals(result, chromProfile.tolist()) + + + def testInsertMetadata(self): + result = {} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + self.rds.insertMetadata([("testMeta", "100")]) + result = {"testMeta": "100"} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + self.rds.insertMetadata([("testMeta", "200")]) + result = {"testMeta:2": "200", "testMeta": "100"} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + + def testUpdateMetadata(self): + result = {} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + self.rds.insertMetadata([("testMeta", "100")]) + result = {"testMeta": "100"} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + self.rds.updateMetadata("testMeta", "200") + result = {"testMeta": "200"} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + self.rds.updateMetadata("testMeta", "300", "200") + result = {"testMeta": "300"} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + self.rds.updateMetadata("testMeta", "200", "200") + result = {"testMeta": "300"} + self.assertEquals(result, self.rds.getMetadata("testMeta")) + + + def testFlagReads(self): + readData = self.rnaRds.getReadsDict(withFlag=True) + self.assertEquals({}, readData) + + rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + result = [""] + flags = self.getRDSFlags("1", self.rnaRds) + self.assertEquals(result, flags) + + regions = [()] + self.assertRaises(sqlite.ProgrammingError, self.rnaRds.flagReads, regions) + + regions = [("test", "chr1", "0", "1000")] + self.rnaRds.flagReads(regions) + result = ["test"] + flags = self.getRDSFlags("1", self.rnaRds) + self.assertEquals(result, flags) + + regions = [("test2", "chr1", "600", "1000")] + self.rnaRds.flagReads(regions) + result = ["test"] + flags = self.getRDSFlags("1", self.rnaRds) + self.assertEquals(result, flags) + + rdsEntryList = [("testRead2", "chr1", 101, 200, "+", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + regions = [("test2", "chr1", "101", "1000")] + self.rnaRds.flagReads(regions) + result = ["test", "test2"] + flags = self.getRDSFlags("1", self.rnaRds) + self.assertEquals(result, flags) + + rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "", ""), + ("testMultiRead", "chr1", 201, 300, "+", 0.5, "", "")] + self.rnaRds.insertMulti(rdsEntryList) + regions = [("test", "chr1", "0", "1000")] + self.rnaRds.flagReads(regions) + result = ["test", "test", "", ""] + flags = self.getRDSFlags("1", self.rnaRds, doMulti=True) + self.assertEquals(result, flags) + + regions = [("multi", "chr1", "1", "1000")] + self.rnaRds.flagReads(regions, uniqs=False, multi=True) + result = ["test", "test", "multi", "multi"] + flags = self.getRDSFlags("1", self.rnaRds, doMulti=True) + self.assertEquals(result, flags) + + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")] + self.rnaRds.insertSplices(rdsEntryList) + regions = [("test", "chr1", "0", "1500")] + self.rnaRds.flagReads(regions) + result = ["", "test", "test", "multi", "multi"] + flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True) + self.assertEquals(result, flags) + + regions = [("splice", "chr1", "1", "1500")] + self.rnaRds.flagReads(regions, uniqs=False, multi=False, splices=True) + result = [" L:splice R:splice", "test", "test", "multi", "multi"] + 
flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True) + self.assertEquals(result, flags) + + rdsEntryList = [("testNegSense", "chr1", 301, 400, "-", 1.0, "", "")] + self.rnaRds.insertUniqs(rdsEntryList) + regions = [("test", "chr1", "0", "1500", "+")] + self.rnaRds.flagReads(regions, sense="anythingBut'Both'") + result = ["test", "test", ""] + flags = self.getRDSFlags("1", self.rnaRds) + self.assertEquals(result, flags) + + regions = [("neg", "chr1", "0", "1500", "-")] + self.rnaRds.flagReads(regions, sense="anythingBut'Both'") + result = ["test", "test", "neg"] + flags = self.getRDSFlags("1", self.rnaRds) + self.assertEquals(result, flags) + + + def getRDSFlags(self, chromosome, rds, doMulti=False, splice=False): + if splice: + readData = rds.getSplicesDict(withFlag=True) + else: + readData = rds.getReadsDict(withFlag=True, doMulti=doMulti) + + flags = [] + for read in readData[chromosome]: + flags.append(read["flag"]) + + if splice: + nonSplice = self.getRDSFlags(chromosome, rds, doMulti, splice=False) + for flag in nonSplice: + flags.append(flag) + + return flags + + + def testSetFlags(self): + rdsEntryList = [("test", "chr1", 1, 100, "+", 1.0, "uniq", "")] + self.rds.insertUniqs(rdsEntryList) + self.rnaRds.insertUniqs(rdsEntryList) + rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""), + ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")] + self.rnaRds.insertMulti(rdsEntryList) + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "splice", "")] + self.rnaRds.insertSplices(rdsEntryList) + + result = ["reset"] + self.rds.setFlags("reset") + flags = self.getRDSFlags("1", self.rds) + self.assertEquals(result, flags) + + result = ["splice", "uniq", "resetMulti", "resetMulti"] + self.rnaRds.setFlags("resetMulti", uniqs=False, splices=False) + flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True) + self.assertEquals(result, flags) + + result = ["resetAll", "resetAll", "resetAll", "resetAll"] + self.rnaRds.setFlags("resetAll") + flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True) + self.assertEquals(result, flags) + + + def testResetFlags(self): + rdsEntryList = [("test", "chr1", 1, 100, "+", 1.0, "uniq", "")] + self.rds.insertUniqs(rdsEntryList) + self.rnaRds.insertUniqs(rdsEntryList) + rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""), + ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")] + self.rnaRds.insertMulti(rdsEntryList) + rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "splice", "")] + self.rnaRds.insertSplices(rdsEntryList) + + self.rds.resetFlags() + result = [""] + flags = self.getRDSFlags("1", self.rds) + self.assertEquals(result, flags) + + self.rnaRds.resetFlags() + result = ["", "", ""] + flags = self.getRDSFlags("1", self.rnaRds, doMulti=True) + self.assertEquals(result, flags) + + self.rnaRds.resetFlags() + result = ["", ""] + flags = self.getRDSFlags("1", self.rnaRds, splice=True) + self.assertEquals(result, flags) + + + def testReweighMultireads(self): + rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""), + ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")] + self.rds.insertMulti(rdsEntryList) + readData = ("0.25", "chr1", "201", "testMultiRead") + self.rds.reweighMultireads([readData]) + readDict = self.rds.getReadsDict(withWeight=True, doMulti=True) + read = readDict["1"][0] + self.assertEquals(0.25, read["weight"]) + + + #TODO: write unit test + def 
testSetSynchronousPragma(self): + pass + + + #TODO: write unit test + def testSetDBcache(self): + pass + + + #TODO: write unit test + def testExecute(self): + pass + + + #TODO: write unit test + def testExecuteCommit(self): + pass + + + def testBuildIndex(self): + self.assertFalse(self.rds.hasIndex()) + self.rds.buildIndex() + self.assertTrue(self.rds.hasIndex()) + + + def testDropIndex(self): + self.assertFalse(self.rds.hasIndex()) + self.rds.buildIndex() + self.assertTrue(self.rds.hasIndex()) + self.rds.dropIndex() + self.assertFalse(self.rds.hasIndex()) + + self.assertFalse(self.rnaRds.hasIndex()) + self.rnaRds.buildIndex() + self.assertTrue(self.rnaRds.hasIndex()) + self.rnaRds.dropIndex() + self.assertFalse(self.rnaRds.hasIndex()) + + + #TODO: write unit test + def testMemSync(self): + pass + + + #TODO: write unit test + def testCopyDBEntriesToMemory(self): + pass + + + #TODO: write unit test + def testCopySpliceDBEntriesToMemory(self): + pass + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestReadDataset)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testRnaAToIFilter.py b/test/testRnaAToIFilter.py new file mode 100644 index 0000000..d2fdfa5 --- /dev/null +++ b/test/testRnaAToIFilter.py @@ -0,0 +1,84 @@ +''' +Created on Aug 25, 2010 + +@author: sau +''' +import unittest +from Erange import rnaAToIFilter + + +class TestRnaAToIFilter(unittest.TestCase): + + + def setUp(self): + pass + + + def tearDown(self): + pass + + + def testRnaAToIFilter(self): + snpPropertiesList = [] + self.assertEquals([], rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 7 8 9 10 11 12 13"] + result = [] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"] + result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0\t1\t2\t3\t4\t5\t6\tA-G\t8\t9\t10\t11\t12\tF"] + result = ["0\t1\t2\t3\t4\t5\t6\tA-G\t8\t9\t10\t11\t12\tF"] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 R"] + result = [] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"] + result = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 F"] + result = [] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F", + "0 1 2 3 4 5 6 7 8 9 10 11 12 13" + ] + result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F", + "0 1 2 3 4 5 6 T-C 8 9 10 11 12 R" + ] + result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F", + "0 1 2 3 4 5 6 T-C 8 9 10 11 12 R" + ] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F", + "0 1 2 3 4 5 6 A-G 8 9 10 11 12 F" + ] + result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F", + "0 1 2 3 4 5 6 A-G 8 9 10 11 12 F" + ] + self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList)) + + snpPropertiesList = ["invalid 
entry"] + self.assertRaises(IndexError, rnaAToIFilter.rnaAToIFilter, snpPropertiesList) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestRnaAToIFilter)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testRnaAToIFilter'] + unittest.main() \ No newline at end of file diff --git a/test/testRnaEditing.py b/test/testRnaEditing.py new file mode 100644 index 0000000..5eb4a0d --- /dev/null +++ b/test/testRnaEditing.py @@ -0,0 +1,49 @@ +''' +Created on Aug 23, 2010 + +@author: sau +''' +import unittest +from Erange import rnaEditing + + +class TestRnaEditing(unittest.TestCase): + + + def setUp(self): + pass + + + def tearDown(self): + pass + + + def testGetGenesWithMultipleSNPs(self): + snpList = [] + self.assertEquals([], rnaEditing.getGenesWithMultipleSNPs(snpList)) + + snpList = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp1"], + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp2"], + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp3"] + ] + + result = ["snp3", "snp2", "snp1"] + self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList)) + result = [] + self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList, minCount=2)) + + snpList.append([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp3"]) + result = ["snp3"] + self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList, minCount=2)) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestRnaEditing)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testTranscripts.py b/test/testTranscripts.py new file mode 100644 index 0000000..cf401a7 --- /dev/null +++ b/test/testTranscripts.py @@ -0,0 +1,98 @@ +''' +Created on Oct 4, 2010 + +@author: sau +''' +import unittest +import os +from Erange import transcripts + +inFileName = "testTranscriptsInFile.txt" +outFileName = "testTranscriptsOutFile.txt" + + +class TestTranscripts(unittest.TestCase): + + + def setUp(self): + self.inFile = open(inFileName, "w") + self.inFile.write("line1\t3.5\n") + self.inFile.write("line2\t1.5\n") + self.inFile.write("line3\tpadding\t3.5\n") + self.inFile.close() + + + def tearDown(self): + try: + os.remove(outFileName) + except OSError: + pass + + try: + os.remove(inFileName) + except OSError: + pass + + + def testTranscripts(self): + transcripts.transcripts(inFileName, outFileName) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("line1\t700000.0\t2.3\n", results[0]) + self.assertEquals("line2\t300000.0\t1.0\n", results[1]) + self.assertEquals("line3\t700000.0\t2.3\n", results[2]) + + def testMain(self): + argv = ["transcripts.py", inFileName, outFileName] + transcripts.main(argv) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("line1\t700000.0\t2.3\n", results[0]) + self.assertEquals("line2\t300000.0\t1.0\n", results[1]) + self.assertEquals("line3\t700000.0\t2.3\n", results[2]) + + argv = ["transcripts.py", inFileName, outFileName, "--transcriptome", "400000"] + transcripts.main(argv) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("line1\t1400000.0\t4.7\n", results[0]) + self.assertEquals("line2\t600000.0\t2.0\n", results[1]) + self.assertEquals("line3\t1400000.0\t4.7\n", results[2]) + + argv = 
["transcripts.py", inFileName, outFileName, "--cells", "5e5"] + transcripts.main(argv) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("line1\t700000.0\t4.7\n", results[0]) + self.assertEquals("line2\t300000.0\t2.0\n", results[1]) + self.assertEquals("line3\t700000.0\t4.7\n", results[2]) + + argv = ["transcripts.py", inFileName, outFileName, "--efficiency", "0.15"] + transcripts.main(argv) + output = open(outFileName) + results = output.readlines() + output.close() + self.assertEquals(3, len(results)) + self.assertEquals("line1\t700000.0\t4.7\n", results[0]) + self.assertEquals("line2\t300000.0\t2.0\n", results[1]) + self.assertEquals("line3\t700000.0\t4.7\n", results[2]) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestTranscripts)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/test/testmakebedfromrds.py b/test/testmakebedfromrds.py new file mode 100644 index 0000000..f11ccd6 --- /dev/null +++ b/test/testmakebedfromrds.py @@ -0,0 +1,170 @@ +''' +Created on Jun 4, 2010 + +@author: sau +''' +import unittest +from Erange import makebedfromrds + + +class TestMakeBedFromRds(unittest.TestCase): + + def testGetSenseColor(self): + senseColor = makebedfromrds.getSenseColor('+', .5) + self.assertEqual(senseColor, makebedfromrds.MULTI_PLUS_COLOR, "incorrect color for low weight and plus sense color") + + senseColor = makebedfromrds.getSenseColor('-', .5) + self.assertEqual(senseColor, makebedfromrds.MULTI_MINUS_COLOR, "incorrect color for low weight and non-plus sense") + + senseColor = makebedfromrds.getSenseColor('+', 5) + self.assertEqual(senseColor, makebedfromrds.PLUS_COLOR, "incorrect color for high weight and plus sense") + + senseColor = makebedfromrds.getSenseColor('-', 5) + self.assertEqual(senseColor, makebedfromrds.MINUS_COLOR, "incorrect color for high weight and non-plus sense") + + + def testGetMultiSenseColor(self): + senseColor = makebedfromrds.getMultiSenseColor('+') + self.assertEqual(senseColor, makebedfromrds.MULTI_PLUS_COLOR, "incorrect color for plus sense") + + senseColor = makebedfromrds.getMultiSenseColor('-') + self.assertEqual(senseColor, makebedfromrds.MULTI_MINUS_COLOR, "incorrect color for non-plus sense") + + + def testGetSingleSenseColor(self): + senseColor = makebedfromrds.getSingleSenseColor('+') + self.assertEqual(senseColor, makebedfromrds.PLUS_COLOR, "incorrect color for plus sense") + + senseColor = makebedfromrds.getSingleSenseColor('-') + self.assertEqual(senseColor, makebedfromrds.MINUS_COLOR, "incorrect color for non-plus sense") + + + def testGetReadSizes(self): + numPieces = 3 + startList = [0, 1, 2] + stopList = [3, 4, 5] + readSizes = makebedfromrds.getReadSizes(numPieces, startList, stopList) + self.assertEqual(readSizes, "3,3,3", "incorrect read size list") + + readSizes = makebedfromrds.getReadSizes(1, startList, stopList) + self.assertEquals(readSizes, "3", "incorrect read size list for numPieces=1") + + self.assertRaises(IndexError, makebedfromrds.getReadSizes, numPieces, [], stopList) + self.assertRaises(IndexError, makebedfromrds.getReadSizes, numPieces, startList, []) + self.assertRaises(IndexError, makebedfromrds.getReadSizes, 4, startList, stopList) + + + def testGetReadCoords(self): + numPieces = 3 + startList = [0, 1, 2] + readCoords = makebedfromrds.getReadCoords(numPieces, startList) + 
self.assertEqual(readCoords, "0,1,2", "incorrect read coords list") + + readCoords = makebedfromrds.getReadCoords(1, startList) + self.assertEqual(readCoords, "0", "incorrect read coords list for numPieces=1") + + self.assertRaises(IndexError, makebedfromrds.getReadCoords, numPieces, []) + self.assertRaises(IndexError, makebedfromrds.getReadCoords, 4, startList) + + + def testGetSpliceColor(self): + lpart = 1 + rpart = 2 + leftweight = 0.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1") + self.assertEqual(aColor, makebedfromrds.SPLICE_COLOR, "incorrect first color for hacktype 1 splice") + self.assertEqual(bColor, makebedfromrds.SPLICE_COLOR, "incorrect second color for hacktype 1 splice") + + lpart = 0 + rpart = 0 + leftweight = 1.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1") + self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 left unique") + self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 left unique") + + lpart = 0 + rpart = 0 + leftweight = 0.0 + rightweight = 1.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1") + self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 right unique") + self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 right unique") + + lpart = 0 + rpart = 0 + leftweight = 1.0 + rightweight = 1.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1") + self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 left and right unique") + self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 left and right unique") + + lpart = 0 + rpart = 0 + leftweight = 0.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1") + self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for hacktype 1 multi") + self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for hacktype 1 multi") + + lpart = 1 + rpart = 1 + leftweight = 0.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1") + self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for hacktype 1 lpart + rpart = 2") + self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for hacktype 1 lpart + rpart = 2") + + lpart = 2 + rpart = 0 + leftweight = 0.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight) + self.assertEqual(aColor, makebedfromrds.SPLICE_COLOR, "incorrect first color for left splice") + self.assertEqual(bColor, makebedfromrds.SPLICE_COLOR, "incorrect second color for left splice") + + lpart = 0 + rpart = 0 + leftweight = 1.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight) + self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for left unique") + self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for left unique") + + lpart = 0 + rpart = 0 + leftweight = 0.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, 
rightweight) + self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for multi splice") + self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for multi splice") + + lpart = 1 + rpart = 0 + leftweight = 0.0 + rightweight = 0.0 + aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight) + self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for lpart = 1 multi splice") + self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for lpart = 1 multi splice") + + + def testDoNotOutputChromosome(self): + self.assertTrue(makebedfromrds.doNotOutputChromosome("chrM", True), "chrM is output when enforceChr=True") + self.assertTrue(makebedfromrds.doNotOutputChromosome("chrM", False), "chrM is output when enforceChr=False") + self.assertFalse(makebedfromrds.doNotOutputChromosome("chrAny", True), "chr is not output when enforceChr=True") + self.assertFalse(makebedfromrds.doNotOutputChromosome("chrAny", False), "chr is not output when enforceChr=False") + self.assertTrue(makebedfromrds.doNotOutputChromosome("Bad", True), "bad name chr is output when enforceChr=True") + self.assertFalse(makebedfromrds.doNotOutputChromosome("Bad", False), "bad name chr is not output when enforceChr=True") + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(TestMakeBedFromRds)) + + return suite + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() \ No newline at end of file diff --git a/transcripts.py b/transcripts.py new file mode 100755 index 0000000..53b6aea --- /dev/null +++ b/transcripts.py @@ -0,0 +1,53 @@ +# +# transcripts.py +# ENRAGE +# +# Created by Ali Mortazavi on 1/25/08. +# +""" usage: python %s rpkmFile outFile [--transcriptome size] [--cells count] [--efficiency fraction] + where transcriptome size is in Gbp, cell count is in arbitrary units and efficiency is a fraction +""" + +import sys, optparse + +def main(argv=None): + if not argv: + argv = sys.argv + + print "%prog: version 3.0" + usage = "usage: python %prog rpkmFile outFile [options]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--transcriptome", type="float", dest="tSize", + help="transcriptome size in Gbp [default 200000.0]") + parser.add_option("--cells", type="float", dest="cellCount", + help="arbitrary units [default 1e6]") + parser.add_option("--efficiency", type="float", dest="efficiency", + help="fraction [default 0.3]") + parser.set_defaults(tSize=200000.0, cellCount=1e6, efficiency=0.3) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 2: + print usage + sys.exit(1) + + infile = args[0] + outfile = args[1] + + transcripts(infile, outfile, options.tSize, options.cellCount, options.efficiency) + + +def transcripts(infilename, outfilename, tSize=200000, cellCount=1e6, efficiency=0.3): + infile = open(infilename) + outfile = open(outfilename, "w") + for line in infile: + fields = line.strip().split() + rpkm = float(fields[-1]) + transcripts = rpkm * tSize + transPerCell = transcripts / cellCount / efficiency + outfile.write("%s\t%.1f\t%.1f\n" % (fields[0], transcripts, transPerCell)) + infile.close() + outfile.close() + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/trimreads.py b/trimreads.py new file mode 100755 index 0000000..d246e15 --- /dev/null +++ b/trimreads.py @@ -0,0 +1,115 @@ +# +# trimquery.py +# ENRAGE +# +# Created by Ali Mortazavi on 8/12/08. 
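+#
+# Trims each input read to a fixed length: by default the first <length>
+# bases are kept, or the last <length> bases with --fromback. --flip runs
+# the trimmed sequence through cistematic's complement(), --paired writes
+# the two trimmed ends joined by ":", and --filter maxN drops reads that
+# contain more than maxN N's.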
+# + +import sys, optparse +from cistematic.core import complement + +print "%prog: version 2.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %prog length infile outfile [--fastq] [--fromback] [--paired] [--flip] [--filter maxN]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--fastq", action="store_true", dest="fastq") + parser.add_option("--fromback", action="store_true", dest="fromBack") + parser.add_option("--paired", action="store_true", dest="paired") + parser.add_option("--flip", action="store_true", dest="flipseq") + parser.add_option("--filter", type="int", dest="maxN") + parser.set_defaults(fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + print "\t where paired fragments are separated by a : when given the -paired flag" + sys.exit(1) + + length = int(args[0]) + infile = args[1] + outfile = args[2] + + trimreads(length, infile, outfile, options.fastq, options.fromBack, options.paired, options.flipseq, options.maxN) + + +def trimreads(length, inFileName, outFileName, fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None): + infile = open(inFileName) + outfile = open(outFileName, "w") + + if paired: + pairedlength = 2 * length + index = 0 + + if fromBack: + length = -1 * length + + filtering = False + if maxN is not None: + filtering = True + print "filtering out reads with more than %d Ns" % maxN + else: + maxN = 2 + + print "trimming reads from %s to %d bp and saving them in %s" % (inFileName, length, outFileName) + + filtered = 0 + header = "" + for line in infile: + line = line.strip() + if len(line) == 0: + continue + + firstChar = line[0] + if (not fastq and firstChar == ">") or (fastq and firstChar in ["@", "+"]): + header = line + "\n" + else: + if filtering: + if line.count("N") > maxN: + filtered += 1 + continue + + seq1 = line[length:] + seq2 = line[:length] + if flipseq: + try: + tempseq1 = seq1 + seq1 = complement(tempseq1) + except: + seq1 = tempseq1 + + try: + tempseq2 = seq2 + seq2 = complement(tempseq2) + except: + seq2 = tempseq2 + + if paired: + if len(line) < pairedlength: + continue + + outfile.write("%s%s:%s\n" % (header, seq1, seq2)) + else: + if fromBack: + outfile.write("%s%s\n" % (header, seq1)) + else: + outfile.write("%s%s\n" % (header, seq2)) + + index += 1 + if index % 1000000 == 0: + print ".", + + sys.stdout.flush() + + outfile.close() + print "returned %d reads" % index + if filtering: + print "%d additional reads filtered" % filtered + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/utrChanges.py b/utrChanges.py new file mode 100755 index 0000000..d95d18c --- /dev/null +++ b/utrChanges.py @@ -0,0 +1,91 @@ +# +# utrChanges.py +# ENRAGE +# + +try: + import psyco + psyco.full() +except: + pass + +import sys +from commoncode import getMergedRegions, getLocusByChromDict +from cistematic.genomes import Genome + +print "%s: version 1.3" % sys.argv[0] + + +def main(argv=None): + if not argv: + argv = sys.argv + + if len(argv) < 4: + print "usage: python %s genome acceptedfile outfile" % argv[0] + sys.exit(1) + + genome = argv[1] + acceptfile = argv[2] + outfile = argv[3] + + utrChanges(genome, acceptfile, outfile) + + +def utrChanges(genome, acceptfile, outFileName): + acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + outfile = open(outFileName, "w") + + hg = Genome(genome) + + origLocusByChromDict = 
getLocusByChromDict(hg, keepSense = True) + newLocusByChromDict = getLocusByChromDict(hg, additionalRegionsDict = acceptDict, keepSense = True) + + new3utr = 0 + new5utr = 0 + changedGene = 0 + + for chrom in origLocusByChromDict: + for (gstart, gstop, gid, glen, sense) in origLocusByChromDict[chrom]: + for (newstart, newstop, newgid, newlen, newsense) in newLocusByChromDict[chrom]: + if gid == newgid: + changedBoundary = False + new3p = "F" + new5p = "F" + if newstart < gstart: + if sense == "R": + new3utr += 1 + new3p = "T" + changedBoundary = True + elif sense == "F": + new5utr += 1 + new5p = "T" + changedBoundary = True + else: + print sense + + if newstop > gstop: + if sense == "R": + new5utr += 1 + new5p = "T" + changedBoundary = True + elif sense == "F": + new3utr += 1 + new3p = "T" + changedBoundary = True + else: + print sense + + if changedBoundary: + changedGene += 1 + outfile.write("%s\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\t%s\n" % (gid, chrom, gstart, gstop, sense, chrom, newstart, newstop, new5p, new3p)) + + continue + + outfile.close() + print "%d new 5'utr" % new5utr + print "%d new 3'utr" % new3utr + print "%s affected genes" % changedGene + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file diff --git a/weighMultireads.py b/weighMultireads.py new file mode 100755 index 0000000..ed27edf --- /dev/null +++ b/weighMultireads.py @@ -0,0 +1,300 @@ +# +# weightMultireads.py +# ENRAGE +# + +# Created by Ali Mortazavi on 10/02/08. +# + +try: + import psyco + psyco.full() +except: + pass + +from commoncode import readDataset +import sys, time, string, optparse + +print "%prog: version 3.1" + +def main(argv=None): + if not argv: + argv = sys.argv + + usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]" + + parser = optparse.OptionParser(usage=usage) + parser.add_option("--radius", type="int", dest="radius") + parser.add_option("--noradius", action="store_false", dest="doRadius") + parser.add_option("--usePairs", type="int", dest="pairDist") + parser.add_option("--verbose", action="store_true", dest="verbose") + parser.add_option("--cache", type="int", dest="cachePages") + parser.set_defaults(radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 1: + print usage + sys.exit(1) + + rdsfile = args[0] + + weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages) + + +def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None): + + if radius is not None: + doRadius = True + else: + radius = 100 + + usePairs = False + if pairDist is not None: + usePairs = True + + tooFar = pairDist * 10 + + doCache = False + if cachePages is not None: + doCache = True + else: + cachePages = 1 + + RDS = readDataset(rdsfile, verbose = True, cache=doCache) + readlen = RDS.getReadSize() + halfreadlen = readlen / 2 + + if cachePages > RDS.getDefaultCacheSize(): + RDS.setDBcache(cachePages) + + if verbose: + print time.ctime() + + multiIDs = RDS.getReadIDs(uniqs=False,multi=True) + if verbose: + print "got multiIDs ", time.ctime() + + fixedPair = 0 + fixedReads = [] + if usePairs: + print "doing pairs with pairDist = %d" % pairDist + uidDict = {} + midDict = {} + jointList = [] + bothMultiList = [] + mainIDList = [] + guDict = {} + muDict = {} + + if RDS.dataType == "RNA": + uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=True) + else: + 
uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=False) + + if verbose: + print "got uniqIDs ", time.ctime() + + for readID in uniqIDs: + (mainID, pairID) = readID.split("/") + try: + uidDict[mainID].append(pairID) + except: + uidDict[mainID] = [pairID] + mainIDList.append(mainID) + + if verbose: + print "uidDict all ", len(uidDict), time.ctime() + + for mainID in mainIDList: + if len(uidDict[mainID]) == 2: + del uidDict[mainID] + + if verbose: + print "uidDict first candidates ", len(uidDict), time.ctime() + + for readID in multiIDs: + (frontID, multiplicity) = readID.split("::") + (mainID, pairID) = frontID.split("/") + try: + if pairID not in midDict[mainID]: + midDict[mainID].append(pairID) + except: + midDict[mainID] = [pairID] + + if verbose: + print "all multis ", len(midDict), time.ctime() + + mainIDList = uidDict.keys() + for mainID in mainIDList: + if mainID not in midDict: + del uidDict[mainID] + + if verbose: + print "uidDict actual candidates ", len(uidDict), time.ctime() + + for readID in midDict: + listLen = len(midDict[readID]) + if listLen == 1: + if readID in uidDict: + jointList.append(readID) + elif listLen == 2: + bothMultiList.append(readID) + + if verbose: + print "joint ", len(jointList), time.ctime() + print "bothMulti ", len(bothMultiList), time.ctime() + + del uidDict + del midDict + del mainIDList + del uniqIDs + + uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True) + if verbose: + print "got uniq dict ", len(uniqDict), time.ctime() + + if RDS.dataType == "RNA": + spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True) + if verbose: + print "got splice dict ", len(spliceDict), time.ctime() + + for readID in jointList: + try: + guDict[readID] = uniqDict[readID][0] + except: + if RDS.dataType == "RNA": + guDict[readID] = spliceDict[readID][0] + + del uniqDict + del spliceDict + if verbose: + print "guDict actual ", len(guDict), time.ctime() + + multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True) + if verbose: + print "got multi dict ", len(multiDict), time.ctime() + + for readID in jointList: + muDict[readID] = multiDict[readID] + + for readID in bothMultiList: + muDict[readID] = multiDict[readID] + + del multiDict + if verbose: + print "muDict actual ", len(muDict), time.ctime() + + RDS.setSynchronousPragma("OFF") + for readID in jointList: + try: + (ustart, uchrom, upair) = guDict[readID] + ustop = ustart + readlen + except: + (ustart, lstop, rstart, ustop, uchrom, upair) = guDict[readID] + + muList = muDict[readID] + muLen = len(muList) + bestMatch = [tooFar] * muLen + found = False + for index in range(muLen): + (mstart, mchrom, mpair) = muList[index] + if uchrom != mchrom: + continue + + if abs(mstart - ustart) < pairDist: + bestMatch[index] = abs(mstart - ustart) + found = True + elif abs(mstart - ustop) < pairDist: + bestMatch[index] = abs(mstart - ustop) + found = True + + if found: + theMatch = -1 + theDist = tooFar + reweighList = [] + for index in range(muLen): + if theDist > bestMatch[index]: + theMatch = index + theDist = bestMatch[index] + + theID = string.join([readID, mpair], "/") + for index in range(muLen): + if index == theMatch: + score = 1 - (muLen - 1) / (100. * (muLen)) + else: + score = 1 / (100. 
* muLen) + + start = muList[index][0] + chrom = "chr%s" % muList[index][1] + reweighList.append((round(score,3), chrom, start, theID)) + + if theMatch > 0: + RDS.reweighMultireads(reweighList) + fixedPair += 1 + if verbose and fixedPair % 10000 == 1: + print "fixed %d" % fixedPair + print guDict[readID] + print muDict[readID] + print reweighList + + fixedReads.append(theID) + + RDS.setSynchronousPragma("ON") + + del guDict + del muDict + print "fixed %d pairs" % fixedPair + print time.ctime() + + skippedReads = 0 + if doRadius: + print "doing uniq read radius with radius = %d" % radius + multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True) + print "got multiDict" + RDS.setSynchronousPragma("OFF") + rindex = 0 + for readID in multiIDs: + theID = readID + if theID in fixedReads: + skippedReads += 1 + continue + + if "::" in readID: + (readID, multiplicity) = readID.split("::") + + scores = [] + coords = [] + for read in multiDict[readID]: + (start, weight, rID, chrom) = read + achrom = "chr%s" % chrom + regionStart = start + halfreadlen - radius + regionStop = start + halfreadlen + radius + uniqs = RDS.getCounts(achrom, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True) + scores.append(uniqs + 1) + coords.append((achrom, start, theID)) + + total = float(sum(scores)) + reweighList = [] + for index in range(len(scores)): + reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2])) + + RDS.reweighMultireads(reweighList) + rindex += 1 + if rindex % 10000 == 0: + print rindex + + RDS.setSynchronousPragma("ON") + if verbose: + print "skipped ", skippedReads + + print "reweighted ", rindex + + if doCache: + RDS.saveCacheDB(rdsfile) + + if verbose: + print "finished", time.ctime() + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file
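
Note: the values asserted in test/testTranscripts.py follow directly from the arithmetic in transcripts.transcripts(): the transcript estimate is rpkm multiplied by the transcriptome size, and the per-cell value divides that by the cell count and the efficiency. The standalone sketch below is not part of ERANGE (the function name rpkmToTranscripts is illustrative); it simply mirrors that calculation with the default parameters so the expected test rows are easy to verify.

# Standalone sketch of the transcripts.py arithmetic; mirrors
# transcripts.transcripts() with its default parameters, not ERANGE code.
def rpkmToTranscripts(rpkm, tSize=200000.0, cellCount=1e6, efficiency=0.3):
    transcriptEstimate = rpkm * tSize                       # total transcript estimate
    perCell = transcriptEstimate / cellCount / efficiency   # per-cell estimate
    return transcriptEstimate, perCell

if __name__ == "__main__":
    for rpkm in (3.5, 1.5):
        total, perCell = rpkmToTranscripts(rpkm)
        # 3.5 -> 700000.0 and 2.3; 1.5 -> 300000.0 and 1.0, matching the
        # rows expected by TestTranscripts.testTranscripts
        print "%.1f\t%.1f\t%.1f" % (rpkm, total, perCell)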