--- /dev/null
+"""
+MakeBamFromRds
+
+Converts an ERANGE RDS file (zero-based coordinates) to BAM format (zero-based coordinates).
+
+Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
+
+"""
+
+# psyco is optional: any error while importing or enabling it is ignored
+# and the script simply runs without it.
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import re
+import optparse
+import random
+import pysam
+from commoncode import readDataset
+
+
+def main(argv=None):
+    """ command line entry point: parses the options and positional
+        arguments and hands them to makeBamFromRds().
+
+        argv is the complete argument vector (program name first);
+        sys.argv is used when it is None or empty.  Prints the usage
+        string and exits with status 1 when fewer than two positional
+        arguments (rdsFile, bamFile) are given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    verstring = "MakeBamFromRds: version 1.0"
+    print verstring
+
+    usage = "usage: python %prog rdsFile bamFile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", action="store_true", dest="doPairs")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    parser.add_option("--fasta", dest="fastaFileName")
+    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
+                        doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
+                        doCache=False, cachePages=100000, fastaFileName="",
+                        chromList=[])
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    rdsfile = args[0]
+    outfilename = args[1]
+
+    # an explicit --chrom list restricts the output to those chromosomes
+    allChrom = not options.chromList
+
+    # pass the parsed --pairs value; a hard-coded False used to be passed
+    # here, which turned the option into a no-op
+    makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
+                   options.doSplices, options.doPairs, options.withFlag, options.useFlagLike,
+                   options.enforceChr, allChrom, options.doCache, options.cachePages,
+                   options.chromList, options.fastaFileName)
+
+
+def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
+                   doSplices=False, doPairs=False, withFlag="",
+                   useFlagLike=False, enforceChr=False, allChrom=True,
+                   doCache=False, cachePages=100000, chromList=None, fastaFileName=""):
+    """ writes the reads of the RDS file rdsfile to the BAM file outfilename.
+
+        withUniqs / withMulti / doSplices select which read tables are
+        written; the function prints a message and exits with status 1
+        when all three are off.  withFlag and useFlagLike are passed on
+        as the read flag filter and doPairs as the withPairID request.
+        When allChrom is true the chromosome list is read from the
+        dataset, otherwise chromList is used; the caller's list is never
+        modified.  Chromosomes rejected by doNotOutputChromosome() are
+        left out of both the header and the output.  fastaFileName, when
+        given, supplies the sequences used to assign the sense of
+        spliced reads.
+    """
+    if not withUniqs and not withMulti and not doSplices:
+        print "must be outputting at least one of uniqs, multi, or -splices - exiting"
+        sys.exit(1)
+
+    print "\nsample:"
+    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+
+    if cachePages > RDS.getDefaultCacheSize():
+        RDS.setDBcache(cachePages)
+
+    readlength = RDS.getReadSize()
+
+    if allChrom:
+        if withUniqs:
+            chromList = RDS.getChromosomes()
+        elif withMulti:
+            chromList = RDS.getChromosomes(table="multi")
+        else:
+            chromList = RDS.getChromosomes(table="splices")
+
+    # sorted() works on a copy, so a list handed in by the caller stays untouched
+    chromList = sorted(chromList or [])
+
+    fastaSequenceDict = {}
+    if fastaFileName:
+        fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
+
+    referenceSequenceList = []
+    outputChromList = []
+    for chromosome in chromList:
+        if doNotOutputChromosome(chromosome, enforceChr):
+            continue
+
+        # NOTE(review): LN is taken from RDS.getMaxCoordinate(); confirm that
+        # this value covers the full extent of the last read on the chromosome
+        chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
+        referenceSequenceList.append({"LN": int(chromosomeLength), "SN": str(chromosome)})
+        outputChromList.append(chromosome)
+
+    header = {"HD": {"VN": "1.0"}}
+    if referenceSequenceList:
+        header["SQ"] = referenceSequenceList
+
+    outfile = pysam.Samfile(outfilename, "wb", header=header)
+
+    totalWrites = 0
+    noncanonicalSplices = 0
+    try:
+        for chrom in outputChromList:
+            index = 0
+            print "chromosome %s" % (chrom)
+            if withUniqs or withMulti:
+                hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
+                                           withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
+                                           flagLike=useFlagLike, entryDict=True)
+
+                # a chromosome can come back without an entry (for example
+                # when the flag filter matches nothing); treat that as "no reads"
+                for read in hitDict.get(chrom, []):
+                    writeBAMEntry(outfile, chrom, read, readlength)
+                    index += 1
+
+            if doSplices:
+                numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
+                index += numSpliceReadsWritten
+                noncanonicalSplices += noncanonical
+
+            print index
+            totalWrites += index
+    finally:
+        # always close the BAM file, even when writing fails part way through
+        outfile.close()
+
+    print "%d total reads written" % totalWrites
+    print "%d non-canonical splices" % noncanonicalSplices
+
+
+def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
+    """ writes the spliced reads of chrom to outfile.
+
+        When fastaSequenceDict holds a sequence for chrom, the sense of
+        each read is first replaced by the one fixSpliceSense() derives
+        from the splice junction.  Returns the tuple (number of reads
+        written, number of non-canonical junctions seen).
+    """
+    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
+    writtenCount = 0
+    noncanonicalCount = 0
+    if chrom in spliceDict:
+        # the sequence dictionary is not modified below, so look it up once
+        haveSequence = chrom in fastaSequenceDict
+        for spliceRead in spliceDict[chrom]:
+            if haveSequence:
+                fixedSense, isNoncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, spliceRead["startR"], spliceRead["stopL"], spliceRead["sense"])
+                spliceRead["sense"] = fixedSense
+                noncanonicalCount += isNoncanonical
+
+            writeBAMEntry(outfile, chrom, spliceRead, readlength)
+            writtenCount += 1
+
+    return writtenCount, noncanonicalCount
+
+
+def writeBAMEntry(outfile, chrom, outputDict, readlength):
+    """ builds a pysam.AlignedRead from the read dictionary outputDict and
+        writes it to outfile.
+
+        A dictionary with a "startL" key is written as a spliced read
+        (match / skip / match cigar plus an XS tag holding the sense);
+        any other dictionary is written as a single match of readlength
+        bases starting at "start".  A "pairID" of "1" or "2" sets the
+        read1 / read2 and proper-pair flags, and a non-empty "mismatch"
+        entry is converted with getMismatches() and stored as the MD tag.
+    """
+    alignedRead = pysam.AlignedRead()
+    alignedRead.qname = outputDict["readID"]
+    if outputDict["sense"] == "-":
+        alignedRead.is_reverse = True
+
+    alignedRead.rname = outfile.references.index(chrom)
+
+    tags = []
+    if "startL" in outputDict:
+        leftStart = outputDict["startL"]
+        leftStop = outputDict["stopL"]
+        rightStart = outputDict["startR"]
+        rightStop = outputDict["stopR"]
+        alignedRead.pos = leftStart
+        # cigar operations: 0 is a match, 3 is the skipped (intron) region
+        alignedRead.cigar = [(0, leftStop - leftStart + 1), (3, rightStart - leftStop - 1), (0, rightStop - rightStart + 1)]
+        tags.append(("XS", outputDict["sense"]))
+    else:
+        alignedRead.pos = outputDict["start"]
+        alignedRead.cigar = [(0, readlength)]
+
+    # a missing pairID yields None, which matches neither mate number
+    pairID = outputDict.get("pairID")
+    if pairID == "1":
+        alignedRead.is_read1 = True
+        alignedRead.is_proper_pair = True
+    elif pairID == "2":
+        alignedRead.is_read2 = True
+        alignedRead.is_proper_pair = True
+
+    if "mismatch" in outputDict:
+        mismatchTag = getMismatches(outputDict["mismatch"])
+        if mismatchTag:
+            tags.append(("MD", mismatchTag))
+
+    if tags:
+        alignedRead.tags = tags
+
+    outfile.write(alignedRead)
+
+
+def getMismatches(mismatchString):
+    """ converts an RDS mismatch string into the tag text written to the
+        BAM file.
+
+        Every nucleotide ([ACGTN]) that is directly followed by a number
+        contributes "<number><nucleotide>" to the result, in order of
+        appearance; "A5G,C10T" gives "5A10C".  Text that does not match
+        that pattern is ignored, so malformed input yields a shorter (or
+        empty) string instead of an IndexError or mis-paired entries.
+    """
+    # take the nucleotide and its position from the same match so that the
+    # two can never get out of step
+    entries = re.findall(r"([ACGTN])(\d+)", mismatchString)
+
+    return "".join(["%s%s" % (position, nucleotide) for (nucleotide, position) in entries])
+
+
+def doNotOutputChromosome(chrom, enforceChr):
+    """ returns True when chrom must be left out of the output: it is
+        "chrM", or enforceChr is set and the name does not contain "chr".
+    """
+    if chrom == "chrM":
+        return True
+
+    if enforceChr and "chr" not in chrom:
+        return True
+
+    return False
+
+
+def getFastaSequenceDictionary(fastaFileName):
+    """ reads the FASTA file fastaFileName and returns a dictionary that
+        maps each header (the text after ">", without the line ending)
+        to its sequence with all line breaks removed.
+
+        Every record is returned, including the last one in the file.
+        Records with an empty header are skipped.
+    """
+    fastaSeqDict = {}
+    fchrom = ""
+    seqParts = []
+
+    fastafile = open(fastaFileName)
+    try:
+        for line in fastafile:
+            if line.startswith(">"):
+                if fchrom != "":
+                    fastaSeqDict[fchrom] = "".join(seqParts)
+
+                seqParts = []
+                fchrom = line[1:].rstrip("\r\n")
+            else:
+                # collect the pieces and join once per record instead of
+                # repeatedly extending one long string
+                seqParts.append(line.strip())
+    finally:
+        fastafile.close()
+
+    # the final record has no following header line to trigger the store above
+    if fchrom != "":
+        fastaSeqDict[fchrom] = "".join(seqParts)
+
+    return fastaSeqDict
+
+
+def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
+    """ determines the sense of a spliced read from the two bases at each
+        end of its intron in fastaSequence.
+
+        The intron is taken to run from stopLeft up to (not including)
+        startRight.  Returns the tuple (sense, noncanonical): for a known
+        junction signature sense comes from the lookup table and
+        noncanonical is 0; otherwise sense is picked at random from "+"
+        and "-" and noncanonical is 1.
+    """
+    plusSignatures = ("GTAG", "GCAG", "ATAC")
+    minusSignatures = ("CTAC", "CTGC", "GTAT")
+
+    donorSite = fastaSequence[stopLeft:stopLeft + 2]
+    acceptorSite = fastaSequence[startRight - 2:startRight]
+    signature = (donorSite + acceptorSite).upper()
+
+    if signature in plusSignatures:
+        return "+", 0
+
+    if signature in minusSignatures:
+        return "-", 0
+
+    candidates = ["+", "-"]
+    random.shuffle(candidates)
+
+    return candidates[0], 1
+
+
+# run the converter only when this file is executed as a script
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+"""
+MakeRdsFromBam
+
+Created on Jun 3, 2010
+
+@author: sau
+"""
+
+# psyco is optional: any error while importing or enabling it is ignored
+# and the script simply runs without it.
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, string, optparse, re
+import pysam
+from commoncode import readDataset, writeLog
+
+verstring = "%prog: version 1.0"
+
+
+def main(argv=None):
+    """ command line entry point: parses the options and the three
+        positional arguments (label, samfile, outrdsfile) and hands them
+        to makeRdsFromBam().
+
+        argv is the complete argument vector (program name first);
+        sys.argv is used when it is None or empty.  Prints a message
+        naming the first missing positional argument and exits with
+        status 1 when fewer than three are given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    print verstring
+
+    usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options] \ninput reads must be sorted to properly record multireads"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
+                      help="input file is in sam format")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    # the closing bracket was missing from this help text
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000]")
+    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
+                      help="multi counts over this value are discarded [default: 10]")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
+                      help="use the raw read names")
+    parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
+                        maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    # report the first positional argument that is missing
+    if len(args) < 3:
+        missingArgument = ("label", "samfile", "outrdsfile")[len(args)]
+        print "no %s specified - see --help for usage" % missingArgument
+        sys.exit(1)
+
+    (label, samFileName, outDbName) = args[:3]
+
+    makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
+                   options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
+
+
+def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
+                   cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
+    """ loads the mapped reads of the SAM/BAM file samFileName into the RDS
+        file outDbName.
+
+        label prefixes the stored read names unless trimReadID is off, in
+        which case the raw read names are used.  init creates a new RDS
+        file, otherwise reads are appended.  useSamFile opens the input
+        as text SAM instead of BAM.  rnaDataType selects an RNA dataset,
+        the only kind for which spliced reads are stored.  Reads seen
+        more than once within a batch are stored as multireads;
+        multireads whose count exceeds maxMultiReadCount are discarded.
+        doIndex builds the RDS index at the end, and cachePages is used
+        as the database cache size when it exceeds the default.  Any
+        command line argument of the form name::value is stored as
+        metadata.  Exits with status 1 when pysam raises ValueError while
+        opening the input.
+    """
+    if useSamFile:
+        fileMode = "r"
+    else:
+        fileMode = "rb"
+
+    try:
+        samfile = pysam.Samfile(samFileName, fileMode)
+    except ValueError:
+        print "samfile index not found"
+        sys.exit(1)
+
+    if rnaDataType:
+        dataType = "RNA"
+    else:
+        dataType = "DNA"
+
+    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
+
+    rds = readDataset(outDbName, init, dataType, verbose=True)
+    if not init and doIndex:
+        # dropping a stale index is best effort; the index is rebuilt below
+        try:
+            if rds.hasIndex():
+                rds.dropIndex()
+        except Exception:
+            pass
+
+    if "sam_mapped" not in rds.getMetadata():
+        rds.insertMetadata([("sam_mapped", "True")])
+
+    defaultCacheSize = rds.getDefaultCacheSize()
+
+    if cachePages > defaultCacheSize:
+        if init:
+            rds.setDBcache(cachePages, default=True)
+        else:
+            rds.setDBcache(cachePages)
+
+    propertyList = []
+    for arg in sys.argv:
+        if "::" in arg:
+            # split on the first "::" only so that a value may itself contain "::"
+            (pname, pvalue) = arg.strip().split("::", 1)
+            propertyList.append((pname, pvalue))
+
+    if len(propertyList) > 0:
+        rds.insertMetadata(propertyList)
+
+    countReads = {"unmapped": 0,
+                  "total": 0,
+                  "unique": 0,
+                  "multi": 0,
+                  "multiDiscard": 0,
+                  "splice": 0
+    }
+
+    readsize = 0
+    insertSize = 100000
+
+    processedEntryDict = {}
+    uniqueReadDict = {}
+    multiReadDict = {}
+    spliceReadDict = {}
+
+    samFileIterator = samfile.fetch(until_eof=True)
+
+    for read in samFileIterator:
+        if read.is_unmapped:
+            countReads["unmapped"] += 1
+            continue
+
+        if readsize == 0:
+            # CIGAR operations M/match and D/del; N/ref_skip is left out so
+            # that a spliced first read does not add its intron to the read size
+            take = (0, 2)
+            readsize = sum([length for op,length in read.cigar if op in take])
+            if init:
+                rds.insertMetadata([("readsize", readsize)])
+
+        #Build the read dictionaries
+        try:
+            readSequence = read.seq
+        except KeyError:
+            readSequence = ""
+
+        pairReadSuffix = getPairedReadNumberSuffix(read)
+        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
+        if trimReadID:
+            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
+        else:
+            rdsEntryName = read.qname
+
+        if readName in processedEntryDict:
+            # a read seen again is a multiread: it must not stay in the
+            # unique or splice dictionary, whichever one it was filed under
+            spliceReadDict.pop(readName, None)
+            uniqueReadDict.pop(readName, None)
+
+            # NOTE(review): the count starts at 1 on the second sighting, so
+            # it is one less than the number of alignments seen - confirm
+            # that this is the intended weight and discard threshold
+            if readName in multiReadDict:
+                (storedRead, priorCount, storedEntryName) = multiReadDict[readName]
+                multiReadDict[readName] = (storedRead, priorCount + 1, storedEntryName)
+            else:
+                multiReadDict[readName] = (read, 1, rdsEntryName)
+        else:
+            processedEntryDict[readName] = ""
+            if isSpliceEntry(read.cigar):
+                spliceReadDict[readName] = (read,rdsEntryName)
+            else:
+                uniqueReadDict[readName] = (read, rdsEntryName)
+
+        # flush once per insertSize reads; the very first read is skipped so
+        # that it is not written before its other alignments can be seen
+        if countReads["total"] > 0 and countReads["total"] % insertSize == 0:
+            _insertPendingReads(rds, samfile, uniqueReadDict, multiReadDict, spliceReadDict,
+                                readsize, maxMultiReadCount, dataType, countReads)
+            uniqueReadDict = {}
+            multiReadDict = {}
+            # reset for every data type so that nothing piles up or is recounted
+            spliceReadDict = {}
+
+            print ".",
+            sys.stdout.flush()
+            processedEntryDict = {}
+
+        countReads["total"] += 1
+
+    # write whatever is left after the last full batch; this includes the
+    # multireads, which used to be counted twice but never inserted
+    _insertPendingReads(rds, samfile, uniqueReadDict, multiReadDict, spliceReadDict,
+                        readsize, maxMultiReadCount, dataType, countReads)
+    samfile.close()
+
+    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
+    countString += "\t%d unique reads" % countReads["unique"]
+    countString += "\t%d multi reads" % countReads["multi"]
+    countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
+    if dataType == "RNA":
+        countString += "\t%d spliced reads" % countReads["splice"]
+
+    print countString.replace("\t", "\n")
+
+    writeLog("%s.log" % outDbName, verstring, countString)
+
+    if doIndex:
+        print "building index...."
+        if cachePages > defaultCacheSize:
+            rds.setDBcache(cachePages)
+            rds.buildIndex(cachePages)
+        else:
+            rds.buildIndex(defaultCacheSize)
+
+
+def _insertPendingReads(rds, samfile, uniqueReadDict, multiReadDict, spliceReadDict,
+                        readsize, maxMultiReadCount, dataType, countReads):
+    """ inserts the pending unique, multi and (for RNA only) spliced reads
+        into rds and adds the numbers handled to countReads in place.
+    """
+    uniqueInsertList = []
+    for (readData, rdsEntryName) in uniqueReadDict.values():
+        chrom = samfile.getrname(readData.rname)
+        uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
+
+    countReads["unique"] += len(uniqueInsertList)
+
+    multiInsertList = []
+    for (readData, count, rdsEntryName) in multiReadDict.values():
+        if count > maxMultiReadCount:
+            countReads["multiDiscard"] += 1
+        else:
+            chrom = samfile.getrname(readData.rname)
+            multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
+
+    countReads["multi"] += len(multiInsertList)
+
+    spliceInsertList = []
+    if dataType == "RNA":
+        for (readData, rdsEntryName) in spliceReadDict.values():
+            chrom = samfile.getrname(readData.rname)
+            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
+
+        countReads["splice"] += len(spliceInsertList)
+
+    if uniqueInsertList:
+        rds.insertUniqs(uniqueInsertList)
+
+    if multiInsertList:
+        rds.insertMulti(multiInsertList)
+
+    if spliceInsertList:
+        rds.insertSplices(spliceInsertList)
+
+
+def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
+    """ returns the tuple (readName, chrom, start, stop, sense, weight,
+        flag, mismatches) for alignedRead.
+
+        stop is start + readSize, the stored weight is 1.0/weight and the
+        flag is always the empty string.  mismatches is built from the
+        read's MD tag and is empty when a KeyError is raised while
+        fetching or converting it.
+    """
+    readStart = int(alignedRead.pos)
+    readStop = int(readStart + readSize)
+    sense = getReadSense(alignedRead.is_reverse)
+
+    mismatches = ""
+    try:
+        mismatches = getMismatches(alignedRead.opt("MD"), alignedRead.seq, sense)
+    except KeyError:
+        pass
+
+    return (readName, chrom, readStart, readStop, sense, 1.0/weight, '', mismatches)
+
+
+def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
+    """ returns the tuple (readName, chrom, startL, stopL, startR, stopR,
+        sense, weight, flag, mismatches) for a spliced alignedRead.
+
+        Name, chromosome, start, sense and mismatches come from
+        getRDSEntry(); the block boundaries come from getSpliceBounds().
+        The weight is always 1.0 and the flag is always empty.
+    """
+    entry = getRDSEntry(alignedRead, readName, chrom, readSize)
+    (entryName, entryChrom, start) = entry[:3]
+    sense = entry[4]
+    mismatches = entry[7]
+    startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar)
+
+    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
+
+
+def getPairedReadNumberSuffix(read):
+    """ returns "/1" or "/2" for the first or second read of a pair as
+        judged by isPairedRead(), and the empty string otherwise.
+    """
+    if not isPairedRead(read):
+        return ""
+
+    if read.is_read1:
+        return "/1"
+
+    if read.is_read2:
+        return "/2"
+
+    return ""
+
+
+def isPairedRead(read):
+    """ returns a true value when read is flagged as a proper pair and as
+        either the first or the second read of that pair.
+    """
+    properPair = read.is_proper_pair
+    if not properPair:
+        return properPair
+
+    return read.is_read1 or read.is_read2
+
+
+def isSpliceEntry(cigarTupleList):
+    """ returns True when any (operation, length) pair in cigarTupleList
+        has operation 3, False otherwise.
+    """
+    return any(operation == 3 for (operation, length) in cigarTupleList)
+
+
+def getReadSense(reverse):
+    """ returns "-" when reverse is true and "+" otherwise.
+    """
+    return "-" if reverse else "+"
+
+
+def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
+    """ converts the MD tag mismatchTag into a comma separated list of
+        "<mismatch><position><nucleotide>" entries with 1-based positions.
+
+        The trailing nucleotide is read from querySequence at the
+        mismatch position, or is "N" when no sequence is given.  For
+        sense "-" both nucleotides are complemented.  Deletions ("^...")
+        produce no entry.  Returns the empty string as soon as a position
+        lies outside querySequence; with logErrors set that event is
+        first written to MakeRdsFromBamError.log.
+    """
+    runLengths = re.findall("\d+", mismatchTag)
+    mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
+
+    entries = []
+    position = 0
+    # zip stops at the shorter list, which is the list of mismatches
+    for (runLength, mismatch) in zip(runLengths, mismatchSequences):
+        position += int(runLength)
+        if mismatch.startswith("^"):
+            continue
+
+        try:
+            genomicNucleotide = "N"
+            if querySequence:
+                genomicNucleotide = querySequence[position]
+
+            if sense == "-":
+                mismatch = getComplementNucleotide(mismatch)
+                genomicNucleotide = getComplementNucleotide(genomicNucleotide)
+
+            entries.append("%s%d%s" % (mismatch, int(position + 1), genomicNucleotide))
+            position += 1
+        except IndexError:
+            if logErrors:
+                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
+                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)
+
+            return ""
+
+    return ",".join(entries)
+
+
+def getComplementNucleotide(nucleotide):
+    """ returns the complement of the upper case nucleotide A, C, G, T
+        or N (N maps to itself); any other value raises KeyError.
+    """
+    complement = dict(zip("ATCGN", "TAGCN"))
+
+    return complement[nucleotide]
+
+
+def getSpliceBounds(start, readsize, cigarTupleList):
+    """ returns the tuple (startL, startR, stopL, stopR) describing the two
+        aligned blocks around the first skipped region (operation 3) in
+        cigarTupleList, or None when the cigar has no skipped region.
+
+        startL is start, stopL is start plus the lengths of the
+        operations before the skip, startR is stopL plus the length of
+        the skip, and stopR is startR plus the part of readsize that the
+        left block did not use.
+    """
+    offset = 0
+
+    for operation,length in cigarTupleList:
+        if operation == 3:
+            stopL = int(start + offset)
+            startR = int(stopL + length)
+            # measure the right block from startR; start + readsize ignores
+            # the skipped region and puts stopR before startR
+            stopR = int(startR + readsize - offset)
+
+            return start, startR, stopL, stopR
+
+        offset += length
+
+    return None
+
+
+# run the converter only when this file is executed as a script
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+"""
+Created on Jul 1, 2010
+
+@author: sau
+"""
+
+import sqlite3 as sqlite
+import string
+import tempfile
+import shutil
+import os
+from os import environ
+from array import array
+from commoncode import getReverseComplement
+
+# use CISTEMATIC_TEMP as the temporary directory when it is set to a
+# non-empty value, and /tmp otherwise
+cisTemp = environ.get("CISTEMATIC_TEMP") or "/tmp"
+
+currentRDSVersion = "1.1"
+tempfile.tempdir = cisTemp
+
+
+class ReadDatasetError(Exception):
+    """ raised by ReadDataset when it is initialized with an unknown
+        dataset type or when the readsize metadata is missing.
+    """
+
+
+class ReadDataset():
+ """ Class for storing reads from experiments. Assumes that custom scripts
+ will translate incoming data into a format that can be inserted into the
+ class using the insert* methods. Default class subtype ('DNA') includes
+ tables for unique and multireads, whereas 'RNA' subtype also includes a
+ splices table.
+ """
+
+ def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
+ cache=False, reportCount=True):
+ """ creates an rds datafile if initialize is set to true, otherwise
+ will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
+
+ With cache set, the datafile is first copied with cacheDB() and the
+ copy is opened instead. With verbose set, the dataset name, its
+ metadata, the default cache size and the index status are printed;
+ reportCount controls the read count query (see the review note below).
+ Raises ReadDatasetError when initialize is set and datasetType is
+ neither 'DNA' nor 'RNA'.
+ """
+ # placeholders; the empty strings are replaced once connections exist
+ self.dbcon = ""
+ self.memcon = ""
+ self.dataType = ""
+ self.rdsVersion = currentRDSVersion
+ self.memBacked = False
+ self.memChrom = ""
+ self.memCursor = ""
+ self.cachedDBFile = ""
+
+ # open either the cached copy or the datafile itself
+ if cache:
+ if verbose:
+ print "caching ...."
+
+ self.cacheDB(datafile)
+ dbFile = self.cachedDBFile
+ else:
+ dbFile = datafile
+
+ self.dbcon = sqlite.connect(dbFile)
+ # sqlite.Row lets result columns be read by name, e.g. row["name"]
+ self.dbcon.row_factory = sqlite.Row
+ self.dbcon.execute("PRAGMA temp_store = MEMORY")
+ # a new dataset gets its tables created; an existing one supplies its
+ # data type through the metadata table
+ if initialize:
+ if datasetType not in ["DNA", "RNA"]:
+ raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
+ else:
+ self.dataType = datasetType
+
+ self.initializeTables(self.dbcon)
+ else:
+ metadata = self.getMetadata("dataType")
+ self.dataType = metadata["dataType"]
+
+ # read the stored rdsVersion; when that fails for any reason, try to
+ # store the current version, and fall back to "pre-1.0" on IOError
+ # NOTE(review): the original nesting of this try block was lost with
+ # the indentation - confirm whether it also runs for new datasets
+ try:
+ metadata = self.getMetadata("rdsVersion")
+ self.rdsVersion = metadata["rdsVersion"]
+ except:
+ try:
+ self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
+ except IOError:
+ print "could not add rdsVersion - read-only ?"
+ self.rdsVersion = "pre-1.0"
+
+ if verbose:
+ if initialize:
+ print "INITIALIZED dataset %s" % datafile
+ else:
+ print "dataset %s" % datafile
+
+ # print every metadata entry, sorted by name
+ metadata = self.getMetadata()
+ print "metadata:"
+ pnameList = metadata.keys()
+ pnameList.sort()
+ for pname in pnameList:
+ print "\t" + pname + "\t" + metadata[pname]
+
+ # NOTE(review): presumably everything from here on is still inside
+ # "if verbose:" - confirm against the unflattened source
+ if reportCount:
+ ucount = self.getUniqsCount()
+ mcount = self.getMultiCount()
+ # counts are printed only for existing datasets; the %s form is the
+ # fallback when a count cannot be converted to int
+ if self.dataType == "DNA" and not initialize:
+ try:
+ print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
+ except ValueError:
+ print "\n%s unique reads and %s multireads" % (ucount, mcount)
+ elif self.dataType == "RNA" and not initialize:
+ scount = self.getSplicesCount()
+ try:
+ print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
+ except ValueError:
+ print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
+
+ print "default cache size is %d pages" % self.getDefaultCacheSize()
+ if self.hasIndex():
+ print "found index"
+ else:
+ print "not indexed"
+
+
+ def __len__(self):
+     """ return the number of usable reads in the dataset: unique reads
+         plus multireads, plus spliced reads for an RNA dataset.
+     """
+     readCount = self.getUniqsCount() + self.getMultiCount()
+     if self.dataType == "RNA":
+         readCount += self.getSplicesCount()
+
+     return int(readCount)
+
+
+ def __del__(self):
+     """ remove the cached copy of the dataset, if one was made.
+     """
+     if self.cachedDBFile == "":
+         return
+
+     self.uncacheDB()
+
+
+ def cacheDB(self, filename):
+     """ copy the file filename to a new, uniquely named '.db' file in
+         the temporary directory and remember its path in
+         self.cachedDBFile.
+     """
+     # mkstemp creates the file itself, which avoids the window between
+     # picking a name and creating the file that mktemp leaves open
+     (handle, cachedFileName) = tempfile.mkstemp(suffix=".db")
+     os.close(handle)
+     self.cachedDBFile = cachedFileName
+     shutil.copyfile(filename, self.cachedDBFile)
+
+
+ def saveCacheDB(self, filename):
+     """ copy the cached dataset file (self.cachedDBFile) to filename.
+     """
+     cachedFileName = self.cachedDBFile
+     shutil.copyfile(cachedFileName, filename)
+
+
+ def uncacheDB(self):
+     """ delete the cached copy of the dataset, if there is one, and
+         forget its path.  A failure to delete the file is reported on
+         standard output and otherwise ignored.
+     """
+     if self.cachedDBFile != "":
+         try:
+             os.remove(self.cachedDBFile)
+         except OSError:
+             print "could not delete %s" % self.cachedDBFile
+
+         # clear the attribute that cacheDB() and __del__() actually use, so
+         # that the file is not removed a second time later on
+         self.cachedDBFile = ""
+
+
+ def attachDB(self, filename, asname):
+     """ attach the database file filename to the readDataset under the
+         schema name asname.
+     """
+     self.execute("attach '%s' as %s" % (filename, asname))
+
+
+ def detachDB(self, asname):
+     """ detach the database file that was attached to the readDataset
+         under the schema name asname.
+     """
+     self.execute("detach %s" % asname)
+
+
+ def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+     """ copy rows of table from the attached database asname into the
+         table of the same name in the current RDS and commit.
+
+         ascolumns is the column list selected from the source (default
+         all) and destcolumns the column specification of the
+         destination (default all).  A non-empty flagged restricts the
+         copy to rows whose flag equals it.
+     """
+     flagFilter = ""
+     if flagged != "":
+         flagFilter = " where flag = '%s' " % flagged
+
+     copyStatement = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+     self.executeCommit(copyStatement + flagFilter)
+
+
+ def getTables(self, asname=""):
+     """ get a list of the table names in the main database, or in the
+         attached database asname when one is given.
+     """
+     if asname != "":
+         schemaPrefix = asname + "."
+     else:
+         schemaPrefix = ""
+
+     sql = self.getSqlCursor()
+     sql.execute("select name from %ssqlite_master where type='table'" % schemaPrefix)
+
+     return [row["name"] for row in sql.fetchall()]
+
+
+ def getSqlCursor(self):
+     """ returns a cursor to the memory database when the dataset is
+         memory backed and to the file database otherwise.
+     """
+     if self.memBacked:
+         return self.getMemCursor()
+
+     return self.getFileCursor()
+
+
+ def hasIndex(self):
+     """ check whether the RDS file has at least one index.
+     """
+     countRows = self.execute("select count(*) from sqlite_master where type='index'", returnResults=True)
+     indexCount = int(countRows[0][0])
+
+     return indexCount > 0
+
+
+ def initializeTables(self, dbConnection, cache=100000):
+     """ creates the table schema in dbConnection, which is typically a
+         database file or an in-memory database, and commits.
+
+         Sets the default cache size to cache pages, creates the
+         metadata table with its dataType entry, and creates the uniqs
+         and multi tables; an RNA dataset also gets the splices table.
+     """
+     schemaTemplate = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)"
+
+     dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+     dbConnection.execute("create table metadata (name varchar, value varchar)")
+     dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
+
+     readSchema = schemaTemplate % "start int, stop int"
+     for readTable in ("uniqs", "multi"):
+         dbConnection.execute("create table %s %s" % (readTable, readSchema))
+
+     if self.dataType == "RNA":
+         spliceSchema = schemaTemplate % "startL int, stopL int, startR int, stopR int"
+         dbConnection.execute("create table splices %s" % spliceSchema)
+
+     dbConnection.commit()
+
+
+ def getFileCursor(self):
+     """ returns a new cursor on the file database connection for
+         low-level (SQL) access to the data.
+     """
+     fileConnection = self.dbcon
+
+     return fileConnection.cursor()
+
+
+ def getMemCursor(self):
+     """ returns a new cursor on the memory database connection for
+         low-level (SQL) access to the data.
+     """
+     memoryConnection = self.memcon
+
+     return memoryConnection.cursor()
+
+
+ def getMetadata(self, valueName=""):
+     """ returns a dictionary of metadata, restricted to the entry named
+         valueName when one is given.
+
+         When a name occurs more than once, the first value is stored
+         under the name itself and the later ones under "name:2",
+         "name:3", and so on.
+     """
+     nameFilter = ""
+     if valueName != "":
+         nameFilter = " where name='%s'" % valueName
+
+     sql = self.getSqlCursor()
+     sql.execute("select name, value from metadata %s" % nameFilter)
+
+     resultsDict = {}
+     for row in sql.fetchall():
+         entryName = row["name"]
+         if entryName in resultsDict:
+             # find the first unused numbered variant of the name
+             suffix = 2
+             entryName = ":".join([row["name"], str(suffix)])
+             while entryName in resultsDict:
+                 suffix += 1
+                 entryName = ":".join([row["name"], str(suffix)])
+
+         resultsDict[entryName] = row["value"]
+
+     return resultsDict
+
+
+ def getReadSize(self):
+     """ returns the readsize defined in the metadata as an integer.
+
+         When the stored value contains the word "import", only its
+         first whitespace separated token is used.  Raises
+         ReadDatasetError when no readsize is defined.
+     """
+     metadata = self.getMetadata()
+     try:
+         storedSize = metadata["readsize"]
+     except KeyError:
+         raise ReadDatasetError("no readsize parameter defined")
+
+     if "import" in storedSize:
+         storedSize = storedSize.split()[0]
+
+     return int(storedSize)
+
+
+ def getDefaultCacheSize(self):
+     """ returns the default cache size reported by the database, as an
+         integer.
+     """
+     pragmaRows = self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)
+
+     return int(pragmaRows[0][0])
+
+
+ def getChromosomes(self, table="uniqs", fullChrom=True):
+     """ returns a sorted list of the distinct chromosomes in table.
+
+         With fullChrom off, the first three characters of every name
+         are dropped and names that are then empty are left out.
+     """
+     sql = self.getSqlCursor()
+     sql.execute("select distinct chrom from %s" % table)
+
+     chromosomes = []
+     for row in sql:
+         name = row["chrom"]
+         if not fullChrom:
+             name = name[3:]
+             if len(name.strip()) < 1:
+                 continue
+
+         if name not in chromosomes:
+             chromosomes.append(name)
+
+     chromosomes.sort()
+
+     return chromosomes
+
+
+ def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+                      doMulti=False, doSplices=False):
+     """ returns the maximum coordinate for reads on a given chromosome:
+         the largest start in uniqs, startR in splices and start in
+         multi, over the tables that are switched on, or 0 when none of
+         them yields a value.
+
+         A failure on the uniqs table is reported on standard output; a
+         missing value in the other two tables is ignored silently.
+     """
+     sql = self.getSqlCursor()
+     maxCoord = 0
+
+     if doUniqs:
+         try:
+             sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
+             maxCoord = int(sql.fetchall()[0][0])
+         except:
+             print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+     # splices is checked before multi; for these tables only the
+     # conversion of the result is guarded, not the query itself
+     otherTables = (("splices", "startR", doSplices), ("multi", "start", doMulti))
+     for (table, column, requested) in otherTables:
+         if not requested:
+             continue
+
+         sql.execute("select max(%s) from %s where chrom = '%s'" % (column, table, chrom))
+         try:
+             tableMax = int(sql.fetchall()[0][0])
+         except:
+             continue
+
+         if tableMax > maxCoord:
+             maxCoord = tableMax
+
+     if verbose:
+         print "%s maxCoord: %d" % (chrom, maxCoord)
+
+     return maxCoord
+
+
+ def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
+ flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
+ withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
+ readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
+ flagLike=False, strand='', combine5p=False):
+ """ returns a dictionary of reads in a variety of formats
+ and which can be restricted by chromosome or custom-flag.
+ Returns unique reads by default, but can return multireads
+ with doMulti set to True.
+
+ The result maps a chromosome name (or a read ID when readIDDict is
+ set) to a list of dictionaries, one per read. Every read dictionary
+ has a "start" entry; the bothEnds, noSense, withWeight, withFlag,
+ withMismatch, withID, withChrom and withPairID arguments control the
+ "stop", "sense", "weight", "flag", "mismatch", "readID", "chrom" and
+ "pairID" entries. With findallOptimize set, the result instead holds
+ one list under the key chrom whose entries have "start", "sense" and
+ the summed "weight".
+ """
+ whereClause = []
+ resultsDict = {}
+
+ # build the WHERE conditions; the chromosome condition is left out when
+ # chrom is empty or equal to self.memChrom
+ if chrom != "" and chrom != self.memChrom:
+ whereClause.append("chrom = '%s'" % chrom)
+
+ # flagLike switches the flag test from equality to a substring match
+ if flag != "":
+ if flagLike:
+ flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+ whereClause.append(flagLikeClause)
+ else:
+ whereClause.append("flag = '%s'" % flag)
+
+ # both coordinate bounds are exclusive
+ if start > -1:
+ whereClause.append("start > %d" % start)
+
+ if stop > -1:
+ whereClause.append("stop < %d" % stop)
+
+ # readLike is matched as a prefix of the read ID
+ if len(readLike) > 0:
+ readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
+ whereClause.append(readIDClause)
+
+ if hasMismatch:
+ whereClause.append("mismatch != ''")
+
+ if strand in ["+", "-"]:
+ whereClause.append("sense = '%s'" % strand)
+
+ if len(whereClause) > 0:
+ whereStatement = string.join(whereClause, " and ")
+ whereQuery = "where %s" % whereStatement
+ else:
+ whereQuery = ""
+
+ # choose the selected columns: findallOptimize sums the weights per
+ # (start, sense); otherwise the optional columns are added on request
+ groupBy = []
+ if findallOptimize:
+ selectClause = ["select start, sense, sum(weight)"]
+ groupBy = ["GROUP BY start, sense"]
+ else:
+ selectClause = ["select ID, chrom, start, readID"]
+ if bothEnds:
+ selectClause.append("stop")
+
+ if not noSense:
+ selectClause.append("sense")
+
+ if withWeight:
+ selectClause.append("weight")
+
+ if withFlag:
+ selectClause.append("flag")
+
+ if withMismatch:
+ selectClause.append("mismatch")
+
+ # NOTE(review): the LIMIT ends up in groupQuery, which is emitted before
+ # the "order by" appended further down - confirm that SQLite accepts a
+ # LIMIT placed before ORDER BY
+ if limit > 0 and not combine5p:
+ groupBy.append("LIMIT %d" % limit)
+
+ selectQuery = string.join(selectClause, ",")
+ groupQuery = string.join(groupBy)
+ # assemble the statement over uniqs, multi, or both joined by UNION ALL
+ if doUniqs:
+ stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
+ if doMulti:
+ stmt.append("UNION ALL")
+ stmt.append(selectQuery)
+ stmt.append("from multi")
+ stmt.append(whereQuery)
+ stmt.append(groupQuery)
+ else:
+ # NOTE(review): unlike the uniqs statement, this multi-only statement
+ # does not include groupQuery, so GROUP BY and LIMIT are not applied
+ # here - confirm whether that is intended
+ stmt = [selectQuery, "from multi", whereQuery]
+
+ # combine5p replaces the statement built above with one that groups the
+ # rows of a sub-select by chrom and start
+ if combine5p:
+ if findallOptimize:
+ selectQuery = "select start, sense, weight, chrom"
+
+ if doUniqs:
+ subSelect = [selectQuery, "from uniqs", whereQuery]
+ if doMulti:
+ subSelect.append("union all")
+ subSelect.append(selectQuery)
+ subSelect.append("from multi")
+ subSelect.append(whereQuery)
+ else:
+ subSelect = [selectQuery, "from multi", whereQuery]
+
+ sqlStmt = string.join(subSelect)
+ if findallOptimize:
+ selectQuery = "select start, sense, sum(weight)"
+
+ stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
+ selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
+
+ # pick the cursor and the sort order; findallOptimize switches off the
+ # row factory so that rows come back as plain tuples (it is restored
+ # after the query below)
+ if findallOptimize:
+ if self.memBacked:
+ self.memcon.row_factory = None
+ sql = self.memcon.cursor()
+ else:
+ self.dbcon.row_factory = None
+ sql = self.dbcon.cursor()
+
+ stmt.append("order by start")
+ elif readIDDict:
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ stmt.append("order by readID, start")
+ else:
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ stmt.append("order by chrom, start")
+
+ sqlQuery = string.join(stmt)
+ sql.execute(sqlQuery)
+
+ if findallOptimize:
+ resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
+ if self.memBacked:
+ self.memcon.row_factory = sqlite.Row
+ else:
+ self.dbcon.row_factory = sqlite.Row
+ else:
+ # the rows arrive sorted by the dictionary key, so a new result list
+ # is started whenever the key changes
+ currentChrom = ""
+ currentReadID = ""
+ pairID = 0
+ for row in sql:
+ readID = row["readID"]
+ # without fullChrom the first three characters of the name are dropped
+ if fullChrom:
+ chrom = row["chrom"]
+ else:
+ chrom = row["chrom"][3:]
+
+ if not readIDDict and chrom != currentChrom:
+ resultsDict[chrom] = []
+ currentChrom = chrom
+ dictKey = chrom
+ elif readIDDict:
+ # the part of the read ID after "::" is not used for the key
+ theReadID = readID
+ if "::" in readID:
+ theReadID = readID.split("::")[0]
+
+ # NOTE(review): pairID is only parsed in this readIDDict branch and
+ # from the full readID rather than theReadID; without readIDDict it
+ # keeps its initial value 0 - confirm that this is intended
+ if "/" in theReadID and withPairID:
+ (theReadID, pairID) = readID.split("/")
+
+ if theReadID != currentReadID:
+ resultsDict[theReadID] = []
+ currentReadID = theReadID
+ dictKey = theReadID
+
+ # build the read dictionary with the requested optional entries
+ newrow = {"start": int(row["start"])}
+ if bothEnds:
+ newrow["stop"] = int(row["stop"])
+
+ if not noSense:
+ newrow["sense"] = row["sense"]
+
+ if withWeight:
+ newrow["weight"] = float(row["weight"])
+
+ if withFlag:
+ newrow["flag"] = row["flag"]
+
+ if withMismatch:
+ newrow["mismatch"] = row["mismatch"]
+
+ if withID:
+ newrow["readID"] = readID
+
+ if withChrom:
+ newrow["chrom"] = chrom
+
+ if withPairID:
+ newrow["pairID"] = pairID
+
+ resultsDict[dictKey].append(newrow)
+
+ return resultsDict
+
+
+ def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+ flag="", withWeight=False, withFlag=False, withMismatch=False,
+ withID=False, withChrom=False, withPairID=False, readIDDict=False,
+ splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+ stop=-1, strand=""):
+ """ returns a dictionary of spliced reads in a variety of
+ formats and which can be restricted by chromosome or custom-flag.
+ Returns unique spliced reads for now.
+
+ Rows are read from the splices table ordered by chrom, startL and are
+ grouped under the chromosome name, or under the read ID (the text
+ before any "/") when readIDDict is set. Each entry is a dict holding
+ startL, stopL, startR and stopR plus whichever of sense, weight, flag,
+ mismatch, readID, chrom and pairID were requested. With splitRead every
+ splice contributes two entries: one carrying only the left coordinates
+ and one carrying only the right coordinates.
+
+ Optional filters: chrom, flag (exact match, or substring match when
+ flagLike is set), hasMismatch (non-empty mismatch field), strand,
+ start (startL > start) and stop (stopR < stop).
+ """
+ whereClause = []
+ resultsDict = {}
+
+ # no chromosome filter is emitted when chrom equals self.memChrom
+ if chrom != "" and chrom != self.memChrom:
+ whereClause = ["chrom = '%s'" % chrom]
+
+ if flag != "":
+ if flagLike:
+ flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+ whereClause.append(flagLikeClause)
+ else:
+ whereClause.append("flag = '%s'" % flag)
+
+ if hasMismatch:
+ whereClause.append("mismatch != ''")
+
+ if strand != "":
+ whereClause.append("sense = '%s'" % strand)
+
+ # both region bounds are exclusive
+ if start > -1:
+ whereClause.append("startL > %d" % start)
+
+ if stop > -1:
+ whereClause.append("stopR < %d" % stop)
+
+ if len(whereClause) > 0:
+ whereStatement = string.join(whereClause, " and ")
+ whereQuery = "where %s" % whereStatement
+ else:
+ whereQuery = ""
+
+ # optional columns are only selected when the caller asked for them
+ selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+ if not noSense:
+ selectClause.append("sense")
+
+ if withWeight:
+ selectClause.append("weight")
+
+ if withFlag:
+ selectClause.append("flag")
+
+ if withMismatch:
+ selectClause.append("mismatch")
+
+ selectQuery = string.join(selectClause, " ,")
+ # query the in-memory copy when one is active, the file database otherwise
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
+ sql.execute(stmt)
+ currentReadID = ""
+ currentChrom = ""
+ for row in sql:
+ pairID = 0
+ readID = row["readID"]
+ # without fullChrom the first three characters of the name are dropped
+ if fullChrom:
+ chrom = row["chrom"]
+ else:
+ chrom = row["chrom"][3:]
+
+ # a new (empty) result list is started whenever the grouping key changes
+ # NOTE(review): rows are ordered by chrom, startL, so in readIDDict mode a
+ # read ID that reappears later would restart its list - confirm intent
+ if not readIDDict and chrom != currentChrom:
+ resultsDict[chrom] = []
+ currentChrom = chrom
+ dictKey = chrom
+ elif readIDDict:
+ if "/" in readID:
+ (theReadID, pairID) = readID.split("/")
+ else:
+ theReadID = readID
+
+ if theReadID != currentReadID:
+ resultsDict[theReadID] = []
+ currentReadID = theReadID
+ dictKey = theReadID
+
+ newrow = {"startL": int(row["startL"])}
+ newrow["stopL"] = int(row["stopL"])
+ newrow["startR"] = int(row["startR"])
+ newrow["stopR"] = int(row["stopR"])
+ if not noSense:
+ newrow["sense"] = row["sense"]
+
+ if withWeight:
+ newrow["weight"] = float(row["weight"])
+
+ if withFlag:
+ newrow["flag"] = row["flag"]
+
+ if withMismatch:
+ newrow["mismatch"] = row["mismatch"]
+
+ if withID:
+ newrow["readID"] = readID
+
+ if withChrom:
+ newrow["chrom"] = chrom
+
+ # pairID is only parsed from the read ID in readIDDict mode; otherwise it stays 0
+ if withPairID:
+ newrow["pairID"] = pairID
+
+ if splitRead:
+ # the left half is a copy without the right coordinates; the right
+ # half reuses newrow itself with the left coordinates removed
+ leftDict = newrow.copy()
+ del leftDict["startR"]
+ del leftDict["stopR"]
+ rightDict = newrow
+ del rightDict["startL"]
+ del rightDict["stopL"]
+ resultsDict[dictKey].append(leftDict)
+ resultsDict[dictKey].append(rightDict)
+ else:
+ resultsDict[dictKey].append(newrow)
+
+ return resultsDict
+
+
+ def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
+ splices=False, reportCombined=True, sense="both"):
+ """ return read counts for a given region.
+ """
+ ucount = 0
+ mcount = 0
+ scount = 0
+ restrict = ""
+ if sense in ["+", "-"]:
+ restrict = " sense ='%s' " % sense
+
+ if uniqs:
+ try:
+ ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
+ except:
+ ucount = 0
+
+ if multi:
+ try:
+ mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
+ except:
+ mcount = 0
+
+ if splices:
+ try:
+ scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
+ except:
+ scount = 0
+
+ if reportCombined:
+ total = ucount + mcount + scount
+ return total
+ else:
+ return (ucount, mcount, scount)
+
+
+ def getTotalCounts(self, chrom="", rmin="", rmax=""):
+ """ return read counts for a given region.
+ """
+ return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
+
+
+ def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
+ """ returns the number of row in the uniqs table.
+ """
+ whereClause = []
+ count = 0
+
+ if chrom !="" and chrom != self.memChrom:
+ whereClause = ["chrom='%s'" % chrom]
+
+ if rmin != "":
+ whereClause.append("%s >= %s" % (startField, str(rmin)))
+
+ if rmax != "":
+ whereClause.append("%s <= %s" % (startField, str(rmax)))
+
+ if restrict != "":
+ whereClause.append(restrict)
+
+ if len(whereClause) > 0:
+ whereStatement = string.join(whereClause, " and ")
+ whereQuery = "where %s" % whereStatement
+ else:
+ whereQuery = ""
+
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ if distinct:
+ sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
+ else:
+ sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
+
+ result = sql.fetchone()
+
+ try:
+ count = int(result[0])
+ except:
+ count = 0
+
+ return count
+
+
+ def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ """ returns the number of row in the splices table.
+ """
+ return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
+
+
+ def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ """ returns the number of distinct readIDs in the uniqs table.
+ """
+ return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
+
+
+ def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ """ returns the total weight of readIDs in the multi table.
+ """
+ return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
+
+
+ def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
+ """ get readID's.
+ """
+ stmt = []
+ limitPart = ""
+ if limit > 0:
+ limitPart = "LIMIT %d" % limit
+
+ if uniqs:
+ stmt.append("select readID from uniqs")
+
+ if multi:
+ stmt.append("select readID from multi")
+
+ if splices:
+ stmt.append("select readID from splices")
+
+ if len(stmt) > 0:
+ selectPart = string.join(stmt, " union ")
+ else:
+ selectPart = ""
+
+ sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ sql.execute(sqlQuery)
+ result = sql.fetchall()
+
+ if paired:
+ return [x[0].split("/")[0] for x in result]
+ else:
+ return [x[0] for x in result]
+
+
+ def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
+ """ returns the uniq and spliced mismatches in a dictionary.
+
+ The result maps each chromosome (only mischrom when given, otherwise
+ every name from getChromosomes()) to a list of
+ [read start, mismatch position, change_base, change_from] entries.
+ Mismatch tokens containing "N" are skipped. Spliced reads are only
+ examined when useSplices is set and self.dataType is "RNA".
+ """
+ readlen = self.getReadSize()
+ if mischrom:
+ hitChromList = [mischrom]
+ else:
+ hitChromList = self.getChromosomes()
+ hitChromList.sort()
+
+ snpDict = {}
+ for achrom in hitChromList:
+ if verbose:
+ print "getting mismatches from chromosome %s" % (achrom)
+
+ snpDict[achrom] = []
+ hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
+ if useSplices and self.dataType == "RNA":
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
+ spliceIDList = spliceDict.keys()
+ for k in spliceIDList:
+ # only the first splice entry recorded for a read ID is examined
+ spliceEntry = spliceDict[k][0]
+ startpos = spliceEntry["startL"]
+ lefthalf = spliceEntry["stopL"]
+ rightstart = spliceEntry["startR"]
+ sense = spliceEntry["sense"]
+ mismatches = spliceEntry["mismatch"]
+ spMismatchList = mismatches.split(",")
+ for mismatch in spMismatchList:
+ if "N" in mismatch:
+ continue
+
+ # a token is read as: first char, integer position, last char
+ change_len = len(mismatch)
+ if sense == "+":
+ change_from = mismatch[0]
+ change_base = mismatch[change_len-1]
+ change_pos = int(mismatch[1:change_len-1])
+ elif sense == "-":
+ # minus strand: bases are reverse-complemented and the position
+ # is counted from the other end of the read
+ change_from = getReverseComplement([mismatch[0]])
+ change_base = getReverseComplement([mismatch[change_len-1]])
+ change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+ # NOTE(review): any other sense value leaves change_* unset or stale
+
+ # positions within the left block map from startL, the rest from startR
+ firsthalf = int(lefthalf)-int(startpos)+1
+ secondhalf = 0
+ if int(change_pos) <= int(firsthalf):
+ change_at = startpos + change_pos - 1
+ else:
+ secondhalf = change_pos - firsthalf
+ change_at = rightstart + secondhalf
+
+ snpDict[achrom].append([startpos, change_at, change_base, change_from])
+
+ # no unspliced reads with mismatches on this chromosome
+ if achrom not in hitDict.keys():
+ continue
+
+ for readEntry in hitDict[achrom]:
+ start = readEntry["start"]
+ sense = readEntry["sense"]
+ mismatches = readEntry["mismatch"]
+ mismatchList = mismatches.split(",")
+ for mismatch in mismatchList:
+ if "N" in mismatch:
+ continue
+
+ # same token decoding as for the spliced reads above
+ change_len = len(mismatch)
+ if sense == "+":
+ change_from = mismatch[0]
+ change_base = mismatch[change_len-1]
+ change_pos = int(mismatch[1:change_len-1])
+ elif sense == "-":
+ change_from = getReverseComplement([mismatch[0]])
+ change_base = getReverseComplement([mismatch[change_len-1]])
+ change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+
+ change_at = start + change_pos - 1
+ snpDict[achrom].append([start, change_at, change_base, change_from])
+
+ return snpDict
+
+
+ def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
+ useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+ keepStrand="both", shiftValue=0):
+ """return a profile of the chromosome as an array of per-base read coverage....
+ keepStrand = 'both', 'plusOnly', or 'minusOnly'.
+ Will also shift position of unique and multireads (but not splices) if shift is a natural number
+ """
+ metadata = self.getMetadata()
+ try:
+ readlen = int(metadata["readsize"])
+ except KeyError:
+ readlen = 0
+
+ dataType = metadata["dataType"]
+ scale = 1. / normalizationFactor
+ shift = {}
+ shift['+'] = int(shiftValue)
+ shift['-'] = -1 * int(shiftValue)
+
+ if cstop > 0:
+ lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
+ else:
+ lastNT = cstop - cstart + readlen + shift["+"]
+
+ chromModel = array("f",[0.] * lastNT)
+ hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
+ if cstart < 0:
+ cstart = 0
+
+ for readEntry in hitDict[chromosome]:
+ hstart = readEntry["start"]
+ sense = readEntry ["sense"]
+ weight = readEntry["weight"]
+ hstart = hstart - cstart + shift[sense]
+ for currentpos in range(hstart,hstart+readlen):
+ try:
+ if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
+ chromModel[currentpos] += scale * weight
+ elif sense == "-" and keepStrand != "plusOnly":
+ chromModel[currentpos] -= scale * weight
+ except:
+ continue
+
+ del hitDict
+ if useSplices and dataType == "RNA":
+ if cstop > 0:
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
+ else:
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
+
+ if chromosome in spliceDict:
+ for spliceEntry in spliceDict[chromosome]:
+ Lstart = spliceEntry["startL"]
+ Lstop = spliceEntry["stopL"]
+ Rstart = spliceEntry["startR"]
+ Rstop = spliceEntry["stopR"]
+ rsense = spliceEntry["sense"]
+ if (Rstop - cstart) < lastNT:
+ for index in range(abs(Lstop - Lstart)):
+ currentpos = Lstart - cstart + index
+ # we only track unique splices
+ if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+ chromModel[currentpos] += scale
+ elif rsense == "-" and keepStrand != "plusOnly":
+ chromModel[currentpos] -= scale
+
+ for index in range(abs(Rstop - Rstart)):
+ currentpos = Rstart - cstart + index
+ # we only track unique splices
+ if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+ chromModel[currentpos] += scale
+ elif rsense == "-" and keepStrand != "plusOnly":
+ chromModel[currentpos] -= scale
+
+ del spliceDict
+
+ return chromModel
+
+
+ def insertMetadata(self, valuesList):
+ """ inserts a list of (pname, pvalue) into the metadata
+ table.
+ """
+ self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def updateMetadata(self, pname, newValue, originalValue=""):
+ """ update a metadata field given the original value and the new value.
+ """
+ stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
+ if originalValue != "":
+ stmt += " and value='%s' " % str(originalValue)
+
+ self.dbcon.execute(stmt)
+ self.dbcon.commit()
+
+
+ def insertUniqs(self, valuesList):
+ """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+ into the uniqs table.
+ """
+ self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def insertMulti(self, valuesList):
+ """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+ into the multi table.
+ """
+ self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def insertSplices(self, valuesList):
+ """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
+ into the splices table.
+ """
+ self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
+ """ update reads on file database in a list region of regions for a chromosome to have a new flag.
+ regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
+ sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
+ """
+ restrict = ""
+ if sense != "both":
+ restrict = " and sense = ? "
+
+ if uniqs:
+ self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+ if multi:
+ self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+ if self.dataType == "RNA" and splices:
+ self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
+ self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
+
+ self.dbcon.commit()
+
+
+ def setFlags(self, flag, uniqs=True, multi=True, splices=True):
+ """ set the flag fields in the entire dataset.
+ """
+ if uniqs:
+ self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
+
+ if multi:
+ self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
+
+ if self.dataType == "RNA" and splices:
+ self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
+
+ self.dbcon.commit()
+
+
+ def resetFlags(self, uniqs=True, multi=True, splices=True):
+ """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
+ """
+ self.setFlags("", uniqs, multi, splices)
+
+
+ def reweighMultireads(self, readList):
+ self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
+
+
+ def setSynchronousPragma(self, value="ON"):
+ try:
+ self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
+ except:
+ print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
+
+
+ def setDBcache(self, cache, default=False):
+ self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
+ if default:
+ self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+
+
+ def execute(self, statement, returnResults=False):
+ sql = self.getSqlCursor()
+
+ sql.execute(statement)
+ if returnResults:
+ result = sql.fetchall()
+ return result
+
+
+ def executeCommit(self, statement):
+ self.execute(statement)
+
+ if self.memBacked:
+ self.memcon.commit()
+ else:
+ self.dbcon.commit()
+
+
+ def buildIndex(self, cache=100000):
+ """ Builds the file indices for the main tables.
+ Cache is the number of 1.5 kb pages to keep in memory.
+ 100000 pages translates into 150MB of RAM, which is our default.
+
+ Creates position (chrom, start) and chromosome indices on the uniqs
+ and multi tables and, for RNA datasets, on the splices table (both
+ startL and startR). A progress line is printed after each index.
+ """
+ # the cache is only ever raised above the database default, never lowered
+ if cache > self.getDefaultCacheSize():
+ self.setDBcache(cache)
+ # synchronous mode is switched off while the indices are built and restored at the end
+ self.setSynchronousPragma("OFF")
+ self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+ print "built uPosIndex"
+ self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
+ print "built uChromIndex"
+ self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+ print "built mPosIndex"
+ self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
+ print "built mChromIndex"
+
+ if self.dataType == "RNA":
+ self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
+ print "built sPosIndex"
+ self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
+ print "built sPosIndex2"
+ self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
+ print "built sChromIndex"
+
+ self.dbcon.commit()
+ self.setSynchronousPragma("ON")
+
+
+ def dropIndex(self):
+ """ drops the file indices for the main tables.
+
+ Removes the indices created by buildIndex. Any failure is reported
+ as a printed message rather than raised, and synchronous mode is
+ switched back on afterwards.
+ """
+ try:
+ self.setSynchronousPragma("OFF")
+ self.dbcon.execute("DROP INDEX uPosIndex")
+ self.dbcon.execute("DROP INDEX uChromIndex")
+ self.dbcon.execute("DROP INDEX mPosIndex")
+ self.dbcon.execute("DROP INDEX mChromIndex")
+
+ if self.dataType == "RNA":
+ self.dbcon.execute("DROP INDEX sPosIndex")
+ # a failure to drop sPosIndex2 is ignored on purpose
+ # NOTE(review): presumably some datasets were indexed without it - confirm
+ try:
+ self.dbcon.execute("DROP INDEX sPosIndex2")
+ except:
+ pass
+
+ self.dbcon.execute("DROP INDEX sChromIndex")
+
+ self.dbcon.commit()
+ except:
+ print "problem dropping index"
+
+ self.setSynchronousPragma("ON")
+
+
+ def memSync(self, chrom="", index=False):
+ """ makes a copy of the dataset into memory for faster access.
+ Can be restricted to a "full" chromosome. Can also build the
+ memory indices.
+
+ A fresh ":memory:" database replaces self.memcon, the metadata,
+ uniqs and multi tables (plus splices for RNA datasets) are copied
+ into it, self.memChrom records the chromosome restriction and
+ self.memBacked is set so that later queries use the copy.
+ """
+ # the placeholder assignment is immediately replaced by the new connection
+ self.memcon = ""
+ self.memcon = sqlite.connect(":memory:")
+ self.initializeTables(self.memcon)
+ cursor = self.dbcon.cursor()
+ whereclause = ""
+ if chrom != "":
+ print "memSync %s" % chrom
+ whereclause = " where chrom = '%s' " % chrom
+ self.memChrom = chrom
+ else:
+ self.memChrom = ""
+
+ self.memcon.execute("PRAGMA temp_store = MEMORY")
+ self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
+ # copy metadata to memory
+ self.memcon.execute("delete from metadata")
+ results = cursor.execute("select name, value from metadata")
+ results2 = []
+ for row in results:
+ results2.append((row["name"], row["value"]))
+
+ self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
+
+ self.copyDBEntriesToMemory("uniqs", whereclause)
+ self.copyDBEntriesToMemory("multi", whereclause)
+ if self.dataType == "RNA":
+ self.copySpliceDBEntriesToMemory(whereclause)
+
+ if index:
+ # a single-chromosome copy is indexed on position alone; a full copy
+ # is indexed on (chrom, position)
+ if chrom != "":
+ self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
+ self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
+ if self.dataType == "RNA":
+ self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
+ self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
+ else:
+ self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+ self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+ if self.dataType == "RNA":
+ self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
+ self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
+
+ self.memBacked = True
+ # rows of the memory copy are addressed by column name from here on
+ self.memcon.row_factory = sqlite.Row
+ self.memcon.commit()
+
+
+ def copyDBEntriesToMemory(self, dbName, whereClause=""):
+ cursor = self.dbcon.cursor()
+ sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
+ destinationEntries = []
+ for row in sourceEntries:
+ destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+ self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
+
+
+ def copySpliceDBEntriesToMemory(self, whereClause=""):
+ cursor = self.dbcon.cursor()
+ sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
+ destinationEntries = []
+ for row in sourceEntries:
+ destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
+ row["weight"], row["flag"], row["mismatch"]))
+
+ self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
+
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+print 'version 3.6'
+
+import sys, optparse
+from commoncode import readDataset
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s rdsfile outfilename [--cache pages]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--cache", type="int", dest="numCachePages",
+ help="number of cache pages to use [default: 100000]")
+ parser.set_defaults(numCachePages=None)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ sys.exit(1)
+
+ hitfile = args[0]
+ outfilename = args[1]
+
+ if options.numCachePages is not None:
+ doCache = True
+ cachePages = options.numCachePages
+ else:
+ doCache = False
+ cachePages = 100000
+
+ altSpliceCounts(hitfile, outfilename, doCache, cachePages)
+
+
+def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
+ startDict = {}
+ stopDict = {}
+ resultDict = {}
+
+ hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ if cachePages > hitRDS.getDefaultCacheSize():
+ hitRDS.setDBcache(cachePages)
+
+ readlen = hitRDS.getReadSize()
+ hitDict = hitRDS.getSplicesDict(noSense=True)
+ outfile = open(outfilename,'w')
+
+ for chrom in hitDict:
+ startDict[chrom] = []
+ stopDict[chrom] = []
+ resultDict[chrom] = []
+
+ index = 0
+ for chrom in hitDict:
+ for (tagStart, lstop, rstart, tagStop) in hitDict[chrom]:
+ index += 1
+ length = tagStop - tagStart
+ if length < readlen + 5:
+ continue
+
+ startDict[chrom].append((tagStart, length))
+ stopDict[chrom].append((tagStop, length))
+
+ startDict[chrom].sort()
+ stopDict[chrom].sort()
+
+ spliceEvent = 0
+ altSpliceEvent = 0
+ alternative = 1
+ for chrom in startDict:
+ firstIndex = 0
+ maxIndex = len(startDict[chrom])
+ while firstIndex < maxIndex:
+ (fstart, flen) = startDict[chrom][firstIndex]
+ (start, length) = (fstart, flen)
+ secondIndex = firstIndex
+ secondLengths = []
+ while (start - fstart) < readlen:
+ if secondIndex >= maxIndex:
+ break
+
+ (start, length) = startDict[chrom][secondIndex]
+ if (start - fstart) < readlen and abs(length - flen) > readlen:
+ line = (chrom, fstart, fstart + flen, chrom, start, start + length)
+ alreadySeen = False
+ for slength in secondLengths:
+ if abs(slength - length) < readlen:
+ alreadySeen = True
+
+ if len(resultDict[chrom]) == 0:
+ resultDict[chrom].append(line)
+ elif line != resultDict[chrom][-1] and not alreadySeen:
+ resultDict[chrom].append(line)
+ secondLengths.append(length)
+ altSpliceEvent += 1
+ spliceEvent += 1
+
+ secondIndex += 1
+
+ firstIndex = secondIndex
+ spliceEvent += 1
+
+ firstIndex = 0
+ maxIndex = len(stopDict[chrom])
+ while firstIndex < maxIndex:
+ (fstop, flen) = stopDict[chrom][firstIndex]
+ (stop, length) = (fstop, flen)
+ secondIndex = firstIndex
+ secondLengths = []
+ while (stop - fstop) < readlen:
+ if secondIndex >= maxIndex:
+ break
+ (stop, length) = stopDict[chrom][secondIndex]
+ if (stop - fstop) < readlen and abs(length - flen) > readlen:
+ line = (chrom, fstop - flen, fstop, chrom, stop - length, stop)
+ alreadySeen = False
+ for slength in secondLengths:
+ if abs(slength - length) < readlen:
+ alreadySeen = True
+
+ if len(resultDict[chrom]) == 0:
+ resultDict[chrom].append(line)
+
+ if line != resultDict[chrom][-1] and not alreadySeen:
+ resultDict[chrom].append(line)
+ secondLengths.append(length)
+ altSpliceEvent += 1
+ spliceEvent += 1
+
+ secondIndex += 1
+
+ firstIndex = secondIndex
+ spliceEvent += 1
+
+ resultDict[chrom].sort()
+ for line in resultDict[chrom]:
+ outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n' % line)
+ alternative += 1
+
+ print chrom, maxIndex, spliceEvent, altSpliceEvent
+
+ print spliceEvent, altSpliceEvent
+ outfile.close()
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ print "psyco not running"
+
+import sys, optparse
+from cistematic.cisstat.analyzego import calculateGOStats
+from cistematic.core.geneinfo import geneinfoDB
+
+print "version 2.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome infilename prefix [--geneName] [--field fieldID]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--geneName", action="store_true", dest="translateGene",
+ help="translate gene")
+ parser.add_option("--field", type="int", dest="fieldID",
+ help="column containing gene ID/Name")
+ parser.set_defaults(translateGene=False, fieldID=None)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ fieldID = 1
+ if options.translateGene:
+ fieldID = 0
+
+ if options.fieldID is not None:
+ fieldID = options.fieldID
+
+ genome = args[0]
+ infilename = args[1]
+ prefix = args[2]
+
+ analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
+
+
+def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
+ infile = open(infilename)
+ analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1)
+ infile.close()
+
+
+def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
+ if translateGene:
+ idb = geneinfoDB(cache=True)
+ geneinfoDict = idb.getallGeneInfo(genome)
+ symbolToGidDict = {}
+ for gid in geneinfoDict:
+ symbol = geneinfoDict[gid][0][0].strip()
+ symbolToGidDict[symbol] = gid
+
+ locusList = []
+ for line in geneInfoList:
+ fields = line.split()
+ if translateGene:
+ gene = fields[fieldID]
+ if "LOC" in gene:
+ gID = gene[3:]
+ elif "FAR" in gene:
+ print "ignoring %s" % gene
+ continue
+ else:
+ try:
+ gID = symbolToGidDict[gene]
+ except KeyError:
+ print "ignoring %s" % gene
+ continue
+ else:
+ gID = fields[fieldID]
+
+ if (genome, gID) not in locusList:
+ locusList.append((genome, gID))
+
+ if len(locusList) > 0:
+ calculateGOStats(locusList, prefix)
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys, string
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s factorlabel bedinfilename regionoutfile" % sys.argv[0]
+
+ if len(argv) < 4:
+ print usage
+ sys.exit(1)
+
+ factor = argv[1]
+ infilename = argv[2]
+ outfilename = argv[3]
+
+ bedToRegion(factor, infilename, outfilename)
+
+
+def bedToRegion(factor, infilename, outfilename):
+ index = 1
+ infile = open(infilename)
+ outfile = open(outfilename, 'w')
+ for line in infile:
+ if 'track' in line:
+ continue
+ fields = line.split()
+ line = string.join(fields, '\t')
+ outfile.write('%s%d\t%s\n' % (factor, index, line))
+ index += 1
+ infile.close()
+ outfile.close()
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+
+print 'version 1.0'
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ if len(argv) < 2:
+ print 'usage: python %s infile outfile' % sys.argv[0]
+ sys.exit(1)
+
+ infilename = argv[0]
+ outfilename = argv[1]
+
+ binToCDF(infilename, outfilename)
+
+
+def binToCDF(infilename, outfilename):
+ infile = open(infilename)
+ outfile = open(outfilename, 'w')
+
+ for line in infile:
+ fields = line.strip().split()
+ if len(fields) < 4:
+ continue
+
+ total = int(fields[2])
+ if total == 0:
+ outfile.write(line)
+ continue
+
+ outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3]))
+ cum = 0
+ for bin in fields[4:]:
+ cum += int(bin)
+ percent = 100 * cum / total
+ outfile.write('\t%d' % percent)
+
+ outfile.write('\n')
+
+ infile.close()
+ outfile.close()
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# buildMatrix.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 3/6/09.
+#
+import sys, string, optparse
+from commoncode import writeLog
+
+versionString = "%prog: version 1.3"
+print versionString
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog matrix.step.N-1 data.part matrix.step.N [--rescale] [--truncate maxRPKM] [--log altlogfile]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--rescale", action="store_true", dest="rescale")
+ parser.add_option("--truncate", type="int", dest="maxRPKM")
+ parser.add_option("--log", dest="logfilename")
+ parser.set_defaults(rescale=False, maxRPKM=None, logfilename="buildMatrix.log")
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(0)
+
+ infile = args[0]
+ colfilename = args[1]
+ outfilename = args[2]
+
+ if options.maxRPKM is not None:
+ truncateRPKM = True
+ maxRPKM = options.maxRPKM
+ else:
+ truncateRPKM = False
+ maxRPKM = 100000000
+
+ buildMatrix(infile, colfilename, outfilename, truncateRPKM, maxRPKM,
+ options.rescale, options.logfilename)
+
+
+def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
+                maxRPKM=100000000, rescale=False, logfilename="buildMatrix.log"):
+    """Append the last field of every data line of colfilename as a new column.
+
+    The column is named after the base name of colfilename up to its first
+    ".".  The header of inFileName (or "#" plus a tab if it is empty) is
+    extended with that name, and each following line of inFileName is extended
+    with one value formatted as %1.3f.
+
+    truncateRPKM -- when true, values above maxRPKM are replaced by maxRPKM.
+    rescale      -- when true, values are mapped linearly so that the smallest
+                    becomes 0 and the largest 1 (all 0 for a constant column).
+    logfilename  -- log file handed to writeLog for the start and result
+                    messages.
+    """
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+
+    # split() returns the whole name when there is no "/" in it
+    colname = colfilename.split("/")[-1]
+    colID = colname.split(".")[0]
+
+    values = []
+    lowest = 20000000000.
+    highest = -1.
+    untruncatedMax = -1.
+    colfile = open(colfilename)
+    try:
+        for line in colfile:
+            # blank lines carry no value; they used to crash on fields[-1]
+            if not line.strip() or doNotProcessLine(line):
+                continue
+
+            val = float(line.split()[-1])
+            if truncateRPKM and val > maxRPKM:
+                untruncatedMax = max(untruncatedMax, val)
+                val = maxRPKM
+
+            values.append(val)
+            lowest = min(lowest, val)
+            highest = max(highest, val)
+    finally:
+        colfile.close()
+
+    finalValues = values
+    if rescale:
+        span = highest - lowest
+        if span == 0:
+            # a constant column has nothing to spread out; avoid dividing by zero
+            finalValues = [0.0 for val in values]
+        else:
+            finalValues = [(val - lowest) / span for val in values]
+
+    infile = open(inFileName)
+    outfile = open(outfilename, "w")
+    try:
+        header = infile.readline().rstrip("\n")
+        if header.strip() == "":
+            header = "#\t"
+
+        outfile.write("%s\t%s\n" % (header, colID))
+        for val in finalValues:
+            line = infile.readline().strip()
+            outfile.write("%s\t%1.3f\n" % (line, val))
+    finally:
+        outfile.close()
+        infile.close()
+
+    # report the real maximum, not the cap, when something was truncated
+    reportedMax = highest
+    if untruncatedMax > 0:
+        reportedMax = untruncatedMax
+
+    message = "max value in %s was %.2f" % (colname, reportedMax)
+    if untruncatedMax > 0:
+        message += " but was truncated to %d" % maxRPKM
+
+    print message
+    writeLog(logfilename, versionString, message)
+
+
+def doNotProcessLine(line):
+    """Return True for comment lines, i.e. lines whose first character is "#"."""
+    # startswith() is also safe on an empty string, where line[0] raised IndexError
+    return line.startswith("#")
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import sqlite3 as sqlite
+import os
+
+def main(argv=None):
+    """Command-line entry point: build a repeats database from a directory.
+
+    Takes the rmask directory and the database file name as arguments; prints
+    a usage message and exits with status 1 when either is missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    print "version 2.0"
+    if len(argv) < 3:
+        print "usage: python %s rmaskdir rmaskdbfile" % argv[0]
+        # sys.exit() instead of the builtin exit(), which is only installed by
+        # the site module for interactive use
+        sys.exit(1)
+
+    rmaskdir = argv[1]
+    rmaskdb = argv[2]
+
+    buildrmaskdb(rmaskdir, rmaskdb)
+
+
+def buildrmaskdb(rmaskdir, rmaskdb):
+    """Load every file of rmaskdir whose name contains "rmsk" into sqlite.
+
+    Creates the table repeats(chrom, start, stop, name, family) in the database
+    file rmaskdb.  Each input line is split on tabs: field 5 without its first
+    three characters is the chromosome, fields 6 and 7 are start and stop,
+    field 10 the name and field 12 the family.  Two indexes are built once all
+    files have been loaded.
+    """
+    db = sqlite.connect(rmaskdb)
+    sql = db.cursor()
+    sql.execute("create table repeats (chrom varchar, start int, stop int, name varchar, family varchar)")
+    sql.execute("PRAGMA temp_store = MEMORY")
+    sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
+    db.commit()
+
+    for filename in os.listdir(rmaskdir):
+        if "rmsk" not in filename:
+            continue
+
+        print filename
+        infile = open(os.path.join(rmaskdir, filename))
+        try:
+            for entry in infile:
+                fields = entry.strip().split("\t")
+                row = (fields[5][3:], int(fields[6]), int(fields[7]), fields[10], fields[12])
+                # bound parameters keep quotes in names from breaking the statement
+                sql.execute("insert into repeats values (?, ?, ?, ?, ?)", row)
+        finally:
+            infile.close()
+
+        db.commit()
+
+    print "building index..."
+    sql.execute("PRAGMA SYNCHRONOUS = OFF")
+    sql.execute("create index chromIndex on repeats(chrom)")
+    sql.execute("create index mainIndex on repeats(chrom, start, stop)")
+    db.commit()
+    db.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+"""
+creates table snp {chrom varchar,
+ start int,
+ stop int,
+ name varchar,
+ observed varchar,
+ strand varchar,
+ ucscref varchar,
+ ncbiref varchar,
+ func varchar,
+ moltype varchar,
+ valid varchar,
+ class varchar
+}
+
+sample line in dbsnp file
+608 chr1 3093453 3093454 rs52602943 0 + G G C/G genomic single unknown 0 0 unknown exact 1
+"""
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+import sys
+import sqlite3 as sqlite
+
+def main(argv=None):
+    """Command-line entry point: build a SNP database from a dbSNP text file.
+
+    Prints a usage message and exits with status 1 unless both the SNP file
+    and the database name are given.
+    """
+    argv = argv or sys.argv
+
+    print "version 2.0"
+    if len(argv) < 3:
+        print "usage: python %s snpfile snpdbname" % argv[0]
+        sys.exit(1)
+
+    snpfilename, snpdb = argv[1], argv[2]
+    buildsnpdb(snpfilename, snpdb)
+
+
+def buildsnpdb(snpfilename, snpdb):
+    """Load the tab-separated SNP file snpfilename into the sqlite file snpdb.
+
+    Creates the table snp and fills it in batches of 100000 rows, printing the
+    running row count after each batch.  Lines that have too few fields or
+    non-integer coordinates are skipped.  Two indexes are created at the end.
+    """
+    insertStatement = "insert into snp values (?,?,?,?,?,?,?,?,?,?,?,?)"
+
+    db = sqlite.connect(snpdb)
+    sql = db.cursor()
+    sql.execute("create table snp (chrom varchar, start long, stop long, name varchar, observed varchar, strand varchar, ucscref varchar, ncbiref varchar, func varchar, moltype varchar, valid varchar, class varchar)")
+    sql.execute("PRAGMA temp_store = MEMORY")
+    sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
+    db.commit()
+
+    insertSize = 100000
+    insertCounter = 0
+    valuesList = []
+    print snpfilename
+    infile = open(snpfilename)
+    try:
+        for entry in infile:
+            fields = entry.strip().split("\t")
+            try:
+                chrom = fields[1][3:]
+                start = int(fields[2])
+                stop = int(fields[3])
+                name = fields[4]
+                strand = fields[6]
+                refNcbi = fields[7]
+                refUcsc = fields[8]
+                observed = fields[9]
+                molType = fields[10]
+                classes = fields[11]
+                valid = fields[12]
+                func = fields[15]
+            except (IndexError, ValueError):
+                # truncated line or non-numeric coordinate: skip the record
+                continue
+
+            valuesList.append((chrom, start, stop, name, observed, strand, refUcsc, refNcbi, func, molType, valid, classes))
+            insertCounter += 1
+
+            if insertCounter % insertSize == 0:
+                print insertCounter
+                db.executemany(insertStatement, valuesList)
+                valuesList = []
+    finally:
+        infile.close()
+
+    if len(valuesList) > 0:
+        db.executemany(insertStatement, valuesList)
+
+    db.commit()
+
+    print "building index"
+    sql.execute("PRAGMA SYNCHRONOUS = OFF")
+    sql.execute("create index chromIndex on snp(chrom)")
+    sql.execute("create index mainIndex on snp(chrom,start,stop)")
+    db.commit()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+
+def main(argv=None):
+    """Command-line entry point: python <script> bins percent infile.
+
+    Prints a usage message and exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 4:
+        print "usage: python %s bins percent infile" % argv[0]
+        sys.exit(1)
+
+    # argv[0] is the script name, so the three arguments start at index 1
+    bins = int(argv[1])
+    percent = int(argv[2])
+    infilename = argv[3]
+
+    cdfDist(bins, percent, infilename)
+
+
+def cdfDist(bins, percent, infilename):
+    """Print, per bin, how many lines first exceed percent in that bin.
+
+    The last `bins` whitespace-separated fields of each line are read as
+    integers; the counter belonging to the first one greater than percent is
+    incremented.  The list of counters is printed after the file was read.
+    """
+    infile = open(infilename)
+    counters = [0] * bins
+
+    for record in infile:
+        cdfValues = record.split()[-bins:]
+        for position, binCdf in enumerate(cdfValues):
+            if int(binCdf) > percent:
+                counters[position] += 1
+                break
+
+    infile.close()
+    print counters
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sqlite3 as sqlite
+import sys, string, optparse
+import os.path
+from commoncode import writeLog
+
+versionString = "%prog: version 3.5"
+print versionString
+
+
+def main(argv=None):
+    """Command-line entry point: check features against a repeats database.
+
+    Needs four positional arguments (database, input, output and "good"
+    output file) and accepts --cache, --startField and --log.  Prints the
+    usage string and exits with status 1 when arguments are missing.
+    """
+    argv = argv or sys.argv
+
+    usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", type="int", dest="cachePages", default=500000)
+    parser.add_option("--startField", type="int", dest="startField", default=0)
+    parser.add_option("--log", dest="logfilename", default=None)
+    options, positional = parser.parse_args(argv[1:])
+
+    if len(positional) < 4:
+        print usage
+        sys.exit(1)
+
+    dbfile, filename, outfile, goodfile = positional[:4]
+
+    checkrmask(dbfile, filename, outfile, goodfile, options.startField,
+               options.cachePages, options.logfilename)
+
+
+def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
+    """Annotate features with the repeats they overlap and list repeat-poor ones.
+
+    Every non-comment line of filename is split on tabs; the chromosome (first
+    three characters dropped), start and stop are read from startField onwards.
+    For each overlapping entry of the repeats table in dbfile, the feature line
+    is written to outFileName followed by the repeat name, family and overlap
+    fraction; features without overlap get NR, NR and 0.00 instead.  Features
+    whose fractions sum to less than 0.2 are copied to goodFileName.  If
+    dbfile is not an existing file, every input line is passed through as
+    repeat free and the function returns.
+
+    startField values below 0 are treated as 0 and cachePages is raised to at
+    least 250000.  When logfilename is given, the run is recorded via writeLog.
+    """
+    outfile = open(outFileName, "w")
+    goodfile = open(goodFileName, "w")
+    if startField < 0:
+        startField = 0
+
+    if cachePages < 250000:
+        cachePages = 250000
+
+    doLog = logfilename is not None
+    if doLog:
+        writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+
+    infile = open(filename)
+    db = None
+    con = None
+    try:
+        if not os.path.isfile(dbfile):
+            print "No database - passing through"
+            if doLog:
+                writeLog(logfilename, versionString, "No database - passing through")
+
+            for line in infile:
+                # drop the newline first so the NR columns stay on the feature's line
+                outfile.write("%s\tNR\tNR\t0.00\n" % line.rstrip("\n"))
+                goodfile.write(line)
+
+            # return instead of sys.exit(0) so importing callers are not terminated
+            return
+
+        db = sqlite.connect(dbfile)
+        sql = db.cursor()
+        sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+        sql.execute("PRAGMA temp_store = MEMORY")
+
+        featureList = []
+        featureDict = {}
+        for line in infile:
+            if line[0] == "#":
+                continue
+
+            fields = line.strip().split("\t")
+            chrom = fields[startField][3:]
+            start = int(fields[startField + 1])
+            stop = int(fields[startField + 2])
+            featureList.append((chrom, start, stop))
+            featureDict[(chrom, start, stop)] = line.strip()
+
+        featureList.sort()
+        increment = 20000000
+        currentChrom = None
+        currentMin = 0
+        currentMax = 0
+        sql2 = None
+        for (chrom, start, stop) in featureList:
+            needCache = False
+            if chrom != currentChrom:
+                # a new chromosome always starts with a fresh first window
+                currentChrom = chrom
+                currentMin = 0
+                currentMax = increment
+                needCache = True
+
+            # advance until the window holds this start; a single step would
+            # leave sparse features querying a window that does not contain them
+            while start > currentMax:
+                currentMin = currentMax
+                currentMax += increment
+                needCache = True
+
+            if needCache:
+                print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
+                if con is not None:
+                    con.close()
+
+                con = _cacheRepeats(sql, chrom, currentMin, currentMax, cachePages)
+                sql2 = con.cursor()
+
+            finalresults = _findRepeatOverlaps(sql2, start, stop)
+
+            line = featureDict[(chrom, start, stop)]
+            total = 0.
+            for (name, family, fraction) in finalresults:
+                outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
+                total += fraction
+                print outline
+                outfile.write(outline + "\n")
+
+            if len(finalresults) == 0:
+                # literal 0.00: the former "%0.00" was an invalid format and raised
+                outline = "%s\tNR\tNR\t0.00" % line
+                print outline
+                outfile.write(outline + "\n")
+
+            if total < 0.2:
+                goodfile.write(line + "\n")
+    finally:
+        for handle in (con, db, infile, goodfile, outfile):
+            if handle is not None:
+                handle.close()
+
+
+def _cacheRepeats(sql, chrom, windowStart, windowStop, cachePages):
+    """Copy the repeats of chrom that start inside the window into memory.
+
+    Returns a connection to an in-memory database holding an indexed table
+    repeats(name, family, start, stop).
+    """
+    # NOTE(review): repeats starting before windowStart are not cached, so a
+    # repeat reaching into the window from the left is not seen - confirm intent
+    sql.execute("select start, stop, name, family from repeats where chrom = ? and start >= ? and start <= ? order by start",
+                (chrom, windowStart, windowStop + 10000))
+    rows = [(name, family, int(rstart), int(rstop)) for (rstart, rstop, name, family) in sql.fetchall()]
+
+    con = sqlite.connect(":memory:")
+    con.execute("create table repeats(name, family, start, stop)")
+    con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+    con.execute("PRAGMA temp_store = MEMORY")
+    con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", rows)
+    con.execute("CREATE INDEX posIndex on repeats(start, stop)")
+    print chrom, len(rows)
+
+    return con
+
+
+def _findRepeatOverlaps(cursor, start, stop):
+    """Return the distinct (name, family, fraction) repeats touching a feature.
+
+    Four cases are queried in turn: repeats spanning the feature start, repeats
+    spanning the feature stop, repeats containing the feature and repeats
+    inside the feature.  Each overlap length is capped at the feature length.
+    """
+    featureLength = abs(stop - start)
+    cases = [("start < ? and stop > ?", (start, start), lambda rstart, rstop: abs(rstop - start)),
+             ("start < ? and stop > ?", (stop, stop), lambda rstart, rstop: abs(rstart - stop)),
+             ("start <= ? and stop >= ?", (start, stop), lambda rstart, rstop: abs(rstop - rstart)),
+             ("start >= ? and stop <= ?", (start, stop), lambda rstart, rstop: abs(rstop - rstart))]
+
+    overlaps = []
+    for (condition, bounds, overlapOf) in cases:
+        cursor.execute("select start, stop, name, family from repeats where " + condition, bounds)
+        for (rstart, rstop, name, family) in cursor.fetchall():
+            overlapLength = min(overlapOf(rstart, rstop), featureLength)
+            ratio = overlapLength / float(featureLength)
+            if (name, family, ratio) not in overlaps:
+                overlaps.append((name, family, ratio))
+
+    return overlaps
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+10
+
+dir
+23
+file:///Users/sau/svn/repos/erange/source/Erange/chiapet
+file:///Users/sau/svn/repos
+
+
+
+2010-10-01T18:32:26.347691Z
+22
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+d8abe7b5-3c2c-4fba-ae09-e6a8aa828af9
+\f
+segregateLinkers.py
+file
+
+
+
+
+2010-09-15T19:01:49.000000Z
+a847d39676e6a4fb9501811ab9a4c0b9
+2010-09-15T19:02:48.670738Z
+21
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2380
+\f
+__init__.py
+file
+
+
+
+
+2010-09-15T19:01:49.000000Z
+d41d8cd98f00b204e9800998ecf8427e
+2010-09-15T19:02:48.670738Z
+21
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+0
+\f
+markLinkers.py
+file
+
+
+
+
+
+10c527dc803a21ba14dfd8efc4f1e3d3
+2010-10-01T18:32:26.347691Z
+22
+sau
+\f
+linkers.fa
+file
+
+
+
+
+2010-09-15T19:01:49.000000Z
+2b64087c826083f04e0ff968312e019a
+2010-09-15T19:02:48.670738Z
+21
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+63
+\f
--- /dev/null
+>linker_b.1
+GTTGGATAAGATATCGCGG
+>linker_b.2
+GTTGGAATGTATATCGCGG
\ No newline at end of file
--- /dev/null
+import sys
+
+def main(argv=None):
+    """Command-line entry point: python <script> linkerfile infile outfile.
+
+    Prints a usage message and exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # without this check a missing argument surfaced as a bare IndexError
+    if len(argv) < 4:
+        print "usage: python %s linkerfile infile outfile" % argv[0]
+        sys.exit(1)
+
+    linkerfile = argv[1]
+    infile = argv[2]
+    outfile = argv[3]
+
+    markLinkers(linkerfile, infile, outfile)
+
+
+def markLinkers(linkerFileName, inFileName, outFileName):
+    """Label every read of inFileName with the linker found in its sequence.
+
+    Lines shorter than two characters are ignored.  A line containing "@"
+    sets the current read ID (all "@" removed); any other line is treated as a
+    sequence.  For each linker whose stored prefix occurs at offset 19 or
+    later, a record ">L<last character of the linker ID>_<read ID>" followed
+    by the first 20 characters of the line is written to outFileName; when no
+    linker matches, the record is labelled ">NA_<read ID>".
+    """
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+    try:
+        linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName)
+
+        for line in infile:
+            if len(line) < 2:
+                continue
+
+            if "@" in line:
+                readID = line.strip().replace("@", "")
+                continue
+
+            found = False
+            for linkerID in linkerList:
+                if line.find(linkerDict[linkerID]) >= 19:
+                    found = True
+                    outfile.write(">L%s_%s\n" % (linkerID[-1:], readID))
+                    outfile.write("%s\n" % line[:20])
+
+            if not found:
+                outfile.write(">NA_%s\n" % readID)
+                outfile.write("%s\n" % line[:20])
+    finally:
+        # close explicitly so the output is flushed even when an error occurs
+        outfile.close()
+        infile.close()
+
+
+def getLinkerInformationFromFile(linkerFileName):
+    """Read linker IDs and sequence prefixes from the file linkerFileName.
+
+    Returns the (linkerDict, linkerList) pair produced by getLinkerInformation,
+    or an empty dict and an empty list when the file cannot be read.
+    """
+    try:
+        linkerfile = open(linkerFileName)
+        try:
+            return getLinkerInformation(linkerfile)
+        finally:
+            # the file is fully consumed by then; do not leave the handle open
+            linkerfile.close()
+    except IOError:
+        return {}, []
+
+
+def getLinkerInformation(linkerInformationList):
+    """Parse FASTA-style linker lines into a lookup table and an ordered list.
+
+    linkerInformationList -- iterable of lines; a line containing ">" names a
+                             linker (its first character is dropped), any
+                             other non-blank line is that linker's sequence.
+
+    Returns (linkerDict, linkerList): linkerDict maps each linker ID to the
+    first 10 characters of its sequence, linkerList holds the IDs in input
+    order.  Blank lines and sequences before the first header are ignored.
+    """
+    linkerDict = {}
+    linkerList = []
+    linkerID = None
+
+    for entry in linkerInformationList:
+        entry = entry.strip()
+        if not entry:
+            # a blank line used to overwrite the current linker's sequence with ""
+            continue
+
+        if ">" in entry:
+            linkerID = entry[1:]
+            linkerList.append(linkerID)
+        elif linkerID is not None:
+            linkerDict[linkerID] = entry[:10]
+
+    return linkerDict, linkerList
+
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+
+
+def main(argv=None):
+    """Command-line entry point: python <script> infile1 infile2 outprefix.
+
+    Prints a usage message and exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # without this check a missing argument surfaced as a bare IndexError
+    if len(argv) < 4:
+        print "usage: python %s infile1 infile2 outprefix" % argv[0]
+        sys.exit(1)
+
+    infile1 = argv[1]
+    infile2 = argv[2]
+    outprefix = argv[3]
+
+    segregateLinkers(infile1, infile2, outprefix)
+
+
+def segregateLinkers(infile1name, infile2name, outprefix):
+    """Sort paired linker-labelled reads into four FASTA files by linker pair.
+
+    The two inputs are read in lockstep.  A line containing ">" is a header of
+    the form "<linker>_<read id>"; the read IDs up to the first "/" must agree
+    between both files, otherwise the two IDs are printed and the program
+    exits with status 1.  Each following line pair is written, prefixed with
+    its header, to one of
+
+        <outprefix>.NA.fa     either linker label contains "NA"
+        <outprefix>.same1.fa  identical labels containing "L1"
+        <outprefix>.same2.fa  identical labels without "L1"
+        <outprefix>.mixed.fa  different labels
+
+    and the four counts (same1, same2, mixed, NA) are printed at the end.  A
+    header that does not split into exactly two parts is printed together with
+    the following line pair before exiting with status 1.
+    """
+    infile1 = open(infile1name)
+    infile2 = open(infile2name)
+    same1 = 0
+    same2 = 0
+    mixed = 0
+    hasNA = 0
+
+    outsame1 = open("%s.same1.fa" % outprefix, "w")
+    outsame2 = open("%s.same2.fa" % outprefix, "w")
+    outNA = open("%s.NA.fa" % outprefix, "w")
+    outmixed = open("%s.mixed.fa" % outprefix, "w")
+
+    try:
+        failed = False
+        for line1 in infile1:
+            line2 = infile2.readline()
+            if failed:
+                # the previous header could not be parsed: show context and stop
+                line2 = infile2.readline()
+                print line1.strip()
+                print line2.strip()
+                sys.exit(1)
+
+            if ">" in line1:
+                try:
+                    (linker1, readid1) = line1.split("_")
+                    (linker2, readid2) = line2.split("_")
+                except ValueError:
+                    # only malformed headers are handled here; a bare except
+                    # would also swallow the SystemExit raised below
+                    print line1.strip()
+                    print line2.strip()
+                    failed = True
+                    continue
+
+                shortid1 = readid1.split("/")[0]
+                shortid2 = readid2.split("/")[0]
+                if shortid1 != shortid2:
+                    print shortid1, shortid2
+                    sys.exit(1)
+
+                continue
+
+            if "NA" in linker1 or "NA" in linker2:
+                hasNA += 1
+                target = outNA
+            elif linker1 == linker2:
+                if "L1" in linker1:
+                    same1 += 1
+                    target = outsame1
+                else:
+                    same2 += 1
+                    target = outsame2
+            else:
+                mixed += 1
+                target = outmixed
+
+            target.write("%s_%s%s" % (linker1, readid1, line1))
+            target.write("%s_%s%s" % (linker2, readid2, line2))
+
+        print same1
+        print same2
+        print mixed
+        print hasNA
+    finally:
+        for handle in (outmixed, outNA, outsame1, outsame2, infile2, infile1):
+            handle.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+
+def main(argv=None):
+    """Command-line entry point: python <script> linkerfile infile outfile.
+
+    Prints a usage message and exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # without this check a missing argument surfaced as a bare IndexError
+    if len(argv) < 4:
+        print "usage: python %s linkerfile infile outfile" % argv[0]
+        sys.exit(1)
+
+    linkerfile = argv[1]
+    infile = argv[2]
+    outfile = argv[3]
+
+    markLinkers(linkerfile, infile, outfile)
+
+
+def markLinkers(linkerFileName, inFileName, outFileName):
+    """Label every read of inFileName with the linker found in its sequence.
+
+    Lines shorter than two characters are ignored.  A line containing "@"
+    sets the current read ID (all "@" removed); any other line is treated as a
+    sequence.  For each linker whose stored prefix occurs at offset 19 or
+    later, a record ">L<last character of the linker ID>_<read ID>" followed
+    by the first 20 characters of the line is written to outFileName; when no
+    linker matches, the record is labelled ">NA_<read ID>".
+    """
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+    try:
+        linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName)
+
+        for line in infile:
+            if len(line) < 2:
+                continue
+
+            if "@" in line:
+                readID = line.strip().replace("@", "")
+                continue
+
+            found = False
+            for linkerID in linkerList:
+                if line.find(linkerDict[linkerID]) >= 19:
+                    found = True
+                    outfile.write(">L%s_%s\n" % (linkerID[-1:], readID))
+                    outfile.write("%s\n" % line[:20])
+
+            if not found:
+                outfile.write(">NA_%s\n" % readID)
+                outfile.write("%s\n" % line[:20])
+    finally:
+        # close explicitly so the output is flushed even when an error occurs
+        outfile.close()
+        infile.close()
+
+
+def getLinkerInformationFromFile(linkerFileName):
+    """Read linker IDs and sequence prefixes from the file linkerFileName.
+
+    Returns the (linkerDict, linkerList) pair produced by getLinkerInformation,
+    or an empty dict and an empty list when the file cannot be read.
+    """
+    try:
+        linkerfile = open(linkerFileName)
+        try:
+            return getLinkerInformation(linkerfile)
+        finally:
+            # the file is fully consumed by then; do not leave the handle open
+            linkerfile.close()
+    except IOError:
+        return {}, []
+
+
+def getLinkerInformation(linkerInformationList):
+    """Parse FASTA-style linker lines into a lookup table and an ordered list.
+
+    linkerInformationList -- iterable of lines; a line containing ">" names a
+                             linker (its first character is dropped), any
+                             other non-blank line is that linker's sequence.
+
+    Returns (linkerDict, linkerList): linkerDict maps each linker ID to the
+    first 10 characters of its sequence, linkerList holds the IDs in input
+    order.  Blank lines and sequences before the first header are ignored.
+    """
+    linkerDict = {}
+    linkerList = []
+    linkerID = None
+
+    for entry in linkerInformationList:
+        entry = entry.strip()
+        if not entry:
+            # a blank line used to overwrite the current linker's sequence with ""
+            continue
+
+        if ">" in entry:
+            linkerID = entry[1:]
+            linkerList.append(linkerID)
+        elif linkerID is not None:
+            linkerDict[linkerID] = entry[:10]
+
+    return linkerDict, linkerList
+
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+>linker_b.1
+GTTGGATAAGATATCGCGG
+>linker_b.2
+GTTGGAATGTATATCGCGG
\ No newline at end of file
--- /dev/null
+import sys
+
+def main(argv=None):
+    """Command-line entry point: python <script> linkerfile infile outfile.
+
+    Prints a usage message and exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # without this check a missing argument surfaced as a bare IndexError
+    if len(argv) < 4:
+        print "usage: python %s linkerfile infile outfile" % argv[0]
+        sys.exit(1)
+
+    linkerfile = argv[1]
+    infile = argv[2]
+    outfile = argv[3]
+
+    markLinkers(linkerfile, infile, outfile)
+
+
+def markLinkers(linkerFileName, inFileName, outFileName):
+    """Label every read of inFileName with the linker found in its sequence.
+
+    Lines shorter than two characters are ignored.  A line containing "@"
+    sets the current read ID (all "@" removed); any other line is treated as a
+    sequence.  For each linker whose stored prefix occurs at offset 19 or
+    later, a record ">L<last character of the linker ID>_<read ID>" followed
+    by the first 20 characters of the line is written to outFileName; when no
+    linker matches, the record is labelled ">NA_<read ID>".
+    """
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+    try:
+        linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName)
+
+        for line in infile:
+            if len(line) < 2:
+                continue
+
+            if "@" in line:
+                readID = line.strip().replace("@", "")
+                continue
+
+            found = False
+            for linkerID in linkerList:
+                if line.find(linkerDict[linkerID]) >= 19:
+                    found = True
+                    outfile.write(">L%s_%s\n" % (linkerID[-1:], readID))
+                    outfile.write("%s\n" % line[:20])
+
+            if not found:
+                outfile.write(">NA_%s\n" % readID)
+                outfile.write("%s\n" % line[:20])
+    finally:
+        # close explicitly so the output is flushed even when an error occurs
+        outfile.close()
+        infile.close()
+
+
+def getLinkerInformationFromFile(linkerFileName):
+    """Read linker IDs and sequence prefixes from the file linkerFileName.
+
+    Returns the (linkerDict, linkerList) pair produced by getLinkerInformation,
+    or an empty dict and an empty list when the file cannot be read.
+    """
+    try:
+        linkerfile = open(linkerFileName)
+        try:
+            return getLinkerInformation(linkerfile)
+        finally:
+            # the file is fully consumed by then; do not leave the handle open
+            linkerfile.close()
+    except IOError:
+        return {}, []
+
+
+def getLinkerInformation(linkerInformationList):
+    """Parse FASTA-style linker lines into a lookup table and an ordered list.
+
+    linkerInformationList -- iterable of lines; a line containing ">" names a
+                             linker (its first character is dropped), any
+                             other non-blank line is that linker's sequence.
+
+    Returns (linkerDict, linkerList): linkerDict maps each linker ID to the
+    first 10 characters of its sequence, linkerList holds the IDs in input
+    order.  Blank lines and sequences before the first header are ignored.
+    """
+    linkerDict = {}
+    linkerList = []
+    linkerID = None
+
+    for entry in linkerInformationList:
+        entry = entry.strip()
+        if not entry:
+            # a blank line used to overwrite the current linker's sequence with ""
+            continue
+
+        if ">" in entry:
+            linkerID = entry[1:]
+            linkerList.append(linkerID)
+        elif linkerID is not None:
+            linkerDict[linkerID] = entry[:10]
+
+    return linkerDict, linkerList
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+
+
+def main(argv=None):
+    """Command-line entry point: python <script> infile1 infile2 outprefix.
+
+    Prints a usage message and exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # without this check a missing argument surfaced as a bare IndexError
+    if len(argv) < 4:
+        print "usage: python %s infile1 infile2 outprefix" % argv[0]
+        sys.exit(1)
+
+    infile1 = argv[1]
+    infile2 = argv[2]
+    outprefix = argv[3]
+
+    segregateLinkers(infile1, infile2, outprefix)
+
+
+def segregateLinkers(infile1name, infile2name, outprefix):
+    """Sort paired linker-labelled reads into four FASTA files by linker pair.
+
+    The two inputs are read in lockstep.  A line containing ">" is a header of
+    the form "<linker>_<read id>"; the read IDs up to the first "/" must agree
+    between both files, otherwise the two IDs are printed and the program
+    exits with status 1.  Each following line pair is written, prefixed with
+    its header, to one of
+
+        <outprefix>.NA.fa     either linker label contains "NA"
+        <outprefix>.same1.fa  identical labels containing "L1"
+        <outprefix>.same2.fa  identical labels without "L1"
+        <outprefix>.mixed.fa  different labels
+
+    and the four counts (same1, same2, mixed, NA) are printed at the end.  A
+    header that does not split into exactly two parts is printed together with
+    the following line pair before exiting with status 1.
+    """
+    infile1 = open(infile1name)
+    infile2 = open(infile2name)
+    same1 = 0
+    same2 = 0
+    mixed = 0
+    hasNA = 0
+
+    outsame1 = open("%s.same1.fa" % outprefix, "w")
+    outsame2 = open("%s.same2.fa" % outprefix, "w")
+    outNA = open("%s.NA.fa" % outprefix, "w")
+    outmixed = open("%s.mixed.fa" % outprefix, "w")
+
+    try:
+        failed = False
+        for line1 in infile1:
+            line2 = infile2.readline()
+            if failed:
+                # the previous header could not be parsed: show context and stop
+                line2 = infile2.readline()
+                print line1.strip()
+                print line2.strip()
+                sys.exit(1)
+
+            if ">" in line1:
+                try:
+                    (linker1, readid1) = line1.split("_")
+                    (linker2, readid2) = line2.split("_")
+                except ValueError:
+                    # only malformed headers are handled here; a bare except
+                    # would also swallow the SystemExit raised below
+                    print line1.strip()
+                    print line2.strip()
+                    failed = True
+                    continue
+
+                shortid1 = readid1.split("/")[0]
+                shortid2 = readid2.split("/")[0]
+                if shortid1 != shortid2:
+                    print shortid1, shortid2
+                    sys.exit(1)
+
+                continue
+
+            if "NA" in linker1 or "NA" in linker2:
+                hasNA += 1
+                target = outNA
+            elif linker1 == linker2:
+                if "L1" in linker1:
+                    same1 += 1
+                    target = outsame1
+                else:
+                    same2 += 1
+                    target = outsame2
+            else:
+                mixed += 1
+                target = outmixed
+
+            target.write("%s_%s%s" % (linker1, readid1, line1))
+            target.write("%s_%s%s" % (linker2, readid2, line2))
+
+        print same1
+        print same2
+        print mixed
+        print hasNA
+    finally:
+        for handle in (outmixed, outNA, outsame1, outsame2, infile2, infile1):
+            handle.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sqlite3 as sqlite
+import sys
+import tempfile, shutil, os, optparse
+from os import environ
+
+if environ.get("CISTEMATIC_TEMP"):
+ cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+ cisTemp = "/tmp"
+tempfile.tempdir = cisTemp
+
+# "%prog" is only expanded by optparse, so name the running script explicitly
+print "version 3.3: %s" % sys.argv[0]
+
+
+def main(argv=None):
+    """Command-line entry point: filter or annotate SNPs by repeat content.
+
+    Needs three positional arguments (repeats database, SNP file, output file)
+    and accepts --cache numPages and --repeats.  Prints the usage string and
+    exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # optparse substitutes "%prog" with the script name; "%s" was shown verbatim
+    usage = "usage: python %prog dbfile snpsfile nr_snps_outfile [--cache numPages] [--repeats]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--repeats", action="store_true", dest="repeats")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(repeats=False, cachePages=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    dbfile = args[0]
+    filename = args[1]
+    outfile = args[2]
+
+    chkSNPrmask(dbfile, filename, outfile, options.repeats, options.cachePages)
+
+
+def chkSNPrmask(dbfile, filename, outfile, repeats=False, cachePages=None):
+    """Check the SNPs of filename against the repeats table of dbfile.
+
+    Each non-comment line of filename is split on tabs; field 2 without its
+    first three characters is the chromosome and field 3 the position.  With
+    repeats=False every SNP lying inside a repeat is dropped; with
+    repeats=True all SNPs are kept and an extra column plus the matching
+    repeat families are appended.  The surviving lines are written to outfile.
+
+    cachePages -- when given (raised to at least 250000), the database is
+                  first copied to a temporary file that is removed afterwards.
+    """
+    print dbfile
+
+    if cachePages is not None:
+        if cachePages < 250000:
+            cachePages = 250000
+
+        print "caching locally..."
+        # mkstemp creates the file atomically, unlike the race-prone mktemp
+        (handle, cachefile) = tempfile.mkstemp(suffix=".db")
+        os.close(handle)
+        shutil.copyfile(dbfile, cachefile)
+        db = sqlite.connect(cachefile)
+        doCache = True
+        print "cached..."
+    else:
+        cachePages = 500000
+        doCache = False
+        db = sqlite.connect(dbfile)
+
+    sql = db.cursor()
+    sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+    sql.execute("PRAGMA temp_store = MEMORY")
+    sql.execute("ANALYZE")
+
+    featureList = []
+    featureDict = {}
+    infile = open(filename)
+    try:
+        for line in infile:
+            if doNotProcessLine(line):
+                continue
+
+            fields = line.strip().split("\t")
+            chrom = fields[2][3:]
+            pos = int(fields[3])
+            featureList.append((chrom, pos))
+            featureDict[(chrom, pos)] = line.strip()
+    finally:
+        infile.close()
+
+    featureList.sort()
+
+    index = 0
+    currentChrom = None
+    for (chrom, pos) in featureList:
+        index += 1
+        if chrom != currentChrom:
+            print "\n%s" % chrom
+            currentChrom = chrom
+
+        results = []
+        try:
+            sql.execute("select family from repeats where chrom = ? and ? between start and stop", (chrom, pos))
+            results = sql.fetchall()
+        except sqlite.Error:
+            # best effort: a failed lookup counts as "not inside a repeat"
+            pass
+
+        if repeats: # if user wants to keep track of the SNPs in repeats
+            featureDict[(chrom, pos)] += "\tN\A"
+            for x in results:
+                featureDict[(chrom, pos)] += "\t" + str(x)
+        elif results:
+            # may already be gone when the same position occurs twice
+            featureDict.pop((chrom, pos), None)
+
+        if index % 100 == 0:
+            print ".",
+            sys.stdout.flush()
+
+    # close explicitly; "del db" does not guarantee the file is released
+    db.close()
+    if doCache:
+        print "removing cache"
+        os.remove(cachefile)
+
+    outFile = open(outfile, "w")
+    for value in featureDict.itervalues():
+        outFile.write(str(value) + "\n")
+
+    outFile.close()
+
+
+def doNotProcessLine(line):
+    """Return True for comment lines, i.e. lines whose first character is "#"."""
+    # startswith() is also safe on an empty string, where line[0] raised IndexError
+    return line.startswith("#")
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import optparse
+import tempfile
+import shutil
+import os
+import string
+import sqlite3 as sqlite
+
+print "version 3.6: %s" % sys.argv[0]
+
+
+def main(argv=None):
+    """Command-line entry point: annotate a SNP file from SNP databases.
+
+    Needs three positional arguments (SNP database, SNP file, output file) and
+    accepts --cache numPages and any number of --snpDB dbfile options.  Prints
+    the usage string and exits with status 1 when arguments are missing.
+    """
+    argv = argv or sys.argv
+
+    usage = "usage: python %prog dbfile snpsfile dbsnp_outfile [--cache numPages] [--snpDB dbfile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", type="int", dest="cachePages", default=None)
+    parser.add_option("--snpDB", action="append", dest="snpDBList", default=[],
+                      help="additional snp db files to check will be searched in order given")
+    options, positional = parser.parse_args(argv[1:])
+
+    if len(positional) < 3:
+        print usage
+        sys.exit(1)
+
+    dbfile, infile, outfile = positional[:3]
+
+    chkSNPFile(dbfile, infile, outfile, options.cachePages, options.snpDBList)
+
+
+def chkSNPFile(dbfile, inputFileName, outputFileName, cachePages=None, snpDBList=[]):
+    """Annotate the SNPs listed in inputFileName and write them to outputFileName.
+
+    dbfile is searched first, followed by the databases in snpDBList in the
+    given order.  One annotated entry is written per line.
+    """
+    snpInputFile = open(inputFileName)
+    try:
+        snpLocationList, snpDict = getSNPLocationInfo(snpInputFile)
+    finally:
+        snpInputFile.close()
+
+    # snpDBList is only read here, so the shared default list is never modified
+    dbList = [dbfile] + list(snpDBList)
+
+    annotatedSnpDict = annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages)
+
+    outputFile = open(outputFileName, "w")
+    try:
+        for value in annotatedSnpDict.itervalues():
+            outputFile.write("%s\n" % str(value))
+    finally:
+        outputFile.close()
+
+
+def chkSNP(dbList, snpPropertiesList, cachePages=None):
+    """Annotate SNP lines held in memory against the databases in dbList.
+
+    Returns the dictionary produced by annotateSNPFromDBList, keyed by
+    (chromosome, position).
+    """
+    locations, entries = getSNPLocationInfo(snpPropertiesList)
+    annotated = annotateSNPFromDBList(locations, entries, dbList, cachePages)
+
+    return annotated
+
+
+def getSNPLocationInfo(snpPropertiesList):
+    """Collect the location of every SNP line.
+
+    Comment lines are skipped.  Each remaining line is split on tabs; field 2
+    without its first three characters is the chromosome and field 3 the
+    integer position.
+
+    Returns (snpLocationList, snpDict): the sorted list of
+    (chromosome, position) tuples and a dictionary mapping each tuple to its
+    stripped input line.
+    """
+    snpDict = {}
+    locations = []
+
+    for rawLine in snpPropertiesList:
+        if doNotProcessLine(rawLine):
+            continue
+
+        entry = rawLine.strip()
+        columns = entry.split("\t")
+        location = (columns[2][3:], int(columns[3]))
+        locations.append(location)
+        snpDict[location] = entry
+
+    return sorted(locations), snpDict
+
+
+def doNotProcessLine(line):
+    """Return True for comment lines, i.e. lines whose first character is "#"."""
+    # startswith() is also safe on an empty string, where line[0] raised IndexError
+    return line.startswith("#")
+
+
+def annotateSNPFromDB(snpLocationList, snpDict, dbFileName, cachePages=None):
+    """Annotate SNPs from a single database file; see annotateSNPFromDBList."""
+    singleDbList = [dbFileName]
+
+    return annotateSNPFromDBList(snpLocationList, snpDict, singleDbList, cachePages)
+
+
+def annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=None):
+    """Append the SNP name and function found in the databases to each entry.
+
+    snpLocationList -- list of (chromosome, position) tuples; locations that
+                       were found are removed from this list in place.
+    snpDict         -- maps each location to its text entry; updated in place
+                       and returned.
+    dbList          -- database files, searched in order; a location found in
+                       one database is not looked up in later ones.
+    cachePages      -- when given, each database is copied to a temporary file
+                       before use and the copy is removed afterwards.
+
+    A location matches a snp row with start = position - 1 and stop = position,
+    or failing that a row spanning both.  Found entries get the tab-separated
+    name and func appended; all others get two placeholder columns.
+    """
+    if os.environ.get("CISTEMATIC_TEMP"):
+        cisTemp = os.environ.get("CISTEMATIC_TEMP")
+    else:
+        cisTemp = "/tmp"
+
+    tempfile.tempdir = cisTemp
+
+    # bound parameters instead of string-built SQL
+    exactQuery = "select func, name from snp where chrom = ? and start = ? and stop = ?"
+    spanQuery = "select func, name from snp where chrom = ? and start <= ? and stop >= ?"
+
+    for dbFileName in dbList:
+        if cachePages is not None:
+            print "caching locally..."
+            # mkstemp creates the file atomically, unlike the race-prone mktemp
+            (handle, cachefile) = tempfile.mkstemp(suffix=".db")
+            os.close(handle)
+            shutil.copyfile(dbFileName, cachefile)
+            db = sqlite.connect(cachefile)
+            doCache = True
+            print "cached..."
+        else:
+            db = sqlite.connect(dbFileName)
+            doCache = False
+
+        cacheSize = max(cachePages or 0, 500000)
+        sql = db.cursor()
+        sql.execute("PRAGMA CACHE_SIZE = %d" % cacheSize)
+        sql.execute("PRAGMA temp_store = MEMORY")
+
+        index = 0
+        foundEntries = set()
+        for chromosomePosition in snpLocationList:
+            (chromosome, position) = chromosomePosition
+            index += 1
+            startPosition = position - 1
+            sql.execute(exactQuery, (chromosome, startPosition, position))
+            results = sql.fetchall()
+            if not results:
+                sql.execute(spanQuery, (chromosome, startPosition, position))
+                results = sql.fetchall()
+
+            if results:
+                (func, name) = results[0]
+                snpEntry = snpDict[chromosomePosition]
+                snpDict[chromosomePosition] = string.join([snpEntry, str(name), str(func)], "\t")
+                foundEntries.add(chromosomePosition)
+
+            if index % 100 == 0:
+                print ".",
+                sys.stdout.flush()
+
+        # one pass with a set replaces a list.index() scan per found entry;
+        # slice assignment keeps shrinking the caller's list as before
+        snpLocationList[:] = [location for location in snpLocationList if location not in foundEntries]
+
+        # close explicitly; "del db" does not guarantee the file is released
+        db.close()
+        if doCache:
+            print "\nremoving cache"
+            os.remove(cachefile)
+
+    for chromosomePosition in snpLocationList:
+        snpEntry = snpDict[chromosomePosition]
+        snpDict[chromosomePosition] = string.join([snpEntry, "N\A", "N\A"], "\t")
+
+    return snpDict
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+
+def main(argv=None):
+    """Command-line entry point: print the sum of one column of a file.
+
+    Takes the zero-based field number and the file name; prints a usage
+    message and exits with status 1 when either is missing.
+    """
+    argv = argv or sys.argv
+
+    print "version 1.2"
+    if len(argv) < 3:
+        print "usage: python %s field filename" % argv[0]
+        print "\n\tfields are counted starting at zero.\n"
+        sys.exit(1)
+
+    fieldID, filename = int(argv[1]), argv[2]
+    print colsum(fieldID, filename)
+
+
+def colsum(fieldID, filename):
+    """Return the sum of whitespace-separated field fieldID over all lines.
+
+    Fields containing "." are added as floats, all others as integers.  Lines
+    whose field is not numeric, or that have too few fields (blank lines
+    included), are skipped.
+    """
+    count = 0
+    infile = open(filename)
+    try:
+        for line in infile:
+            fields = line.strip().split()
+            try:
+                field = fields[fieldID]
+                if "." in field:
+                    count += float(field)
+                else:
+                    count += int(field)
+            except (ValueError, IndexError):
+                # IndexError: short or blank line, which used to abort the sum
+                continue
+    finally:
+        infile.close()
+
+    return count
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# combineRPKMS.py
+# ENRAGE
+#
+
+print 'version 1.0'
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+
+
+def main(argv=None):
+    """Command-line entry point: combine three RPKM files into one table.
+
+    Needs four positional arguments (first, expanded and final RPKM files plus
+    the output file) and accepts --withmultifraction.  Prints the usage string
+    and exits with status 1 when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]"
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
+    parser.set_defaults(doFraction=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    # four arguments are read below; checking for three let args[3] raise IndexError
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    firstfile = args[0]
+    expandedfile = args[1]
+    finalfile = args[2]
+    outfile = args[3]
+
+    combineRPKMs(firstfile, expandedfile, finalfile, outfile, options.doFraction)
+
+
+def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
+    """Merge the first, expanded and final RPKM files into one tab-separated table.
+
+    The first and expanded files are keyed by their second field and
+    contribute their last field; the expanded file also supplies the gid
+    (first field).  Every line of the final file (gene, RNAkb, finalRPKM and,
+    with doFraction, the multiread fraction) yields one output row after a
+    header line.  A gene missing from the first file gets an empty firstRPKM.
+    """
+    firstfile = open(firstfileName)
+    expandedfile = open(expandedfileName)
+    finalfile = open(finalfileName)
+    outfile = open(outfileName, "w")
+
+    firstDict = {}
+    for record in firstfile:
+        parts = record.split()
+        firstDict[parts[1]] = parts[-1]
+
+    firstfile.close()
+
+    gidDict = {}
+    expandedDict = {}
+    for record in expandedfile:
+        parts = record.split()
+        expandedDict[parts[1]] = parts[-1]
+        gidDict[parts[1]] = parts[0]
+
+    expandedfile.close()
+
+    columns = ["gid", "RNAkb", "gene", "firstRPKM", "expandedRPKM", "finalRPKM"]
+    if doFraction:
+        columns.append("fractionMulti")
+
+    outfile.write("\t".join(columns) + "\n")
+
+    for record in finalfile:
+        parts = record.split()
+        gene = parts[0]
+        rnakb = parts[1]
+        finalRPKM = parts[2]
+        row = [gidDict[gene], rnakb, gene, firstDict.get(gene, ""), expandedDict[gene], finalRPKM]
+        if doFraction:
+            row.append(parts[3])
+
+        outfile.write("\t".join(row) + '\n')
+
+    finalfile.close()
+    outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# combinerds.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+from commoncode import readDataset
+
+print '%s: version 1.1' % sys.argv[0]
+
+
+def main(argv=None):
+    """Command-line entry point: import one or more RDS files into another.
+
+    argv[1] is the destination dataset; the following arguments up to the
+    first one starting with "-" are the input datasets.  Recognised options:
+    --table name, --init, --initrna, --index, --cache pages, --flag value.
+    Prints a usage message and exits with status 1 when no destination is
+    given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 2:
+        print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0]
+        #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n'
+        sys.exit(1)
+
+    # Each option value is looked up under the same "--name" that is tested
+    # for; the former single-dash lookups never matched, so --cache was
+    # silently ignored and --flag / --table raised ValueError.
+    doCache = False
+    cachePages = -1
+    if '--cache' in argv:
+        doCache = True
+        try:
+            cachePages = int(argv[argv.index('--cache') + 1])
+        except (IndexError, ValueError):
+            # missing or non-numeric page count: keep the default cache size
+            pass
+
+    datafile = argv[1]
+    infileList = []
+    for name in argv[2:]:
+        if name.startswith('-'):
+            break
+        infileList.append(name)
+
+    print "destination RDS: %s" % datafile
+
+    if '--initrna' in argv:
+        rds = readDataset(datafile, initialize=True, datasetType='RNA')
+    elif '--init' in argv:
+        rds = readDataset(datafile, initialize=True)
+
+    withFlag = ''
+    if '--flag' in argv:
+        withFlag = argv[argv.index('--flag') + 1]
+        print "restrict to flag = %s" % withFlag
+
+    rds = readDataset(datafile, verbose=True, cache=doCache)
+
+    if cachePages > rds.getDefaultCacheSize():
+        rds.setDBcache(cachePages)
+        cacheVal = cachePages
+    else:
+        cacheVal = rds.getDefaultCacheSize()
+
+    doIndex = False
+    if '--index' in argv:
+        doIndex = True
+
+    tableList = []
+    if '--table' in argv:
+        tableList.append(argv[argv.index('--table') + 1])
+    else:
+        tableList = rds.getTables()
+
+    combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache)
+
+
+def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False):
+    """Import the tables in tableList from every file of infileList into rds.
+
+    Imports are numbered consecutively, continuing from the "numberImports"
+    metadata entry of rds (created as 0 if absent).  Read IDs of the uniqs,
+    multi and splices tables are prefixed with "input<number>"; metadata
+    values get " (import_<number>)" appended and are imported without the
+    flag restriction.  Afterwards "numberImports" is updated, the index is
+    rebuilt when doIndex is set and the cache is saved to datafile when
+    doCache is set.
+    """
+    readColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch"
+    spliceColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch"
+
+    metaDict = rds.getMetadata()
+    if "numberImports" in metaDict:
+        origIndex = int(metaDict["numberImports"])
+    else:
+        origIndex = 0
+        rds.insertMetadata([("numberImports", str(0))])
+
+    for offset, inputfile in enumerate(infileList):
+        importNumber = origIndex + offset
+        asname = "input%d" % importNumber
+        rds.attachDB(inputfile, asname)
+        for table in tableList:
+            print "importing table %s from file %s" % (table, inputfile)
+            if table == "metadata":
+                rds.importFromDB(asname, table, "name, value || ' (import_%d)'" % importNumber)
+                continue
+
+            if table in ("uniqs", "multi"):
+                ascols = readColumns % asname
+            elif table == "splices":
+                ascols = spliceColumns % asname
+            else:
+                ascols = "*"
+
+            rds.importFromDB(asname, table, ascols, withFlag)
+
+        rds.detachDB(asname)
+        rds.insertMetadata([("import_%d" % importNumber, "%s %s" % (inputfile, str(tableList)))])
+
+    index = origIndex + len(infileList)
+    rds.updateMetadata("numberImports", index, origIndex)
+
+    if doIndex:
+        print "building index...."
+        if cacheVal > 0:
+            rds.buildIndex(cacheVal)
+        else:
+            rds.buildIndex()
+
+    if doCache:
+        rds.saveCacheDB(datafile)
+
+
+# Script entry point: when the file is executed directly, hand the raw command line to main().
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# commoncode.py
+# ENRAGE
+#
+
+import tempfile
+import shutil
+import os
+from os import environ
+import string
+import sqlite3 as sqlite
+from time import strftime
+from array import array
+from collections import defaultdict
+
+commoncodeVersion = 5.5
+currentRDSversion = 1.1
+
+if environ.get("CISTEMATIC_TEMP"):
+ cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+ cisTemp = "/tmp"
+
+tempfile.tempdir = cisTemp
+
+
+def getReverseComplement(base):
+ revComp = {"A": "T",
+ "T": "A",
+ "G": "C",
+ "C": "G",
+ "N": "N"
+ }
+
+ return revComp[base]
+
+
+def countDuplicatesInList(listToCheck):
+ tally = defaultdict(int)
+ for item in listToCheck:
+ tally[item] += 1
+
+ return tally.items()
+
+
+def writeLog(logFile, messenger, message):
+ """ create a log file to write a message from a messenger or append to an existing file.
+ """
+ try:
+ logfile = open(logFile)
+ except IOError:
+ logfile = open(logFile, "w")
+ else:
+ logfile = open(logFile, "a")
+
+ logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
+ logfile.close()
+
+
+def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
+ fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
+ doMerge=True, keepPeak=False, returnTop=0):
+
+ """ returns a list of merged overlapping regions;
+ can optionally filter regions that have a scoreField fewer than minHits.
+ Can also optionally return the label of each region, as well as the
+ peak, if supplied (peakPos and peakHeight should be the last 2 fields).
+ Can return the top regions based on score if higher than minHits.
+ """
+ infile = open(regionfilename)
+ lines = infile.readlines()
+ regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
+ fullChrom, chromField, scoreField, pad, compact,
+ doMerge, keepPeak, returnTop)
+
+ infile.close()
+
+ return regions
+
+
+def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
+ fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
+ doMerge=True, keepPeak=False, returnTop=0):
+ """ returns a list of merged overlapping regions;
+ can optionally filter regions that have a scoreField fewer than minHits.
+ Can also optionally return the label of each region, as well as the
+ peak, if supplied (peakPos and peakHeight should be the last 2 fields).
+ Can return the top regions based on score if higher than minHits.
+ """
+ # result: chromosome name -> list of region tuples; the tuple layout depends on
+ # keepLabel / keepPeak: ([label,] start, stop, length[, peakPos, peakHeight])
+ regions = {}
+ hasPvalue = 0
+ hasShift = 0
+ # top-N mode: pre-scan every score and raise minHits to the returnTop-th highest one
+ if 0 < returnTop < len(regionList):
+ scores = []
+ for regionEntry in regionList:
+ # header lines start with '#'; they may announce optional trailing columns
+ if regionEntry[0] == "#":
+ if "pvalue" in regionEntry:
+ hasPvalue = 1
+
+ if "readShift" in regionEntry:
+ hasShift = 1
+
+ continue
+
+ fields = regionEntry.strip().split("\t")
+ hits = float(fields[scoreField].strip())
+ scores.append(hits)
+
+ scores.sort()
+ returnTop = -1 * returnTop
+ minScore = scores[returnTop]
+ if minScore > minHits:
+ minHits = minScore
+
+ mergeCount = 0
+ chromField = int(chromField)
+ count = 0
+ #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
+ # exits at the first merge. This is not a problem when the input is sorted by start position, but in
+ # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
+ # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
+ # be merged with A but that will exit the loop and the merge with C will be missed.
+ for regionEntry in regionList:
+ if regionEntry[0] == "#":
+ if "pvalue" in regionEntry:
+ hasPvalue = 1
+
+ if "readShift" in regionEntry:
+ hasShift = 1
+
+ continue
+
+ fields = regionEntry.strip().split("\t")
+ # score filter; lines whose score column is missing or not numeric are skipped
+ if minHits >= 0:
+ try:
+ hits = float(fields[scoreField].strip())
+ except (IndexError, ValueError):
+ continue
+
+ if hits < minHits:
+ continue
+
+ # compact format: the chromField column holds "chrom:start-stop"; pad is not applied here
+ # NOTE(review): label is not assigned on this path, so keepLabel combined with compact
+ # looks like it would use an unset or stale label - confirm
+ if compact:
+ (chrom, pos) = fields[chromField].split(":")
+ (front, back) = pos.split("-")
+ start = int(front)
+ stop = int(back)
+ elif chromField > 1:
+ # everything before the chromosome column becomes the label
+ label = string.join(fields[:chromField],"\t")
+ chrom = fields[chromField]
+ start = int(fields[chromField + 1]) - pad
+ stop = int(fields[chromField + 2]) + pad
+ else:
+ label = fields[0]
+ chrom = fields[1]
+ start = int(fields[2]) - pad
+ stop = int(fields[3]) + pad
+
+ # drop the first three characters of the chromosome name (presumably a "chr" prefix)
+ if not fullChrom:
+ chrom = chrom[3:]
+
+ length = abs(stop - start)
+ # peak position and height are the last two fields, shifted left by one column
+ # for each of the optional pvalue and readShift columns
+ if keepPeak:
+ peakPos = int(fields[-2 - hasPvalue - hasShift])
+ peakHeight = float(fields[-1 - hasPvalue - hasShift])
+
+ if chrom not in regions:
+ regions[chrom] = []
+
+ merged = False
+
+ # merge into the first stored region that overlaps or lies within maxDist, then stop searching
+ if doMerge and len(regions[chrom]) > 0:
+ for index in range(len(regions[chrom])):
+ if keepLabel and keepPeak:
+ (rlabel, rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
+ elif keepLabel:
+ (rlabel, rstart, rstop, rlen) = regions[chrom][index]
+ elif keepPeak:
+ (rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
+ else:
+ (rstart, rstop, rlen) = regions[chrom][index]
+
+ if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
+ # grow the stored region so that it covers the incoming one
+ if start < rstart:
+ rstart = start
+
+ if rstop < stop:
+ rstop = stop
+
+ rlen = abs(rstop - rstart)
+ # the merged region keeps the higher of the two peaks
+ if keepPeak:
+ if peakHeight > rpeakHeight:
+ rpeakHeight = peakHeight
+ rpeakPos = peakPos
+
+ # the merged region is stored under the label of the incoming entry, not rlabel
+ if keepLabel and keepPeak:
+ regions[chrom][index] = (label, rstart, rstop, rlen, rpeakPos, rpeakHeight)
+ elif keepLabel:
+ regions[chrom][index] = (label, rstart, rstop, rlen)
+ elif keepPeak:
+ regions[chrom][index] = (rstart, rstop, rlen, rpeakPos, rpeakHeight)
+ else:
+ regions[chrom][index] = (rstart, rstop, rlen)
+
+ mergeCount += 1
+ merged = True
+ break
+
+ if not merged:
+ if keepLabel and keepPeak:
+ regions[chrom].append((label, start, stop, length, peakPos, peakHeight))
+ elif keepLabel:
+ regions[chrom].append((label, start, stop, length))
+ elif keepPeak:
+ regions[chrom].append((start, stop, length, peakPos, peakHeight))
+ else:
+ regions[chrom].append((start, stop, length))
+
+ count += 1
+
+ if verbose and (count % 100000 == 0):
+ print count
+
+ regionCount = 0
+ for chrom in regions:
+ regionCount += len(regions[chrom])
+ # with a label in position 0, order by the start coordinate instead of by label
+ if keepLabel:
+ regions[chrom].sort(cmp=lambda x,y:cmp(x[1], y[1]))
+ else:
+ regions[chrom].sort()
+
+ if verbose:
+ print "merged %d times" % mergeCount
+ print "returning %d regions" % regionCount
+
+ return regions
+
+
+def regionsOverlap(start, stop, rstart, rstop):
+ if start > stop:
+ (start, stop) = (stop, start)
+
+ if rstart > rstop:
+ (rstart, rstop) = (rstop, rstart)
+
+ return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
+
+
+def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
+ if start > stop:
+ (start, stop) = (stop, start)
+
+ if rstart > rstop:
+ (rstart, rstop) = (rstop, rstart)
+
+ return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
+
+
+def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
+ shift=0, returnShift=False, maxshift=75):
+ """ find the peak in a list of reads (hitlist) in a region
+ of a given length and absolute start point. returns a
+ list of peaks, the number of hits, a triangular-smoothed
+ version of hitlist, and the number of reads that are
+ forward (plus) sense.
+ If doWeight is True, weight the reads accordingly.
+ If leftPlus is True, return the number of plus reads left of
+ the peak, taken to be the first TopPos position.
+ """
+
+ seqArray = array("f", [0.] * length)
+ smoothArray = array("f", [0.] * length)
+ numHits = 0.
+ numPlus = 0.
+ regionArray = []
+ if shift == "auto":
+ shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+
+ # once we have the best shift, compute seqArray
+ for read in hitList:
+ currentpos = read[0] - start
+ if read[1] == "+":
+ currentpos += shift
+ else:
+ currentpos -= shift
+
+ if (currentpos < 1 - readlen) or (currentpos >= length):
+ continue
+
+ hitIndex = 0
+ if doWeight:
+ weight = read[2]
+ else:
+ weight = 1.0
+
+ numHits += weight
+ if leftPlus:
+ regionArray.append(read)
+
+ while currentpos < 0:
+ hitIndex += 1
+ currentpos += 1
+
+ while hitIndex < readlen and currentpos < length:
+ seqArray[currentpos] += weight
+ hitIndex += 1
+ currentpos += 1
+
+ if read[1] == "+":
+ numPlus += weight
+
+ # implementing a triangular smooth
+ for pos in range(2,length -2):
+ smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
+
+ topNucleotide = 0
+ topPos = []
+ for currentpos in xrange(length):
+ if topNucleotide < smoothArray[currentpos]:
+ topNucleotide = smoothArray[currentpos]
+ topPos = [currentpos]
+ elif topNucleotide == smoothArray[currentpos]:
+ topPos.append(currentpos)
+
+ if leftPlus:
+ numLeftPlus = 0
+ maxPos = topPos[0]
+ for read in regionArray:
+ if doWeight:
+ weight = read[2]
+ else:
+ weight = 1.0
+
+ currentPos = read[0] - start
+ if currentPos <= maxPos and read[1] == "+":
+ numLeftPlus += weight
+
+ if returnShift:
+ return (topPos, numHits, smoothArray, numPlus, numLeftPlus, shift)
+ else:
+ return (topPos, numHits, smoothArray, numPlus, numLeftPlus)
+ else:
+ if returnShift:
+ return (topPos, numHits, smoothArray, numPlus, shift)
+ else:
+ return (topPos, numHits, smoothArray, numPlus)
+
+
+def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+ bestShift = 0
+ lowestScore = 20000000000
+ for testShift in xrange(maxShift + 1):
+ shiftArray = array("f", [0.] * length)
+ for read in hitList:
+ currentpos = read[0] - start
+ if read[1] == "+":
+ currentpos += testShift
+ else:
+ currentpos -= testShift
+
+ if (currentpos < 1) or (currentpos >= length):
+ continue
+
+ if doWeight:
+ weight = read[2]
+ else:
+ weight = 1.0
+
+ if read[1] == "+":
+ shiftArray[currentpos] += weight
+ else:
+ shiftArray[currentpos] -= weight
+
+ currentScore = 0
+ for score in shiftArray:
+ currentScore += abs(score)
+
+ print currentScore
+ if currentScore < lowestScore:
+ bestShift = testShift
+ lowestScore = currentScore
+
+ return bestShift
+
+
+def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
+ restrictList=[], regionComplement=False, maxStop=250000000):
+ """ return a dictionary of cistematic gene features. Requires
+ cistematic, obviously. Can filter-out pseudogenes. Will use
+ additional regions dict to supplement gene models, if available.
+ Can restrict output to a list of GIDs.
+ If regionComplement is set to true, returns the regions *outside* of the
+ calculated boundaries, which is useful for retrieving intronic and
+ intergenic regions. maxStop is simply used to define the uppermost
+ boundary of the complement region.
+ """
+ # featuresDict maps a gene id to a list of (ftype, chrom, start, stop, sense) tuples,
+ # as unpacked further down
+ featuresDict = genomeObject.getallGeneFeatures()
+ restrictGID = False
+ if len(restrictList) > 0:
+ restrictGID = True
+
+ # fold the caller-supplied regions into featuresDict as "custom" features
+ if len(additionalRegionsDict) > 0:
+ sortList = []
+ for chrom in additionalRegionsDict:
+ for (label, start, stop, length) in additionalRegionsDict[chrom]:
+ if label not in sortList:
+ sortList.append(label)
+
+ # a new label gets sense "+"; a known one inherits the sense of its first feature
+ if label not in featuresDict:
+ featuresDict[label] = []
+ sense = "+"
+ else:
+ sense = featuresDict[label][0][-1]
+
+ featuresDict[label].append(("custom", chrom, start, stop, sense))
+
+ # keep the features of every touched gene ordered by start coordinate
+ for gid in sortList:
+ featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+ featuresByChromDict = {}
+ for gid in featuresDict:
+ if restrictGID and gid not in restrictList:
+ continue
+
+ featureList = featuresDict[gid]
+ newFeatureList = []
+ isPseudo = False
+ # build newFeatureList as (start, stop, ftype) entries: a feature lying inside a kept one
+ # is not added, and kept features strictly inside the new one are dropped
+ for (ftype, chrom, start, stop, sense) in featureList:
+ if ftype == "PSEUDO":
+ isPseudo = True
+
+ if (start, stop, ftype) not in newFeatureList:
+ notContained = True
+ containedList = []
+ for (fstart, fstop, ftype2) in newFeatureList:
+ if start >= fstart and stop <= fstop:
+ notContained = False
+
+ if start < fstart and stop > fstop:
+ containedList.append((fstart, fstop))
+
+ if len(containedList) > 0:
+ newFList = []
+ notContained = True
+ for (fstart, fstop, ftype2) in newFeatureList:
+ if (fstart, fstop) not in containedList:
+ newFList.append((fstart, fstop, ftype2))
+ if start >= fstart and stop <= fstop:
+ notContained = False
+
+ newFeatureList = newFList
+ if notContained:
+ newFeatureList.append((start, stop, ftype))
+
+ # a gene with any PSEUDO feature is dropped entirely when ignorePseudo is set
+ if ignorePseudo and isPseudo:
+ continue
+
+ # chrom and sense below are those of the last feature iterated for this gid
+ if chrom not in featuresByChromDict:
+ featuresByChromDict[chrom] = []
+
+ for (start, stop, ftype) in newFeatureList:
+ featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
+
+ for chrom in featuresByChromDict:
+ featuresByChromDict[chrom].sort()
+
+ # complement: the gaps between consecutive sorted features, plus a final region up to maxStop
+ if regionComplement:
+ complementByChromDict = {}
+ complementIndex = 0
+ for chrom in featuresByChromDict:
+ complementByChromDict[chrom] = []
+ listLength = len(featuresByChromDict[chrom])
+ if listLength > 0:
+ currentStart = 0
+ for index in range(listLength):
+ currentStop = featuresByChromDict[chrom][index][0]
+ complementIndex += 1
+ if currentStart < currentStop:
+ complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
+
+ currentStart = featuresByChromDict[chrom][index][1]
+
+ currentStop = maxStop
+ complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
+
+ # with regionComplement the result is a pair of dictionaries, otherwise a single one
+ return (featuresByChromDict, complementByChromDict)
+ else:
+ return featuresByChromDict
+
+
+def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+ additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
+ lengthCDS=0, keepSense=False, adjustToNeighbor=True):
+ """ return a dictionary of gene loci. Can be used to retrieve additional
+ sequence upstream or downstream of gene, up to the next gene. Requires
+ cistematic, obviously.
+ Can filter-out pseudogenes and use additional regions outside of existing
+ gene models. Use upstreamSpanTSS to overlap half of the upstream region
+ over the TSS.
+ If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
+ lengthCDS < 0bp, return only the last X bp from CDS.
+ """
+ locusByChromDict = {}
+ # reject option combinations that describe no sequence or a discontinuous one;
+ # each prints its reason and returns the empty dictionary
+ if upstream == 0 and downstream == 0 and not useCDS:
+ print "getLocusByChromDict: asked for no sequence - returning empty dict"
+ return locusByChromDict
+ elif upstream > 0 and downstream > 0 and not useCDS:
+ print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+ return locusByChromDict
+ elif lengthCDS != 0 and not useCDS:
+ print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+ return locusByChromDict
+ elif upstreamSpanTSS and lengthCDS != 0:
+ print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+ return locusByChromDict
+ elif lengthCDS > 0 and downstream > 0:
+ print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+ return locusByChromDict
+ elif lengthCDS < 0 and upstream > 0:
+ print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+ return locusByChromDict
+
+ genome = genomeObject.genome
+ # featuresDict maps a gene id to a list of (ftype, chrom, start, stop, sense) tuples
+ featuresDict = genomeObject.getallGeneFeatures()
+ # fold the caller-supplied regions into featuresDict as "custom" features
+ if len(additionalRegionsDict) > 0:
+ sortList = []
+ for chrom in additionalRegionsDict:
+ for (label, start, stop, length) in additionalRegionsDict[chrom]:
+ if label not in sortList:
+ sortList.append(label)
+
+ # a new label gets sense "+"; a known one inherits the sense of its first feature
+ if label not in featuresDict:
+ featuresDict[label] = []
+ sense = "+"
+ else:
+ sense = featuresDict[label][0][-1]
+
+ featuresDict[label].append(("custom", chrom, start, stop, sense))
+
+ for gid in sortList:
+ featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+ for gid in featuresDict:
+ featureList = featuresDict[gid]
+ newFeatureList = []
+ for (ftype, chrom, start, stop, sense) in featureList:
+ newFeatureList.append((start, stop))
+
+ # NOTE(review): presumably evaluated once per gene after the loop above, so ftype is
+ # the type of the last feature only - confirm against the original indentation
+ if ignorePseudo and ftype == "PSEUDO":
+ continue
+
+ newFeatureList.sort()
+
+ # the gene spans from the lowest feature start to the stop of the last sorted feature
+ sense = featureList[0][-1]
+ gstart = newFeatureList[0][0]
+ gstop = newFeatureList[-1][1]
+ glen = abs(gstart - gstop)
+ # sense "F": upstream is to the left (lower coordinates), downstream to the right
+ if sense == "F":
+ # without the CDS the locus collapses onto the relevant end of the gene
+ if not useCDS and upstream > 0:
+ if upstreamSpanTSS:
+ if gstop > (gstart + upstream / 2):
+ gstop = gstart + upstream / 2
+ else:
+ gstop = gstart
+ elif not useCDS and downstream > 0:
+ gstart = gstop
+
+ if upstream > 0:
+ if upstreamSpanTSS:
+ distance = upstream / 2
+ else:
+ distance = upstream
+
+ # when the reported neighbour distance is short, extend only half-way towards it,
+ # but always by at least 1
+ if adjustToNeighbor:
+ nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+ if nextGene < distance * 2:
+ distance = nextGene / 2
+
+ if distance < 1:
+ distance = 1
+
+ gstart -= distance
+
+ if downstream > 0:
+ distance = downstream
+ if adjustToNeighbor:
+ nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+ if nextGene < downstream * 2:
+ distance = nextGene / 2
+
+ if distance < 1:
+ distance = 1
+
+ gstop += distance
+
+ # partial CDS: keep only the first (positive) or last (negative) lengthCDS positions
+ if lengthCDS > 0:
+ if lengthCDS < glen:
+ gstop = newFeatureList[0][0] + lengthCDS
+
+ if lengthCDS < 0:
+ if abs(lengthCDS) < glen:
+ gstart = newFeatureList[-1][1] + lengthCDS
+ else:
+ # any other sense: the same steps mirrored, upstream to the right and downstream to the left
+ if not useCDS and upstream > 0:
+ if upstreamSpanTSS:
+ if gstart < (gstop - upstream / 2):
+ gstart = gstop - upstream / 2
+ else:
+ gstart = gstop
+ elif not useCDS and downstream > 0:
+ gstop = gstart
+
+ if upstream > 0:
+ if upstreamSpanTSS:
+ distance = upstream /2
+ else:
+ distance = upstream
+
+ if adjustToNeighbor:
+ nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+ if nextGene < distance * 2:
+ distance = nextGene / 2
+
+ if distance < 1:
+ distance = 1
+
+ gstop += distance
+
+ if downstream > 0:
+ distance = downstream
+ if adjustToNeighbor:
+ nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+ if nextGene < downstream * 2:
+ distance = nextGene / 2
+
+ if distance < 1:
+ distance = 1
+
+ gstart -= distance
+
+ if lengthCDS > 0:
+ if lengthCDS < glen:
+ gstart = newFeatureList[-1][-1] - lengthCDS
+
+ if lengthCDS < 0:
+ if abs(lengthCDS) < glen:
+ gstop = newFeatureList[0][0] - lengthCDS
+
+ # recompute the length after all adjustments; chrom is that of the last feature iterated
+ glen = abs(gstop - gstart)
+ if chrom not in locusByChromDict:
+ locusByChromDict[chrom] = []
+
+ # each locus is (start, stop, gid, length), with the sense appended when keepSense is set
+ if keepSense:
+ locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
+ else:
+ locusByChromDict[chrom].append((gstart, gstop, gid, glen))
+
+ for chrom in locusByChromDict:
+ locusByChromDict[chrom].sort()
+
+ return locusByChromDict
+
+
+def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
+ normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
+ binLength=-1):
+ """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
+ a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
+ or a different weight / tag.
+ """
+ index = 0
+ regionsBins = {}
+ regionsLen = {}
+
+ # positions of the fields inside a region tuple: (id, start, stop, length) by default,
+ # (start, stop, id, length) otherwise; an optional sense is read from position 4
+ if defaultRegionFormat:
+ regionIDField = 0
+ startField = 1
+ stopField = 2
+ lengthField = 3
+ else:
+ startField = 0
+ stopField = 1
+ regionIDField = 2
+ lengthField = 3
+
+ senseField = 4
+
+ print "entering computeRegionBins"
+ # one zero-filled list of bins per region id, taken from regionList when it is given
+ if len(regionList) > 0:
+ for readID in regionList:
+ regionsBins[readID] = [0.] * bins
+ else:
+ for chrom in regionsByChromDict:
+ for regionTuple in regionsByChromDict[chrom]:
+ regionID = regionTuple[regionIDField]
+ regionsBins[regionID] = [0.] * bins
+
+ for chrom in hitDict:
+ if chrom not in regionsByChromDict:
+ continue
+
+ for regionTuple in regionsByChromDict[chrom]:
+ regionID = regionTuple[regionIDField]
+ regionsLen[regionID] = regionTuple[lengthField]
+
+ print "%s\n" % chrom
+ # startRegion is the index of the first region still worth checking for the next read
+ # NOTE(review): this assumes reads and regions are both ordered by coordinate - confirm
+ startRegion = 0
+ for (tagStart, sense, weight) in hitDict[chrom]:
+ index += 1
+ if index % 100000 == 0:
+ print "read %d " % index,
+
+ stopPoint = tagStart + readlen
+ if startRegion < 0:
+ startRegion = 0
+
+ for regionTuple in regionsByChromDict[chrom][startRegion:]:
+ start = regionTuple[startField]
+ stop = regionTuple[stopField]
+ regionID = regionTuple[regionIDField]
+ rlen = regionTuple[lengthField]
+ # region tuples without a sense field are treated as "F"
+ try:
+ rsense = regionTuple[senseField]
+ except:
+ rsense = "F"
+
+ # region ends before this read: skip it for the following reads as well
+ if tagStart > stop:
+ startRegion += 1
+ continue
+
+ # region starts after the read ends: back the window off by 10 regions and
+ # move on to the next read
+ if start > stopPoint:
+ startRegion -= 10
+ break
+
+ if start <= tagStart <= stop:
+ if binLength < 1:
+ regionBinLength = rlen / bins
+ else:
+ regionBinLength = binLength
+
+ startdist = tagStart - start
+ if rsense == "F":
+ # we are relying on python's integer division quirk
+ binID = startdist / regionBinLength
+ # with fixedFirstBin, bin 0 holds the first fixedFirstBin positions and bin 1 the rest
+ if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
+ binID = 0
+ elif fixedFirstBin > 0:
+ binID = 1
+
+ if binID >= bins:
+ binID = bins - 1
+
+ # ids missing from regionsBins are reported instead of aborting the run
+ try:
+ regionsBins[regionID][binID] += normalizedTag * weight
+ except KeyError:
+ print "%s %s" % (regionID, str(binID))
+ else:
+ # any other sense: bins are counted from the stop end of the region
+ rdist = rlen - startdist
+ binID = rdist / regionBinLength
+ if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
+ binID = 0
+ elif fixedFirstBin > 0:
+ binID = 1
+
+ if binID >= bins:
+ binID = bins - 1
+
+ try:
+ regionsBins[regionID][binID] += normalizedTag * weight
+ except KeyError:
+ print "%s %s" % (regionID, str(binID))
+
+ # NOTE(review): nesting level of this assignment is not recoverable from the collapsed
+ # indentation; it moves stopPoint to the current region's stop - confirm placement
+ stopPoint = stop
+
+ return (regionsBins, regionsLen)
+
+
+# TODO: The readDataset class is going to be replaced by Erange.ReadDataset but this will
+# require going through all the code to make the changes needed. Major project for another
+# day, but it really needs to be done
+class readDataset:
+ """ Class for storing reads from experiments. Assumes that custom scripts
+ will translate incoming data into a format that can be inserted into the
+ class using the insert* methods. Default class subtype ('DNA') includes
+ tables for unique and multireads, whereas 'RNA' subtype also includes a
+ splices table.
+ """
+
+ def __init__(self, datafile, initialize=False, datasetType='', verbose=False,
+ cache=False, reportCount=True):
+ """ creates an rds datafile if initialize is set to true, otherwise
+ will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
+ """
+ # placeholders until the real connections and settings are established below
+ self.dbcon = ""
+ self.memcon = ""
+ self.dataType = ""
+ self.rdsVersion = "1.1"
+ self.memBacked = False
+ self.memChrom = ""
+ self.memCursor = ""
+ self.cachedDBFile = ""
+
+ # with cache set, work on a temporary copy of datafile instead of the file itself
+ if cache:
+ if verbose:
+ print "caching ...."
+
+ self.cacheDB(datafile)
+ dbfile = self.cachedDBFile
+ else:
+ dbfile = datafile
+
+ # rows are sqlite.Row objects, so columns can be read by name
+ self.dbcon = sqlite.connect(dbfile)
+ self.dbcon.row_factory = sqlite.Row
+ self.dbcon.execute("PRAGMA temp_store = MEMORY")
+ if initialize:
+ # a new dataset defaults to type DNA when no type is given
+ if datasetType == "":
+ self.dataType = "DNA"
+ else:
+ self.dataType = datasetType
+
+ self.initializeTables(self.dbcon)
+ else:
+ # an existing dataset must carry its type in the metadata table
+ metadata = self.getMetadata("dataType")
+ self.dataType = metadata["dataType"]
+
+ # read the stored rdsVersion; if there is none, try to record the current version
+ # (self.rdsVersion then keeps the default assigned above), and if that write fails
+ # as well, mark the dataset as pre-1.0
+ try:
+ metadata = self.getMetadata("rdsVersion")
+ self.rdsVersion = metadata["rdsVersion"]
+ except:
+ try:
+ self.insertMetadata([("rdsVersion", currentRDSversion)])
+ except:
+ print "could not add rdsVersion - read-only ?"
+ self.rdsVersion = "pre-1.0"
+
+ # optional report: metadata entries, read counts, cache size and index status
+ if verbose:
+ if initialize:
+ print "INITIALIZED dataset %s" % datafile
+ else:
+ print "dataset %s" % datafile
+
+ metadata = self.getMetadata()
+ print "metadata:"
+ pnameList = metadata.keys()
+ pnameList.sort()
+ for pname in pnameList:
+ print "\t" + pname + "\t" + metadata[pname]
+
+ if reportCount:
+ ucount = self.getUniqsCount()
+ mcount = self.getMultiCount()
+ # counts are printed as integers when possible, as returned otherwise
+ if self.dataType == "DNA" and not initialize:
+ try:
+ print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
+ except:
+ print "\n%s unique reads and %s multireads" % (ucount, mcount)
+ elif self.dataType == 'RNA' and not initialize:
+ scount = self.getSplicesCount()
+ try:
+ print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
+ except:
+ print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
+
+ print "default cache size is %d pages" % self.getDefaultCacheSize()
+ if self.hasIndex():
+ print "found index"
+ else:
+ print "not indexed"
+
+
+ def __len__(self):
+ """ return the number of usable reads in the dataset.
+ """
+ try:
+ total = self.getUniqsCount()
+ except:
+ total = 0
+
+ try:
+ total += self.getMultiCount()
+ except:
+ pass
+
+ if self.dataType == "RNA":
+ try:
+ total += self.getSplicesCount()
+ except:
+ pass
+
+ try:
+ total = int(total)
+ except:
+ total = 0
+
+ return total
+
+
+ def __del__(self):
+ """ cleanup copy in local cache, if present.
+ """
+ if self.cachedDBFile != "":
+ self.uncacheDB()
+
+
+ def cacheDB(self, filename):
+ """ copy geneinfoDB to a local cache.
+ """
+ self.cachedDBFile = tempfile.mktemp() + ".db"
+ shutil.copyfile(filename, self.cachedDBFile)
+
+
+ def saveCacheDB(self, filename):
+ """ copy geneinfoDB to a local cache.
+ """
+ shutil.copyfile(self.cachedDBFile, filename)
+
+
+ def uncacheDB(self):
+ """ delete geneinfoDB from local cache.
+ """
+ global cachedDBFile
+ if self.cachedDBFile != "":
+ try:
+ os.remove(self.cachedDBFile)
+ except:
+ print "could not delete %s" % self.cachedDBFile
+
+ self.cachedDB = ""
+
+
+ def attachDB(self, filename, asname):
+ """ attach another database file to the readDataset.
+ """
+ stmt = "attach '%s' as %s" % (filename, asname)
+ self.execute(stmt)
+
+
+ def detachDB(self, asname):
+ """ detach a database file to the readDataset.
+ """
+ stmt = "detach %s" % (asname)
+ self.execute(stmt)
+
+
+ def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+ """ import into current RDS the table (with columns destcolumns,
+ with default all columns) from the database file asname,
+ using the column specification of ascolumns (default all).
+ """
+ stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+ if flagged != "":
+ stmt += " where flag = '%s' " % flagged
+
+ self.execute(stmt, forceCommit=True)
+
+
+ def getTables(self, asname=""):
+ """ get a list of table names in a particular database file.
+ """
+ resultList = []
+
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ if asname != "":
+ asname += "."
+
+ stmt = "select name from %ssqlite_master where type='table'" % asname
+ sql.execute(stmt)
+ results = sql.fetchall()
+
+ for row in results:
+ resultList.append(row["name"])
+
+ return resultList
+
+
+ def hasIndex(self):
+ """ check whether the RDS file has at least one index.
+ """
+ stmt = "select count(*) from sqlite_master where type='index'"
+ count = int(self.execute(stmt, returnResults=True)[0][0])
+ if count > 0:
+ return True
+
+ return False
+
+
+ def initializeTables(self, acon, cache=100000):
+ """ creates table schema in database connection acon, which is
+ typically a database file or an in-memory database.
+ """
+ acon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+ acon.execute("create table metadata (name varchar, value varchar)")
+ acon.execute("insert into metadata values('dataType','%s')" % self.dataType)
+ acon.execute("create table uniqs (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
+ acon.execute("create table multi (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
+ if self.dataType == "RNA":
+ acon.execute("create table splices (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, startL int, stopL int, startR int, stopR int, sense varchar, weight real, flag varchar, mismatch varchar)")
+
+ acon.commit()
+
+
+ def getFileCursor(self):
+ """ returns a cursor to file database for low-level (SQL)
+ access to the data.
+ """
+ return self.dbcon.cursor()
+
+
+ def getMemCursor(self):
+ """ returns a cursor to memory database for low-level (SQL)
+ access to the data.
+ """
+ return self.memcon.cursor()
+
+
+ def getMetadata(self, valueName=""):
+ """ returns a dictionary of metadata.
+ """
+ whereClause = ""
+ resultsDict = {}
+
+ if valueName != "":
+ whereClause = " where name = '%s' " % valueName
+
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ sql.execute("select name, value from metadata" + whereClause)
+ results = sql.fetchall()
+
+ for row in results:
+ pname = row["name"]
+ pvalue = row["value"]
+ if pname not in resultsDict:
+ resultsDict[pname] = pvalue
+ else:
+ trying = True
+ index = 2
+ while trying:
+ newName = pname + ":" + str(index)
+ if newName not in resultsDict:
+ resultsDict[newName] = pvalue
+ trying = False
+
+ index += 1
+
+ return resultsDict
+
+
+ def getReadSize(self):
+ """ returns readsize if defined in metadata.
+ """
+ metadata = self.getMetadata()
+ if "readsize" not in metadata:
+ print "no readsize parameter defined - returning 0"
+ return 0
+ else:
+ mysize = metadata["readsize"]
+ if "import" in mysize:
+ mysize = mysize.split()[0]
+
+ return int(mysize)
+
+
+ def getDefaultCacheSize(self):
+ """ returns the default cache size.
+ """
+ return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
+
+
+ def getChromosomes(self, table="uniqs", fullChrom=True):
+ """ returns a list of distinct chromosomes in table.
+ """
+ statement = "select distinct chrom from %s" % table
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ sql.execute(statement)
+ results = []
+ for row in sql:
+ if fullChrom:
+ if row["chrom"] not in results:
+ results.append(row["chrom"])
+ else:
+ if len(row["chrom"][3:].strip()) < 1:
+ continue
+
+ if row["chrom"][3:] not in results:
+ results.append(row["chrom"][3:])
+
+ results.sort()
+
+ return results
+
+
+ def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+ doMulti=False, doSplices=False):
+ """ returns the maximum coordinate for reads on a given chromosome.
+ """
+ maxCoord = 0
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ if doUniqs:
+ try:
+ sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
+ maxCoord = int(sql.fetchall()[0][0])
+ except:
+ print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+ if doSplices:
+ sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
+ try:
+ spliceMax = int(sql.fetchall()[0][0])
+ if spliceMax > maxCoord:
+ maxCoord = spliceMax
+ except:
+ pass
+
+ if doMulti:
+ sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
+ try:
+ multiMax = int(sql.fetchall()[0][0])
+ if multiMax > maxCoord:
+ maxCoord = multiMax
+ except:
+ pass
+
+ if verbose:
+ print "%s maxCoord: %d" % (chrom, maxCoord)
+
+ return maxCoord
+
+
+ def getReadsDict(self, verbose=False, bothEnds=False, noSense=False, fullChrom=False, chrom="",
+ flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
+ withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
+ readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
+ flagLike=False, strand="", entryDict=False, combine5p=False):
+ """ returns a dictionary of reads in a variety of formats
+ and which can be restricted by chromosome or custom-flag.
+ Returns unique reads by default, but can return multireads
+ with doMulti set to True.
+ """
+ whereClause = []
+ resultsDict = {}
+
+ if chrom != "" and chrom != self.memChrom:
+ whereClause.append("chrom = '%s'" % chrom)
+
+ if flag != "":
+ if flagLike:
+ flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+ whereClause.append(flagLikeClause)
+ else:
+ whereClause.append("flag = '%s'" % flag)
+
+ if start > -1:
+ whereClause.append("start > %d" % start)
+
+ if stop > -1:
+ whereClause.append("stop < %d" % stop)
+
+ if len(readLike) > 0:
+ readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
+ whereClause.append(readIDClause)
+
+ if hasMismatch:
+ whereClause.append("mismatch != ''")
+
+ if strand in ["+", "-"]:
+ whereClause.append("sense = '%s'" % strand)
+
+ if len(whereClause) > 0:
+ whereStatement = string.join(whereClause, " and ")
+ whereQuery = "where %s" % whereStatement
+ else:
+ whereQuery = ""
+
+ groupBy = []
+ if findallOptimize:
+ selectClause = ["select start, sense, sum(weight)"]
+ groupBy = ["GROUP BY start, sense"]
+ else:
+ selectClause = ["select ID, chrom, start, readID"]
+ if bothEnds:
+ selectClause.append("stop")
+
+ if not noSense:
+ selectClause.append("sense")
+
+ if withWeight:
+ selectClause.append("weight")
+
+ if withFlag:
+ selectClause.append("flag")
+
+ if withMismatch:
+ selectClause.append("mismatch")
+
+ if limit > 0 and not combine5p:
+ groupBy.append("LIMIT %d" % limit)
+
+ selectQuery = string.join(selectClause, ",")
+ groupQuery = string.join(groupBy)
+ if doUniqs:
+ stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
+ if doMulti:
+ stmt.append("UNION ALL")
+ stmt.append(selectQuery)
+ stmt.append("from multi")
+ stmt.append(whereQuery)
+ stmt.append(groupQuery)
+ else:
+ stmt = [selectQuery, "from multi", whereQuery]
+
+ if combine5p:
+ if findallOptimize:
+ selectQuery = "select start, sense, weight, chrom"
+
+ if doUniqs:
+ subSelect = [selectQuery, "from uniqs", whereQuery]
+ if doMulti:
+ subSelect.append("union all")
+ subSelect.append(selectQuery)
+ subSelect.append("from multi")
+ subSelect.append(whereQuery)
+ else:
+ subSelect = [selectQuery, "from multi", whereQuery]
+
+ sqlStmt = string.join(subSelect)
+ if findallOptimize:
+ selectQuery = "select start, sense, sum(weight)"
+
+ stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
+ selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
+
+ if findallOptimize:
+ if self.memBacked:
+ self.memcon.row_factory = None
+ sql = self.memcon.cursor()
+ else:
+ self.dbcon.row_factory = None
+ sql = self.dbcon.cursor()
+
+ stmt.append("order by start")
+ elif readIDDict:
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ stmt.append("order by readID, start")
+ else:
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ stmt.append("order by chrom, start")
+
+ sqlQuery = string.join(stmt)
+ sql.execute(sqlQuery)
+
+ if findallOptimize:
+ resultsDict[chrom] = [[int(row[0]), row[1], float(row[2])] for row in sql]
+ if self.memBacked:
+ self.memcon.row_factory = sqlite.Row
+ else:
+ self.dbcon.row_factory = sqlite.Row
+ else:
+ currentChrom = ""
+ currentReadID = ""
+ pairID = 0
+ for row in sql:
+ readID = row["readID"]
+ if fullChrom:
+ chrom = row["chrom"]
+ else:
+ chrom = row["chrom"][3:]
+
+ if not readIDDict and chrom != currentChrom:
+ resultsDict[chrom] = []
+ currentChrom = chrom
+ dictKey = chrom
+ elif readIDDict:
+ theReadID = readID
+ if "::" in readID:
+ (theReadID, multiplicity) = readID.split("::")
+
+ if "/" in theReadID and withPairID:
+ (theReadID, pairID) = readID.split("/")
+
+ if theReadID != currentReadID:
+ resultsDict[theReadID] = []
+ currentReadID = theReadID
+ dictKey = theReadID
+
+ if entryDict:
+ newrow = {"start": int(row["start"])}
+ if bothEnds:
+ newrow["stop"] = int(row["stop"])
+
+ if not noSense:
+ newrow["sense"] = row["sense"]
+
+ if withWeight:
+ newrow["weight"] = float(row["weight"])
+
+ if withFlag:
+ newrow["flag"] = row["flag"]
+
+ if withMismatch:
+ newrow["mismatch"] = row["mismatch"]
+
+ if withID:
+ newrow["readID"] = readID
+
+ if withChrom:
+ newrow["chrom"] = chrom
+
+ if withPairID:
+ newrow["pairID"] = pairID
+ else:
+ newrow = [int(row["start"])]
+ if bothEnds:
+ newrow.append(int(row["stop"]))
+
+ if not noSense:
+ newrow.append(row["sense"])
+
+ if withWeight:
+ newrow.append(float(row["weight"]))
+
+ if withFlag:
+ newrow.append(row["flag"])
+
+ if withMismatch:
+ newrow.append(row["mismatch"])
+
+ if withID:
+ newrow.append(readID)
+
+ if withChrom:
+ newrow.append(chrom)
+
+ if withPairID:
+ newrow.append(pairID)
+
+ resultsDict[dictKey].append(newrow)
+
+ return resultsDict
+
+
+ def getSplicesDict(self, verbose=False, noSense=False, fullChrom=False, chrom="",
+ flag="", withWeight=False, withFlag=False, withMismatch=False,
+ withID=False, withChrom=False, withPairID=False, readIDDict=False,
+ splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+ stop=-1, strand="", entryDict=False):
+ """ returns a dictionary of spliced reads in a variety of
+ formats and which can be restricted by chromosome or custom-flag.
+ Returns unique spliced reads for now.
+ """
+ whereClause = []
+ resultsDict = {}
+
+ if chrom != "" and chrom != self.memChrom:
+ whereClause = ["chrom = '%s'" % chrom]
+
+ if flag != "":
+ if flagLike:
+ flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+ whereClause.append(flagLikeClause)
+ else:
+ whereClause.append("flag = '%s'" % flag)
+
+ if hasMismatch:
+ whereClause.append("mismatch != ''")
+
+ if strand != "":
+ whereClause.append("sense = '%s'" % strand)
+
+ if start > -1:
+ whereClause.append("startL > %d" % start)
+
+ if stop > -1:
+ whereClause.append("stopR < %d" % stop)
+
+ if len(whereClause) > 0:
+ whereStatement = string.join(whereClause, " and ")
+ whereQuery = "where %s" % whereStatement
+ else:
+ whereQuery = ""
+
+ selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+ if not noSense:
+ selectClause.append("sense")
+
+ if withWeight:
+ selectClause.append("weight")
+
+ if withFlag:
+ selectClause.append("flag")
+
+ if withMismatch:
+ selectClause.append("mismatch")
+
+ selectQuery = string.join(selectClause, " ,")
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ if chrom == "" and not readIDDict:
+ stmt = "select distinct chrom from splices %s" % whereQuery
+ sql.execute(stmt)
+ for row in sql:
+ if fullChrom:
+ chrom = row["chrom"]
+ else:
+ chrom = row["chrom"][3:]
+
+ resultsDict[chrom] = []
+ elif chrom != "" and not readIDDict:
+ resultsDict[chrom] = []
+
+ stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
+ sql.execute(stmt)
+ currentReadID = ""
+ for row in sql:
+ pairID = 0
+ readID = row["readID"]
+ if fullChrom:
+ chrom = row["chrom"]
+ else:
+ chrom = row["chrom"][3:]
+
+ if readIDDict:
+ if "/" in readID:
+ (theReadID, pairID) = readID.split("/")
+ else:
+ theReadID = readID
+
+ if theReadID != currentReadID:
+ resultsDict[theReadID] = []
+ currentReadID = theReadID
+ dictKey = theReadID
+ else:
+ dictKey = chrom
+
+ if entryDict:
+ newrow = {"startL": int(row["startL"])}
+ newrow["stopL"] = int(row["stopL"])
+ newrow["startR"] = int(row["startR"])
+ newrow["stopR"] = int(row["stopR"])
+ if not noSense:
+ newrow["sense"] = row["sense"]
+
+ if withWeight:
+ newrow["weight"] = float(row["weight"])
+
+ if withFlag:
+ newrow["flag"] = row["flag"]
+
+ if withMismatch:
+ newrow["mismatch"] = row["mismatch"]
+
+ if withID:
+ newrow["readID"] = readID
+
+ if withChrom:
+ newrow["chrom"] = chrom
+
+ if withPairID:
+ newrow["pairID"] = pairID
+
+ if splitRead:
+ leftDict = newrow
+ del leftDict["startR"]
+ del leftDict["stopR"]
+ rightDict = newrow
+ del rightDict["start"]
+ del rightDict["stopL"]
+ resultsDict[dictKey].append(leftDict)
+ resultsDict[dictKey].append(rightDict)
+ else:
+ resultsDict[dictKey].append(newrow)
+ else:
+ newrow = [int(row["startL"])]
+ newrow.append(int(row["stopL"]))
+ newrow.append(int(row["startR"]))
+ newrow.append(int(row["stopR"]))
+ if not noSense:
+ newrow.append(row["sense"])
+
+ if withWeight:
+ newrow.append(float(row["weight"]))
+
+ if withFlag:
+ newrow.append(row["flag"])
+
+ if withMismatch:
+ newrow.append(row["mismatch"])
+
+ if withID:
+ newrow.append(readID)
+
+ if withChrom:
+ newrow.append(chrom)
+
+ if withPairID:
+ newrow.append(pairID)
+
+ if splitRead:
+ resultsDict[dictKey].append(newrow[:2] + newrow[4:])
+ resultsDict[dictKey].append(newrow[2:])
+ else:
+ resultsDict[dictKey].append(newrow)
+
+ return resultsDict
+
+
+ def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
+ splices=False, reportCombined=True, sense="both"):
+ """ return read counts for a given region.
+ """
+ ucount = 0
+ mcount = 0
+ scount = 0
+ restrict = ""
+ if sense in ["+", "-"]:
+ restrict = " sense ='%s' " % sense
+
+ if uniqs:
+ try:
+ ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
+ except:
+ ucount = 0
+
+ if multi:
+ try:
+ mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
+ except:
+ mcount = 0
+
+ if splices:
+ try:
+ scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
+ except:
+ scount = 0
+
+ if reportCombined:
+ total = ucount + mcount + scount
+ return total
+ else:
+ return (ucount, mcount, scount)
+
+
+ def getTotalCounts(self, chrom="", rmin="", rmax=""):
+ return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
+
+
+ def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
+ """ returns the number of row in the uniqs table.
+ """
+ whereClause = []
+ count = 0
+
+ if chrom !="" and chrom != self.memChrom:
+ whereClause = ["chrom='%s'" % chrom]
+
+ if rmin != "":
+ whereClause.append("%s >= %s" % (startField, str(rmin)))
+
+ if rmax != "":
+ whereClause.append("%s <= %s" % (startField, str(rmax)))
+
+ if restrict != "":
+ whereClause.append(restrict)
+
+ if len(whereClause) > 0:
+ whereStatement = string.join(whereClause, " and ")
+ whereQuery = "where %s" % whereStatement
+ else:
+ whereQuery = ""
+
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ if distinct:
+ sql.execute("select count(distinct chrom+start+sense) from %s %s" % (table, whereQuery))
+ else:
+ sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
+
+ result = sql.fetchone()
+
+ try:
+ count = int(result[0])
+ except:
+ count = 0
+
+ return count
+
+
+ def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ """ returns the number of row in the splices table.
+ """
+ return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
+
+
+ def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ """ returns the number of distinct readIDs in the uniqs table.
+ """
+ return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
+
+
+ def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ """ returns the total weight of readIDs in the multi table.
+ """
+ return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
+
+
+ def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
+ """ get readID's.
+ """
+ stmt = []
+ limitPart = ""
+ if limit > 0:
+ limitPart = "LIMIT %d" % limit
+
+ if uniqs:
+ stmt.append("select readID from uniqs")
+
+ if multi:
+ stmt.append("select readID from multi")
+
+ if splices:
+ stmt.append("select readID from splices")
+
+ if len(stmt) > 0:
+ selectPart = string.join(stmt, " union ")
+ else:
+ selectPart = ""
+
+ sqlQuery = "%s group by readID %s" (selectPart, limitPart)
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ sql.execute(sqlQuery)
+ result = sql.fetchall()
+
+ if paired:
+ return [x.split("/")[0][0] for x in result]
+ else:
+ return [x[0] for x in result]
+
+
+ def getMismatches(self, mischrom = None, verbose=False, useSplices=True):
+ """ returns the uniq and spliced mismatches in a dictionary.
+ The dictionary is keyed by chromosome and every entry is a list of
+ [read start, mismatch position, change_base, change_from] lists.
+ Only mischrom is processed when it is given, otherwise every
+ chromosome reported by getChromosomes(). Spliced reads are only
+ included when useSplices is set and the dataset type is "RNA".
+ """
+ # base complements, applied to mismatches of reads on the "-" strand
+ revcomp = {"A": "T",
+ "T": "A",
+ "G": "C",
+ "C": "G",
+ "N": "N"
+ }
+
+ readlen = self.getReadSize()
+ if mischrom:
+ hitChromList = [mischrom]
+ else:
+ hitChromList = self.getChromosomes()
+ hitChromList.sort()
+
+ snpDict = {}
+ for achrom in hitChromList:
+ if verbose:
+ print "getting mismatches from chromosome %s" % (achrom)
+
+ snpDict[achrom] = []
+ hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, findallOptimize=False, hasMismatch=True)
+ if useSplices and self.dataType == "RNA":
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
+ spliceIDList = spliceDict.keys()
+ for k in spliceIDList:
+ # only the first entry stored for a read ID is examined
+ (startpos, lefthalf, rightstart, endspos, sense, mismatches) = spliceDict[k][0]
+ spMismatchList = mismatches.split(",")
+ for mismatch in spMismatchList:
+ # mismatches that involve an N are skipped
+ if "N" in mismatch:
+ continue
+
+ # a mismatch is read as <first char><integer position><last char>
+ change_len = len(mismatch)
+ if sense == "+":
+ change_from = mismatch[0]
+ change_base = mismatch[change_len-1]
+ change_pos = int(mismatch[1:change_len-1])
+ elif sense == "-":
+ # complement both bases and count the position from the other read end
+ change_from = revcomp[mismatch[0]]
+ change_base = revcomp[mismatch[change_len-1]]
+ change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+
+ # firsthalf is the number of read positions covered by the left block;
+ # positions beyond it are placed relative to the start of the right block
+ firsthalf = int(lefthalf)-int(startpos)+1
+ secondhalf = 0
+ if int(change_pos) <= int(firsthalf):
+ change_at = startpos + change_pos - 1
+ else:
+ secondhalf = change_pos - firsthalf
+ change_at = rightstart + secondhalf
+
+ snpDict[achrom].append([startpos, change_at, change_base, change_from])
+
+ # no unspliced reads with mismatches were returned for this chromosome
+ if achrom not in hitDict:
+ continue
+
+ for (start, sense, mismatches) in hitDict[achrom]:
+ mismatchList = mismatches.split(",")
+ for mismatch in mismatchList:
+ if "N" in mismatch:
+ continue
+
+ # same <first char><integer position><last char> reading as for splices
+ change_len = len(mismatch)
+ if sense == "+":
+ change_from = mismatch[0]
+ change_base = mismatch[change_len-1]
+ change_pos = int(mismatch[1:change_len-1])
+ elif sense == "-":
+ change_from = revcomp[mismatch[0]]
+ change_base = revcomp[mismatch[change_len-1]]
+ change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+
+ # change_pos is 1-based within the read, hence the - 1
+ change_at = start + change_pos - 1
+ snpDict[achrom].append([start, change_at, change_base, change_from])
+
+ return snpDict
+
+
+ def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
+ useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+ keepStrand="both", shiftValue=0):
+ """return a profile of the chromosome as an array of per-base read coverage....
+ keepStrand = 'both', 'plusOnly', or 'minusOnly'.
+ Will also shift position of unique and multireads (but not splices) if shift is a natural number
+ """
+ metadata = self.getMetadata()
+ readlen = int(metadata["readsize"])
+ dataType = metadata["dataType"]
+ scale = 1. / normalizationFactor
+ shift = {}
+ shift["+"] = int(shiftValue)
+ shift["-"] = -1 * int(shiftValue)
+
+ if cstop > 0:
+ lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
+ else:
+ lastNT = cstop - cstart + readlen + shift["+"]
+
+ chromModel = array("f", [0.] * lastNT)
+ hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
+ if cstart < 0:
+ cstart = 0
+
+ for (hstart, sense, weight) in hitDict[chromosome]:
+ hstart = hstart - cstart + shift[sense]
+ for currentpos in range(hstart,hstart+readlen):
+ try:
+ if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
+ chromModel[currentpos] += scale * weight
+ elif sense == '-' and keepStrand != "plusOnly":
+ chromModel[currentpos] -= scale * weight
+ except:
+ continue
+
+ del hitDict
+ if useSplices and dataType == "RNA":
+ if cstop > 0:
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
+ else:
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
+
+ if chromosome in spliceDict:
+ for (Lstart, Lstop, Rstart, Rstop, rsense, readName) in spliceDict[chromosome]:
+ if (Rstop - cstart) < lastNT:
+ for index in range(abs(Lstop - Lstart)):
+ currentpos = Lstart - cstart + index
+ # we only track unique splices
+ if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+ chromModel[currentpos] += scale
+ elif rsense == "-" and keepStrand != "plusOnly":
+ chromModel[currentpos] -= scale
+
+ for index in range(abs(Rstop - Rstart)):
+ currentpos = Rstart - cstart + index
+ # we only track unique splices
+ if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+ chromModel[currentpos] += scale
+ elif rsense == "-" and keepStrand != "plusOnly":
+ chromModel[currentpos] -= scale
+
+ del spliceDict
+
+ return chromModel
+
+
+ def insertMetadata(self, valuesList):
+ """ inserts a list of (pname, pvalue) into the metadata
+ table.
+ """
+ self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def updateMetadata(self, pname, newValue, originalValue=""):
+ """ update a metadata field given the original value and the new value.
+ """
+ stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
+ if originalValue != "":
+ stmt += " and value='%s' " % str(originalValue)
+
+ self.dbcon.execute(stmt)
+ self.dbcon.commit()
+
+
+ def insertUniqs(self, valuesList):
+ """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+ into the uniqs table.
+ """
+ self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def insertMulti(self, valuesList):
+ """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+ into the multi table.
+ """
+ self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def insertSplices(self, valuesList):
+ """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
+ into the splices table.
+ """
+ self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
+ def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
+ """ update reads on file database in a list region of regions for a chromosome to have a new flag.
+ regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
+ sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
+ """
+ restrict = ""
+ if sense != "both":
+ restrict = " and sense = ? "
+
+ if uniqs:
+ self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+ if multi:
+ self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+ if self.dataType == "RNA" and splices:
+ self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
+ self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
+
+ self.dbcon.commit()
+
+
+ def setFlags(self, flag, uniqs=True, multi=True, splices=True):
+ """ set the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
+ """
+ if uniqs:
+ self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
+
+ if multi:
+ self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
+
+ if self.dataType == 'RNA' and splices:
+ self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
+
+ self.dbcon.commit()
+
+
+ def resetFlags(self, uniqs=True, multi=True, splices=True):
+ """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
+ """
+ if uniqs:
+ self.dbcon.execute("UPDATE uniqs SET flag = ''")
+
+ if multi:
+ self.dbcon.execute("UPDATE multi SET flag = ''")
+
+ if self.dataType == "RNA" and splices:
+ self.dbcon.execute("UPDATE splices SET flag = ''")
+
+ self.dbcon.commit()
+
+
+ def reweighMultireads(self, readList):
+ self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
+
+
+ def setSynchronousPragma(self, value="ON"):
+ try:
+ self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
+ except:
+ print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
+
+
+ def setDBcache(self, cache, default=False):
+ self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
+ if default:
+ self.dbcon.execute('PRAGMA DEFAULT_CACHE_SIZE = %d' % cache)
+
+
+ def execute(self, statement, returnResults=False, forceCommit=False):
+ if self.memBacked:
+ sql = self.memcon.cursor()
+ else:
+ sql = self.dbcon.cursor()
+
+ sql.execute(statement)
+ if returnResults:
+ result = sql.fetchall()
+ return result
+
+ if forceCommit:
+ if self.memBacked:
+ self.memcon.commit()
+ else:
+ self.dbcon.commit()
+
+
+ def buildIndex(self, cache=100000):
+ """ Builds the file indices for the main tables.
+ Cache is the number of 1.5 kb pages to keep in memory.
+ 100000 pages translates into 150MB of RAM, which is our default.
+ Position and chromosome indices are created on uniqs and multi,
+ and also on splices when the dataset type is "RNA". The name of
+ every index is printed once it has been created.
+ """
+ # only ever raise the cache size, never lower it
+ if cache > self.getDefaultCacheSize():
+ self.setDBcache(cache)
+ # synchronous writes are switched off here and back on at the end
+ self.setSynchronousPragma("OFF")
+ self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+ print "built uPosIndex"
+ self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
+ print "built uChromIndex"
+ self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+ print "built mPosIndex"
+ self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
+ print "built mChromIndex"
+
+ if self.dataType == "RNA":
+ # splices are indexed on both their left and their right start
+ self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
+ print "built sPosIndex"
+ self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
+ print "built sPosIndex2"
+ self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
+ print "built sChromIndex"
+
+ self.dbcon.commit()
+ self.setSynchronousPragma("ON")
+
+
+ def dropIndex(self):
+ """ drops the file indices for the main tables.
+ """
+ try:
+ self.setSynchronousPragma("OFF")
+ self.dbcon.execute("DROP INDEX uPosIndex")
+ self.dbcon.execute("DROP INDEX uChromIndex")
+ self.dbcon.execute("DROP INDEX mPosIndex")
+ self.dbcon.execute("DROP INDEX mChromIndex")
+
+ if self.dataType == "RNA":
+ self.dbcon.execute("DROP INDEX sPosIndex")
+ try:
+ self.dbcon.execute("DROP INDEX sPosIndex2")
+ except:
+ pass
+
+ self.dbcon.execute("DROP INDEX sChromIndex")
+
+ self.dbcon.commit()
+ except:
+ print "problem dropping index"
+
+ self.setSynchronousPragma("ON")
+
+
+ def memSync(self, chrom="", index=False):
+ """ makes a copy of the dataset into memory for faster access.
+ Can be restricted to a "full" chromosome. Can also build the
+ memory indices.
+ """
+ self.memcon = ""
+ self.memcon = sqlite.connect(":memory:")
+ self.initializeTables(self.memcon)
+ cursor = self.dbcon.cursor()
+ whereclause = ""
+ if chrom != "":
+ print "memSync %s" % chrom
+ whereclause = " where chrom = '%s' " % chrom
+ self.memChrom = chrom
+ else:
+ self.memChrom = ""
+
+ self.memcon.execute("PRAGMA temp_store = MEMORY")
+ self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
+ # copy metadata to memory
+ self.memcon.execute("delete from metadata")
+ results = cursor.execute("select name, value from metadata")
+ results2 = []
+ for row in results:
+ results2.append((row["name"], row["value"]))
+
+ self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
+ # copy uniqs to memory
+ results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from uniqs" + whereclause)
+ results2 = []
+ for row in results:
+ results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+ self.memcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
+ # copy multi to memory
+ results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from multi" + whereclause)
+ results2 = []
+ for row in results:
+ results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+ self.memcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
+ # copy splices to memory
+ if self.dataType == "RNA":
+ results = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices" + whereclause)
+ results2 = []
+ for row in results:
+ results2.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+ self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, weight, sense, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", results2)
+ if index:
+ if chrom != "":
+ self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
+ self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
+ if self.dataType == "RNA":
+ self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
+ self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
+ else:
+ self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+ self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+ if self.dataType == "RNA":
+ self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
+ self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
+
+ self.memBacked = True
+ self.memcon.row_factory = sqlite.Row
+ self.memcon.commit()
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+from cistematic.core.orthomatcher import orthoMatcher
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ print "version 1.1"
+ if len(argv) < 7:
+ print "usage: python %s prefix directory genome1 genefile1 genome2 genefile2 [genome3 genefile3 .....]" % argv[0]
+ sys.exit(1)
+
+ prefix = argv[1]
+ directory = argv[2]
+ matchFiles = {}
+
+ genomesToMatch = (len(argv) - 3) / 2
+ for index in range(genomesToMatch):
+ genome = argv[3 + index * 2]
+ print genome
+ if genome not in matchFiles:
+ matchFiles[genome] = []
+
+ matchFiles[genome].append(argv[4 + index * 2])
+
+ print matchFiles
+ orthoMatcher(matchFiles, prefix, directory, fileList=True)
+
+
+# run the command line entry point only when this file is executed as a script
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# distalPairs.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 10/14/08.
+#
+
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+from commoncode import readDataset
+import sys, time, optparse
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ print "%prog: version 3.3"
+ print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
+ usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
+ parser.add_option("--splices", action="store_true", dest="doSplices")
+ parser.add_option("--verbose", action="store_true", dest="doVerbose")
+ parser.add_option("--maxDist", type="int", dest="maxDist")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.set_defaults(sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ minDist = int(args[0])
+ rdsfile = args[1]
+ outfilename = args[2]
+
+ distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
+
+
+def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
+ if cachePages is not None:
+ doCache = True
+ else:
+ doCache = False
+ cachePages = -1
+
+ RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+ if not RDS.hasIndex():
+ print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
+ sys.exit(1)
+
+ if cachePages > RDS.getDefaultCacheSize():
+ RDS.setDBcache(cachePages)
+
+ print time.ctime()
+
+ if doSplices:
+ print "getting splices"
+ splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+ print "got splices"
+
+ print "getting uniq reads"
+ uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
+ print "got uniqs"
+
+ if doSplices:
+ for readID in splicesDict:
+ theRead = splicesDict[readID]
+ read0 = theRead[0]
+ del read0[1]
+ try:
+ uniqDict[readID].append(read0)
+ except:
+ if len(theRead) == 4:
+ read2 = theRead[2]
+ del read2[1]
+ uniqDict[readID] = [read0,read2]
+
+ if doVerbose:
+ print len(uniqDict), time.ctime()
+
+ outfile = open(outfilename,"w")
+
+ diffChrom = 0
+ distal = 0
+ total = 0
+ for readID in uniqDict:
+ readList = uniqDict[readID]
+ if len(readList) == 2:
+ total += 1
+ (start1, sense1, chrom1, pair1) = readList[0]
+ (start2, sense2, chrom2, pair2) = readList[1]
+
+ if chrom1 != chrom2:
+ diffChrom += 1
+ if sameChromOnly:
+ continue
+ else:
+ outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
+ outfile.write(outline + "\n")
+ if doVerbose:
+ print diffChrom, outline
+ else:
+ dist = abs(start1 - start2)
+
+ if minDist < dist < maxDist:
+ distal += 1
+ outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
+ outfile.write(outline + "\n")
+ if doVerbose:
+ print distal, outline
+
+ outfile.write("#distal: %d\tdiffChrom: %d\tpossible: %d\n" % (distal, diffChrom, total))
+ total = float(total)
+ if total < 1:
+ total = 1.
+
+ outfile.write("#distal %2.2f pct\tdiffChrom %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total)))
+ outfile.close()
+ print "distal: %d\tdiffChrom: %d\tpossible: %d" % (distal, diffChrom, int(total))
+ print "distal: %2.2f pct\tdiffChrom: %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total))
+ print time.ctime()
+
+
+# run the command line entry point only when this file is executed as a script
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2007-09 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
--- /dev/null
+This is a description of the sqlite-based read storage
+files and of the scripts designed to import read
+mappings from supported short read mappers. The code
+should run on any Unix-like system supporting python 2.5
+or better. The code is developed on Linux and MacOS X on
+python 2.5.
+
+This code is made available as open-source, as described
+in the copyright file ERANGE.COPYRIGHT.
+
+1. REQUIREMENTS
+2. COMMAND LINE OPTIONS
+3. CREATING THE NECESSARY INPUT (RDS) FILES
+4. BUILDING EXPANDED GENOMES
+5. MAPPING READS WITH ELAND
+6. MAPPING READS WITH BOWTIE
+7. MAPPING READS WITH BLAT
+8. IMPORTING BED FILES
+9. COMBINING RDS FILES
+10. MANIPULATING RDS METADATA AND CACHING
+11. VISUALIZING THE DATA IN RDS FILES
+
+
+1. REQUIREMENTS
+
+See README.chip-seq or README.rna-seq to see the requirements
+for installing and running ERANGE specific to each
+application.
+
+
+2. COMMAND LINE OPTIONS
+
+You can find out more about the settings for each script
+by typing:
+
+python $ERANGEPATH/<scriptname>
+
+to see the command line options, where ERANGEPATH is the
+environmental variable set to the path to the directory
+holding the ERANGE scripts. Note that the command line
+options are case sensitive and that they could well
+fail silently.
+
+
+3. CREATING THE NECESSARY INPUT (RDS) FILES
+
+Before you can use the rest of the ERANGE scripts to do
+ChIP-seq or RNA-seq analyses, you will need to first
+convert your read mappings to the native ERANGE read
+storage format, which is sqlite-based, and which is
+called RDS (Read DataSet). RDS files consist of four
+tables:
+- metadata (tracks required and optional metadata)
+- uniqs (stores uniquely mappable-reads)
+- multi (stores reads that map equally well to multiple
+locations in the genome)
+- splices (stores split reads)
+
+a readDataset python object (in commoncode.py) provides
+the encapsulation of the read database which is accessed
+through specific methods. Since an RDS file is a sqlite3
+database, you can additionally use any of the sqlite-based
+tools to look at the reads in the tables, if you wish to
+do so.
+
+You will need to first map your reads with one of the
+supported read mappers (see next paragraph) against a copy
+of the appropriate genome. For ChIP-seq, it will be your
+genome of interest, whereas for RNA-seq reads should be
+mapped against an expanded genome, which consists of
+chromosomes + splice junctions which depend on the read
+length used. Note that several parts of the code assume
+that your genomic sequences are labelled with the "chr"
+chromosomes prefix. For more information on creating
+expanded genomes, see BUILDING EXPANDED GENOMES.
+
+The currently supported read mappers are:
+- Eland (part of the Illumina GA pipeline)
+- Bowtie (bowtie-bio.sourceforge.net)
+- Blat (from UCSC)
+
+These are described in the sections on MAPPING READS WITH
+ELAND, MAPPING READS WITH BOWTIE, MAPPING READS WITH BLAT.
+
+For ChIP-seq, you can also import bed files of unique reads
+only using makerdsfrombed.py .
+
+Also see MANIPULATING RDS METADATA AND CACHING to learn about
+some important aspects of working with RDS files.
+
+
+4. BUILDING EXPANDED GENOMES
+
+For RNA-seq using ELAND or BOWTIE mappings, you will need to build
+an expanded genome consisting of genomic sequences, spike sequences,
+and splice-spanning sequences in order to run ERANGE on your own
+datasets. This expanded genome is specific to the read size used,
+i.e. there will be a different expanded genome for mouse when using
+25bp reads or 32bp reads. For reads longer than 32 bp, we recommend
+using BOWTIE. If your reads are longer than 50bp, consider using
+BLAT instead.
+
+Download the chromosomes from UCSC, as well as the knownGene.txt (or
+equivalent table) and a directory of repeatmask annotations for each
+chromosome (also from UCSC) for your genome of interest.
+
+You will need to build a splice fasta file using the script
+getsplicefa.py, which needs Cistematic, the knownGene table, and a
+parameter for splice radius, which is 4 bp shorter than the length
+of the reads.
+
+Once you have the splice fasta file, drop it into the same directory
+as well as a fasta file for your spikes. Then use squashGenome
+(part of Eland) or bowtie-build (part of Bowtie), to build the
+expanded genome. Please refer to the documentation for each
+package to run the genome squasher/builder.
+
+You will also build a repeat database using buildrmaskdb.py for use
+in the candidate exon analysis from UCSC repeatmasker annotations.
+
+
+5. MAPPING READS WITH ELAND
+
+Please refer to the Illumina documentation for the details on
+running squashGenome and Eland. If you do not have access to the
+Illumina pipeline, use bowtie as described in the next section.
+
+For ChIP-seq, you could take the output of the Illumina pipeline,
+e.g. eland_multi.txt or eland_extended.txt and use them as inputs
+for makerdsfromeland2.py .
+
+Once you have run Eland with the --multi option (which we
+colloquially call "eland2") for each RNA-seq lane against the
+expanded genome, combine all of the outputs for one sample into a
+single file e.g. test.comb.eland2
+
+The makerdsfromeland2.py script is used to import the reads
+into RDS:
+
+python makerdsfromeland2.py label infilename outrdsfile [-append] [-RNA ucscGeneModels]
+[propertyName::propertyValue] [-index] [-paired 1 or 2] [-extended] [-verbose]
+[-olddelimiter] [-maxlines num] [-cache numPages]
+
+The first 3 arguments are required:
+- label is any label that you wish (a combination flowcell+lane#
+is a good choice)
+- infilename is the output of eland in eland_multi format
+(default) or eland_extended format (with the -extended flag)
+- outrdsfile is the name of the rds file, e.g. test.rds
+
+If the reads are from paired-end runs, enter each eland_multi
+(or extended) file separately with the "-paired 1" or "-paired 2"
+flag, as appropriate.
+
+If entering more than one lane, use -append for all subsequent
+lanes. Upon entering the last lane, use -index to build a read
+index. Refer to MANIPULATING RDS METADATA AND CACHING for
+information on the optional property::value pairs and caching.
+
+For RNA-seq, you must in addition specify the path to knownGene.txt
+using the -RNA flag, e.g.
+
+python $ERANGEPATH/makerdsfromeland2.py myRNAlabel myRNA.eland_multi.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+
+
+6. MAPPING READS WITH BOWTIE
+
+Bowtie (bowtie-bio.sourceforge.net) is a new read-mapper that
+is very fast and friendly. ERANGE supports version 0.10.X
+and higher that allow you to control how many multireads
+are reported. We recommend the following settings:
+
+$BOWTIEDIR/bowtie zzz -v 2 -k 11 -m 10 -t --strata --best -f s1.query32.txt --un s1.unm.fa --max s1.max.fa s1.zzz.bowtie.txt
+
+where zzz is the genome prefix that you gave when building the
+genome. In particular, we ask bowtie to map all multireads up
+to 11 ("-k") with up to 2 mismatches ("-v" and "--best"), however
+we will only import all multireads up to 10x multiplicity ("-m").
+Note that bowtie is multithreaded and can use multiple CPUs based
+on the -p flag (e.g. use "-p 4" to use 4 CPUs). Unmapped reads
+are saved in s1.unm.fa (the "--un" file) for later analysis.
+
+Once reads are mapped, they can be imported using:
+
+python $ERANGEPATH/makerdsfrombowtie.py testLabel s1.mm9.bowtie.txt bowtietest.rds
+
+The options for the script are:
+
+python makerdsfrombowtie.py label infilename outrdsfile
+[-RNA ucscGeneModels] [-append] [-index] [propertyName::propertyValue]
+[-rawreadID] [-verbose] [-cache numPages]
+
+Refer to "MAPPING READS WITH ELAND" for a description of label,
+infilename, outrdsfile, '-append', '-index', and '-cache'.
+
+****REMEMBER TO USE -index WHEN LOADING THE LAST LANE OF YOUR
+DATASET.****
+
+The script assumes that the read IDs are from Illumina, i.e. that
+they have multiple fields separated by ':' and that paired-end
+reads have an additional '/1' or '/2' depending on the end.
+It will by default strip the first part of the readID (up to the
+first ':') and replace it with the label. If you want raw readIDs
+because you mapped raw reads that do not have an associated ID or
+an ID that doesn't follow Illumina's conventions, use -rawreadID.
+
+If not using Illumina readIDs, use any identifier of the format
+
+throw_away:uniqueid if unpaired
+throw_away:uniqueid/1 and throw_away:uniqueid/2 for paired-ends.
+
+For RNA-seq, you must in addition specify the path to knownGene.txt
+using the -RNA flag, e.g.
+
+python $ERANGEPATH/makerdsfrombowtie.py myRNAlabel myRNA.bowtie.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+
+
+7. MAPPING READS WITH BLAT
+
+BLAT SUPPORT IN ERANGE IS STILL UNDER DEVELOPMENT AND THE
+SCRIPTS AND SETTINGS BELOW MAY BE OPTIMIZED FURTHER IN
+FUTURE RELEASES OF ERANGE.
+
+Reads longer than 40-50bp can be fruitfully mapped with BLAT
+against the reference genome without needing to provide the
+exon junctions. While BLAT is much slower than BOWTIE, it
+has the great advantage of seeing novel splices (i.e.
+splices not present in knownGene models).
+
+We use the following settings to map 75bp reads with BLAT and
+filter them with pslReps:
+
+$BLATPATH/blat /tmp/hg18.fa s3_1.query75.txt -out=pslx s3_1.hg18.blat
+$BLATPATH/pslReps -minNearTopSize=70 s3_1.hg18.blat s3_1.hg18.blatbetter s3_1.blatpsr
+
+where the binaries are in $BLATPATH anywhere on your system.
+
+Once the reads have been filtered, the makerdsfromblat.py
+script is used to import the mapped reads (in the example
+above s3_1.hg18.blatbetter) into RDS:
+
+python makerdsfromblat.py label infilename outrdsfile [-append] [-index] [propertyName::propertyValue]
+[-rawreadID] [-forceRNA] [-flag] [-strict minSpliceLen] [-spliceonly] [-verbose] [-cache numPages]
+
+If you are using BLAT for RNA-seq, please be sure to use
+-forceRNA in order to import spliced reads and consider
+using -strict to require a minimum length of bases on
+each side of the splice.
+
+You can combine BOWTIE and BLAT by mapping reads with BOWTIE
+first, and then using BLAT to map the unmapped reads. In
+that case, you may want to only load the spliced reads
+using the -spliceonly flag. To track those reads in the RDS
+file, use -flag ; you can then retrieve those reads using
+the options "-flag blat -flagLike" with the makebedfromrds.py
+script.
+
+
+8. IMPORTING BED FILES
+
+If you do not have the raw read data, you can import unique
+reads only using the script makerdsfrombed.py . Note that
+this is not particularly useful for RNA-seq since you will
+have neither the multireads nor the spliced reads.
+
+The command line options are similar to those for other
+scripts described in part 5-7:
+
+python makerdsfrombed.py label bedfile outrdsfile [-append] [-index] [propertyName::propertyValue] [-cache numPages]
+
+
+9. COMBINING RDS FILES
+
+Previously created RDS files can be combined into a new RDS
+dataset using the combinerds.py command with the granularity
+of importing all tables or specific ones (e.g. uniqs, splices).
+
+The combinerds.py command options are:
+
+python combinerds.py destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [-init] [-initrna] [-index] [-cache pages]
+
+
+10. MANIPULATING RDS METADATA AND CACHING
+
+One of the advantages of RDS over bed is the possibility of
+attaching arbitrary sets of annotations to the data, which
+are then carried along. Both the makerds* scripts and
+rdsmetadata.py allow you to enter key::value
+combinations. Entering a key multiple times will cause the
+same instance to be recorded multiple times, which is
+appropriate in some settings (e.g. to enter flowcell info).
+In addition rdsmetadata.py allows you to inspect various
+attributes of your RDS files such as the number of reads and
+the default cache size.
+
+Sqlite files have a certain amount of RAM set aside as cache
+for lookups, indexes, etc.... where the amount is measured in
+1.5kb pages. Each RDS instance comes with a default of 100000
+pages (150MB) of cache, which is needlessly small in most
+situations. Whenever appropriate, try using more cache (e.g.
+750000 pages on a 2GB RAM machine, much more if more RAM is
+available) for a significant speed increase in indexing and
+lookups. You can change the default value for each RDS file
+by using the -defaultcache option of rdsmetadata.py.
+
+Note that sqlite can be very slow over NFS. Wherever
+possible, copy your RDS file locally before running an I/O
+intensive script.
+
+
+11. VISUALIZING THE DATA IN RDS FILES
+
+You can output bed-files of the raw reads using
+makebedfromrds.py. A more practical way to look at the data
+might be to output it as a bedGraph file using makewiggle.py .
+
+Note that UCSC has a hard limit on the size of their files
+and you will likely need to break the wiggles on a per-chromosome
+basis for mammalian genomes.
+
+RELEASE HISTORY
+
+version 3.2 October 2009 - added combinerds.py
+version 3.01 February 2009 - bug fixes
+version 3.0 January 2009 - added logging to buildrdsfrom*
+version 3.0rc1 December 2008 - added blat support
+
+
--- /dev/null
+This is an updated version of the core of the ChIP-seq
+analysis code described in Johnson et al (2007). It
+should run on any Unix-like system supporting python 2.5
+or better. The code is developed on Linux and MacOS X on
+python 2.5.
+
+These scripts in the ChIPSeqMini package are now part of
+the ERANGE package, but are still available as a
+standalone package for now.
+
+This code is made available as open-source, as described
+in the copyright file ERANGE.COPYRIGHT.
+
+
+1. REQUIREMENTS
+2. COMMAND LINE OPTIONS
+3. MAKING THE NECESSARY INPUT (RDS) FILES
+4. WEIGHING MULTIREADS
+5. RUNNING THE PEAK FINDER
+6. DISPLAYING DATA ONTO THE UCSC GENOME BROWSER
+7. DOWNSTREAM ANALYSES
+
+
+1. REQUIREMENTS
+
+1) Python 2.5 is required because some of the scripts and
+Cistematic (see below) need pysqlite, which is now bundled in
+Python.
+
+2) You will also need to use Cistematic 2.3 (available at
+cistematic.caltech.edu) for all of the scripts that are
+part of the downstream analyses.
+
+(optional) Use of the psyco module (psyco.sf.net) on 32-bit
+Linux or Mac Intel machines is highly recommended.
+
+(optional) Three visualization scripts also depend on the
+additional package pylab (matplotlib). These scripts are:
+- getgosig.py
+- plotbardist.py
+- scatterfields.py
+You do not need to install pylab if you will be
+visualizing some of your analysis results differently.
+
+
+2. COMMAND LINE OPTIONS
+
+You can find out more about the settings for each script
+by typing:
+
+python $ERANGEPATH/<scriptname>
+
+to see the command line options, where ERANGEPATH is the
+environmental variable set to the path to the directory
+holding the ERANGE scripts. Note that the command line
+options are case sensitive and that they could well
+fail silently.
+
+
+3. MAKING THE NECESSARY INPUT (RDS) FILES
+
+You will want to first convert your read mappings to the
+native ERANGE read store. Please see the file
+README.build-rds for instructions on how to do this.
+
+Build an RDS file for both the ChIP, and if available and
+appropriate, the control. Note that we *HIGHLY* recommend
+the use of a matched control sample to account for some
+of the general background artifacts that can be present
+in ChIP-seq samples (e.g. DNAse hypersensitivity,
+assembly collapse of some satellite repeats, etc....).
+
+
+4. WEIGHING MULTIREADS
+
+Version 3.0 of the peak finder can use multireads, i.e.
+reads that map equally well to more than one location
+in the genome, to find binding sites that are in low
+copy-number non-unique regions (typically less than 10).
+
+ERANGE offers 3 ways to analyze these regions:
+(a) default weighing of 1/multiplicity
+(b) ignoring multireads
+(c) weighing of multireads based on unique reads in a
+given radius
+
+(a) is the default in the current release of ERANGE.
+Simply proceed to RUNNING THE PEAK FINDER for (a) and
+(b). You can ignore multireads (b) by using the -nomulti
+flag with findall.py. For (c), use weighMultireads.py
+to weigh multireads based on the unique reads in the
+respective radius of each potential location. Once run,
+proceed to the section below.
+
+
+5. RUNNING THE PEAK FINDER
+
+To run the peak finder without read shifting, use the
+following command:
+
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -listPeak -revbackground
+
+which will run the peak finder on chip.rds / control.rds ,
+store the enriched region coordinates in chip.regions.txt,
+also store the actual local maximum in each region in the
+same file, and also calculate an FDR by running the
+finder on control.rds / chip.rds .
+
+A log file (findall.log by default, change with -log)
+tracks the settings used to run the program as well as
+some of the summary statistics, which are also stored
+at the bottom of the regions.txt output file.
+
+findall.py is tuned to conservative settings for 10-12M
+mappable read IPs of static, sequence-specific
+transcription factors in mammals with very short
+fragment sizes, on the order of 40-60 bp.
+
+You will *NEED* to change some of the default parameters
+if working in smaller genomes (e.g. use smaller -spacing),
+if working with certain types of IPs such as histones and
+polymerases (test with and without -notrim and
+-nodirectionality), if working with rather weak IPs
+(e.g. -minimum and -ratio), or if working with larger
+fragment sizes (see the paragraph below discussing read
+shifting).
+
+findall.py returns a per-peak p-value. By default, this
+is calculated using a Poisson distribution of peak RPMs
+(or counts, if using -raw) for each chromosome in the IP.
+P-value calculations can be turned off using
+'-pvalue none '. Alternatively, the p-value can be
+calculated from the background using the option
+'-pvalue back ', which must be combined with the option
+-revbackground.
+
+By default, findall.py does not try to adjust the location
+of the reads based on half the size of the expected fragment
+length (the "shift"). If you believe that you need to shift
+your peaks, findall.py can try to pick the best shift based
+on the best shift for strong sites using the parameter
+'-shift learn '. You can also either manually specify a
+shift value using '-shift #bp ' or you can calculate a
+"best shift" for each region using '-autoshift'. If you
+need to use the shift options, the recommended usage is:
+(i) first run findall.py with '-shift learn ', which will
+pick a shift if there are at least 30 regions that meet
+its training criteria.
+(ii) if (i) couldn't pick a shift, run findall.py with
+-autoshift and -reportshift
+(iii) look at the mode (most common #) for the shift
+(iv) rerun findall.py with -shift #bp where #bp is the mode
+
+If you are storing the RDS files on a network-mounted
+directory, make sure to use '-cache XXXXX' to enable
+local caching, where XXXXX is as large as appropriate as
+described in section 10 of README.build-rds .
+
+Note that ERANGE will cache by default to /tmp, but this
+can be redirected to any directory pointed to by the
+environmental variable CISTEMATIC_TEMP.
+
+To find out the current default settings and options,
+simply type:
+
+python $ERANGEPATH/findall.py
+
+for more information.
+
+
+6. DISPLAYING DATA ONTO THE UCSC GENOME BROWSER
+
+You can output bed-files of the raw reads using
+makebedfromrds.py and BEDGRAPH file using
+makewiggle.py as described in README.build-rds .
+
+You can create bed files of regions and sites (see
+below) using regiontobed.py and makesitetrack.py .
+
+
+7. DOWNSTREAM ANALYSES
+
+Recall that Cistematic 2.3 is required to do motif
+and gene-level analyses of the output of findall.py.
+
+Use getallgenes.py to find the nearest gene within a
+radius of each binding site.
+
+Use analyzego.py to do a Gene Ontology enrichment
+analysis of a gene list (such as from getallgenes.py).
+You can look at a heatmap of your GO enrichments using
+getgosig.py. You can also use getGOgenes.py to look at
+the genes with particular GO annotations.
+
+To do motif-finding, use getfasta.py to get the sequences
+centered on the peaks of your regions of interest. For
+the sake of a pleasant experience, try limiting yourself
+to less than 100kb of combined sequence (the easiest being
+by picking your regions with the strongest signals).
+
+Once you have a fasta file of the regions of interest, you
+can use findMotifs.py to find motifs using either
+cisGreedy (bundled with Cistematic 2.2) which is good for
+shorter motifs or Meme (must be installed separately -
+refer to the instructions on cistematic.caltech.edu for
+more information), which is better for longer motifs.
+findMotifs.py will return a set of motifs in Cistematic format
+with a .mot extension. These motifs can then be used with
+getallsites.py to get the coordinates and instances of each
+motif in all of the regions found by the peak finder.
+
+The sites can be checked against repeat-masker annotations
+(preloaded from UCSC with buildrmaskdb.py) using
+checkrmask.py. The sites for each motif can also be fed
+back into getallgenes.py to get genes, redo the GO analyses,
+etc....
+
+You can use the intersect scripts (intersects.py,
+gointersects.py, and siteintersects.py) to compare different
+sets of genes/GO/site results across multiple experiments,
+for example.
+
+
+RELEASE HISTORY
+
+version 3.1 February 2009 - support for read shifting
+version 3.0 February 2009 - support for UCSC narrowPeak format in regiontobed.py
+version 3.0rc1 December 2008 - added parameter to control peak-trimming
+version 3.0b2 December 2008 - added per-peak p-value
+version 3.0b November 2008 - initial release of RDS-based code
+with support for eland and bowtie.
+
--- /dev/null
+This is a description of the pipeline designed to analyze single
+nucleotide changes found in the mapped reads. The code should run
+on any Unix-like system supporting python 2.5 or better. The code
+is developed on MacOS X on python 2.5.
+
+1. COMMAND LINE OPTIONS
+2. BUILDING THE SNP DATABASE
+3. RUNNING THE SNP PIPELINE
+
+
+1. COMMAND LINE OPTIONS
+
+To find out more about the settings for each script, type:
+
+python $ERANGEPATH/<scriptname>
+
+to see the command line options. Note that all ERANGE command-line
+options are case-sensitive & that the scripts typically ignore
+command-line arguments that they do not recognize!
+
+
+2. BUILDING THE SNP DATABASE
+
+In order to check the candidate SNPs versus known SNPs, you will need
+to first download the corresponding dbSNP database file from UCSC and
+then build a sqlite version of it using:
+
+python $ERANGEPATH/buildsnpdb.py ucscSNPfile outdb
+
+e.g.
+
+python buildsnpdb.py snp128.txt dbSNP128
+
+
+3. RUNNING THE SNP PIPELINE
+
+The runSNPAnalysis.sh shell script is designed to retrieve SNPs, filter
+them against repeat annotations, cross-check them against known SNPs and
+annotate the novel SNPs. It will automatically run a set of python scripts
+that are required for the SNPs analysis using the RDS (Read DataSet) file.
+This script assumes the existence of a known SNP database as described in
+the previous section as well as of a repeatmask database.
+
+Usage: $ERANGEPATH/runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
+
+where ERANGEPATH is the environmental variable set to the path to the directory holding the ERANGE scripts.
+
+Parameters:
+- genome: the name of the organism in the analysis.
+- rdsfile: read DataSet file. See README.build-rds for
+more information.
+- label: the file name of your choice for the analysis.
+- rmaskdbfile: repeat mask database, a sqlite database file. See
+README.rna-seq for more information on creating the database.
+- dbsnpfile: dbsnp database, a sqlite database file, built from the
+dbSNP database text file from UCSC. Please see command line option
+for building dbsnp sqlite database using buildsnpdb.py .
+- uniqStartMin: the ratio of the number of unique reads supporting a
+SNP at base s and the maximum number of unique read coverage at base s .
+5 is a good number to start with.
+- totalRatio: the ratio of the number of reads supporting an
+expressed SNP at s and the total read coverage at s . 0.75 should allow
+you to get the homozygous SNPs.
+- rpkmfile: rpkm file can be generated using the RNA-seq pipeline as
+described in README.rna-seq. If you do not have that file, you can
+set it to NONE.
+- cachepages: cache pages. Make sure to use as much caching as your
+system will accommodate. See README.build-rds for more information.
+
+Example: $ERANGEPATH/runSNPAnalysis.sh mouse 24T4spike.rds 24Tspike rmask.db dbSNP128.db 5 0.75 c2c12rna.24R.final.rpkm 5000000
+
+version 3.0 January 2009 - logging
+version 3.0rc1 December 2008 - major rewrite and speed-up of getSNPs.py and chksnp.py
+version 3.0b2 December 2008 - bug fixes & ERANGEPATH variable
+
--- /dev/null
+The latest version of this software is available at
+
+http://woldlab.caltech.edu/rnaseq
+
+please check the website for updates.
+
+This is the core of the RNA-seq analysis code described in Mortazavi
+et al (2008). Please make sure that you have read Figure 3 and the
+methods / supplemental methods of that paper before attempting to
+use this package for RNA-Seq data analysis.
+
+ERANGE should run on any Unix-like system supporting python 2.5 or
+better. The code is developed on Linux and MacOS X on python 2.5.
+
+Historically, the code for ERANGE grew out of the ChIPSeqMini
+package from Johnson et al (2007), and some of the key scripts
+(findallnocontrol.py and getallgenes.py) are shared between the two.
+This is why ERANGE is "dual-use" and is also why the code for both
+analyses was kept in common as much as possible. This should be
+helpful when someone tries to combine ChIP-seq and RNA-seq
+analyses !
+
+This code is made available as open-source, as described in the
+copyright file ERANGE.COPYRIGHT.
+
+1. SETTING EXPECTATIONS
+2. REQUIREMENTS
+3. COMMAND LINE OPTIONS
+4. DISPLAYING DATA
+5. ANALYSIS
+6. PIPELINE
+7. CUSTOM CISTEMATIC GENOME ANNOTATIONS
+8. PAIRED-END RNA-SEQ ANALYSIS
+9. EXPRESSED SNP ANALYSIS
+
+1. SETTING EXPECTATIONS
+
+ERANGE is not a point-and-click, turn-key package.
+
+It is a set of python scripts that, when run in order as a pipeline
+on the "right" input, will take read data in RDS format and
+calculate gene expression levels in RPKM (Reads Per kb per Million
+reads). This pipeline for unpaired reads is embodied in a shell
+script called runStandardAnalysis.sh, which only takes a few inputs,
+described in the ANALYSIS and PIPELINE section below.
+
+You should be able to download the data from our website and run the
+analysis through the pipeline. You will need to map the reads and
+import them into an RDS dataset as described in README.build-rds.
+
+Because you will likely want to run this package on other genomes
+(or builds) than the one described in our original paper, you will
+need to do several additional steps, such as:
+
+- build expanded genomes with splices and spikes
+- check overlap of RNAFAR predictions with repeats
+
+This will require some comfort with running and, if necessary,
+editing scripts. While the code is sparsely documented, we are
+making it available so that you can *read it*. We'll be happy to
+help modify and update the code within a reasonable extent
+and will try to provide more in depth documentation and tutorials
+on our web site.
+
+While the scripts produce several forms of RPKM, we suggest that
+the "final" RPKM are the values that most people will be interested
+in.
+
+*WARNING* A couple of these scripts are pretty memory hungry. If
+you are going to analyze datasets with > 20M reads or reads with
+high error rates, you will easily need > 8 GB RAM. We'll rewrite
+these scripts before releasing 3.0 final to lower the memory
+footprint.
+
+2. REQUIREMENTS
+
+1) Python 2.5+ is required because some of the scripts and
+Cistematic (see below) need pysqlite, which is now bundled in
+Python.
+
+2) You will also need to use Cistematic 3.0 for some of the scripts
+marked below that use genes and genomic sequence; in particular, you
+will also likely need the Cistematic version of the genomes, unless
+providing your own custom genome and annotations.
+
+Cistematic is available at http://cistematic.caltech.edu
+
+3) You will need genomic sequences to build the expanded genome, as
+well as gene models from UCSC.
+
+(Optional) Python is very slow on large datasets. Use of the psyco
+module (psyco.sf.net) on 32-bit Linux or all Mac Intel machines to
+significantly speed up runtime is highly recommended.
+
+(Optional) Several of the plotting scripts also rely on Matplotlib,
+which is available at matplotlib.sf.net.
+
+
+3. COMMAND LINE OPTIONS
+
+You can find out more about the settings for each python script by
+typing:
+
+python $ERANGEPATH/<scriptname>
+
+to see the command line options, where ERANGEPATH is the
+environmental variable set to the path to the directory
+holding the ERANGE scripts.
+
+
+For example, if you wanted to know the command line options of the
+script used to generate supplementary datasets 2-4, combineRPKMs.py ,
+you would type:
+
+python $ERANGEPATH/combineRPKMs.py
+
+and get back a version number and all possible command line options:
+
+version 1.0
+usage: python $ERANGEPATH/combineRPKMs.py firstRPKM expandedRPKM finalRPKM combinedOutfile [-withmultifraction]
+
+where fields in brackets are optional.
+
+
+4. DISPLAYING DATA
+
+You can output bed-files of the raw reads in the RDS file
+using makebedfromrds.py and WIG file using makewiggle.py as
+described in README.build-rds .
+
+
+5. ANALYSIS
+
+The main steps of a typical, unpaired analysis using ERANGE
+are shown in RNA-seq.analysisSteps.txt, where each script
+would be run in order, with the caveat that there are two
+ways to do the candidate exon analysis (RNAFAR), creatively
+called "alternative 1" and "alternative 2".
+
+In alternative 1, we use reads that did not match an existing gene
+model to identify candidate regions:
+
+# Alternative 1: find new regions outside of gene models with reads piled up
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
+# use "none" if you don't have a repeatmask database
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+
+In alternative 2, we pool multiple RNA-seq datasets into a single
+RDS database, run it through the two scripts of alternative 1 above,
+and then use these precomputed candidates to count reads falling in
+these regions:
+
+# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
+python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good
+
+Alternative 1 is the one used by the pipeline script described below.
+
+The scripts will generate a set of intermediate files, the most
+interesting of which are the final RPKM values. These will be in the
+following files for the test example:
+
+test.firstpass.rpkm (the unique reads only)
+test.expanded.rpkm (the unique reads + spliced reads + RNAFAR)
+test.final.rpkm (uniques + spliced + RNAFAR + multireads)
+
+
+6. PIPELINE
+
+IF YOU ARE STORING THE RDS FILE ON A NETWORK-MOUNTED DIRECTORY,
+PLEASE ALSO READ SECTION 7.
+
+Most of the analysis steps described in the section above are
+automated in a pipeline shell script called runStandardAnalysis.sh .
+Note that the pipeline assumes that it will call its own RNAFAR
+regions, which is called "alternative 1" in the ANALYSIS section,
+which is a good starting point. You can modify the pipeline script
+to use alternative 2, if appropriate.
+
+The pipeline assumes that one RDS database containing the appropriate
+uniq, multi, and spliced reads exists as described in README.build-rds.
+
+We assume that Cistematic 2.3 is installed, including a version of
+the appropriate Cistematic genome. You will need to build your own
+Cistematic genome for any unsupported genome.
+
+We will also need a radius (e.g. 20000 bp) within which a candidate
+exon will be consolidated with an existing gene.
+
+For example, for the test.rds dataset from the ANALYSIS section, we
+would run the pipeline as:
+
+. $ERANGEPATH/runStandardAnalysis.sh mouse test ../mm9repeats/rmask.db 20001
+
+where ERANGEPATH is the environmental variable set to the path to
+the directory holding the ERANGE scripts. Remember that you can
+replace '../mm9repeats/rmask.db' with 'none' if you don't have a
+repeatmask database.
+
+This could run from an hour to a whole day depending on how many
+reads are involved (1M vs 80M) and how big a consolidation radius
+is used.
+
+
+7. CUSTOM CISTEMATIC GENOME ANNOTATIONS
+
+Cistematic 3.0 added support for generic genomes and loadable
+(or alternative) annotations. While this support is still
+experimental, the general idea is to take a GTF/GFF3 file,
+convert it into the format that cistematic expects using
+
+$ERANGEPATH/gfftocis.py infile.gff outfile.cis
+
+NOTE THAT YOU WILL MOST LIKELY HAVE TO EDIT THIS FILE TO
+ACCOMMODATE YOUR SPECIFIC GFF FORMAT TO THE CISTEMATIC
+FORMAT, WHICH IS
+
+geneID<tab>uniqRef<tab>chrom<tab>start<tab>stop<tab>sense<tab>type<return>
+
+where type is one of 'CDS','5UTR','3UTR'.
+
+You can then run the standard analysis script with the additional
+flag " -models outfile.cis ", e.g.
+
+. runStandardAnalysis.sh generic asteph none 1000 -models agambiae.base.cis
+
+Custom annotation support will be extended to other PIPELINE
+scripts as part of 3.2 final.
+
+
+8. PAIRED-END RNA-SEQ ANALYSIS
+
+We are now experimentally supporting paired-end RNA-seq, as
+implemented in the pipeline script runRNAPairedAnalysis.sh, which
+is only provided as a "work-in-progress" snapshot.
+
+This is done primarily by marking all of the reads that map in a
+known exon or a novel RNAFAR region in the RDS database, which
+is a slow and time-consuming step (and is off by default for
+single-ended RNA-seq). This mapping step is done without
+accounting for paired-end information.
+
+The paired-end information is then used to connect RNAFAR
+regions to known genes or to other RNAFAR regions using
+reads with one end in a given region and the other end
+in different (known or novel) region, as implemented in
+rnafarPairs.py ; note that there is currently a default
+limit of 500000 bp maximum distance between the two pairs.
+
+
+9. EXPRESSED SNP ANALYSIS
+
+ERANGE3 now supports SNP analysis in RNA-seq data as described
+in README.rna-esnp .
+
+RELEASE HISTORY
+
+version 3.2 December 2009 - support for custom genome annotations with Cistematic 3.0
+version 3.1 April 2009 - modified normalizeFinalExonic.py to remove genome
+version 3.0 January 2009 - added logging to shell pipelines
+version 3.0rc1 December 2008 - added blat support
+version 3.0b2 December 2008 - bug fixes & ERANGEPATH variable
+version 3.0b November 2008 - Support for paired end analysis
+version 3.0a October 2008 - Preview release of ERANGE3.0
+version 2.0 May 2008 - First public release of ERANGE
+
--- /dev/null
+This is a description of the pipeline designed to do scaffolding
+of fragmented genomes using RNA-seq. The code should run
+on any Unix-like system supporting python 2.6 or better. The code
+is developed on MacOS X on python 2.6.
+
+Note that RNAPATH is not currently optimized for running on machines with
+small or medium amounts of RAM. 32 Gb minimum is recommended for the current
+version.
+
+1. COMMAND LINE OPTIONS
+2. MAPPING THE READS AND BUILDING THE RDS FILES
+3. GETTING THE SCAFFOLDING READS
+4. RUNNING RNAPATH.py
+
+
+1. COMMAND LINE OPTIONS
+
+To find out more about the settings for each script, type:
+
+python $ERANGEPATH/<scriptname>
+
+to see the command line options. Note that all ERANGE command-line
+options are case-sensitive & that the scripts typically ignore
+command-line arguments that they do not recognize!
+
+
+2. MAPPING THE READS AND BUILDING THE RDS FILES
+
+Before running the RNAPATH script on a genome (assumed to be in fasta format),
+you will need to first map the RNA-seq reads using BLAT and import those reads
+into an RDS file, as described in README.build-rds .
+
+3. GETTING THE SCAFFOLDING READS
+
+Once you have an indexed RDS file, use the script distalPairs.py to output
+the list of paired reads that do not map to the same contig. This involves
+specifying a distance to distalPairs.py that is greater than the length of the
+largest existing genomic contig. For example:
+
+python ../commoncode/distalPairs.py 20000 rna_on_genomic.rds rna_on_genomic.crosspairs -splices -cache 20000000
+
+4. RUNNING RNAPATH.py
+
+You can now run RNAPATH.py. I suggest optionally using the included script processvelvet.py to rename the contigs, before running blat and generating the crosspair data.
+
+Example: $ERANGEPATH/rnapath/RNAPATH.py genomic_contigs.fa rna_on_genomic.crosspairs RNAPATH.log genome.RNAPATH.fa
+
+version 3.2 May 2010 - first release
+
--- /dev/null
+# analysis steps for an ERANGE analysis of RNA-seq data
+# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
+
+# preliminary: set PYTHONPATH to point to the parent directory of the Cistematic, e.g.
+# export PYTHONPATH=/my/path/to/cistematic
+#
+# preliminary: set CISTEMATIC_ROOT to the directory that contains the genome directories (such as H_sapiens or M_musculus), e.g.
+# export CISTEMATIC_ROOT=/my/path/to/cistematic_genomes
+#
+# preliminary: set ERANGEPATH, e.g.
+# export ERANGEPATH=/proj/genome/experiments/commoncode
+#
+# preliminary: set CISTEMATIC_TEMP to a local directory with ample space (default is /tmp), e.g.
+# export CISTEMATIC_TEMP=/any/local/dir
+#
+# preliminary: create splice file using getsplicefa.py with maxBorder set to 4 bp shorter than the read length, e.g.
+# python $ERANGEPATH/getsplicefa.py hsapiens /my/path/to/human/knownGene.txt hg18splice32.fa 28
+#
+# preliminary: build expanded genome using Eland's squashGenome or Bowtie's bowtie-build (see README.build-rds)
+# a slower alternative is to use blat just on the genome.
+#
+# preliminary: build repeatmask database using buildrmaskdb.py, e.g.
+# python $ERANGEPATH/buildrmaskdb.py /path/to/hg19repeats /path/to/hg18repeats/rmask.db
+#              if you don't have a repeatmask database, just use "none" for the rmask database below
+
+# run bowtie on expanded genome or just blat on the regular genome
+# as described in README.build-rds
+#
+
+# create rds file with one lane's worth of data (add -index if using only one lane)
+# The example below sets the default cache to 1000000
+# The name::value pairs are optional documentary metadata, and can be set to any desired name or value
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX
+
+# can change a database cache size using rdsmetadata.py to speed up indexing and index-based lookups
+# rule of thumb for RNA-seq: set the cache size to half of the RAM on the computer
+#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds -defaultcache 2000000 -nocount
+
+# append more data (only add -index when adding last lane)
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 -append -index
+
+# count the unique reads falling on the gene models ; the nomatch files are
+# mappable reads that fell outside of the Cistematic gene models and not the
+# unmappable of Eland (i.e, the "NM" reads)
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count -markGID -cache 1
+
+# count splice reads
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count -splices -noUniqs -cache 1
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm -cache
+
+# recount the unique reads with weights calculated during the first pass
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount -uniq -cache 1
+
+# There is a choice of either identifying new regions from the data alone
+# (Alternative 1), or using a pre-computed list of new regions (presumably
+# pooled from multiple nomatch.bed files, or literature) against the nomatch.bed
+# file (Alternative 2)
+
+# Alternative 1: find new regions outside of gene models with reads piled up
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
+# use "none" if you don't have a repeatmask database
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+
+# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
+#python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good
+
+# map all candidate regions that are within a 20kb radius of a gene in bp
+# take out -cache if running locally
+python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 -radius 20001 -trackfar -cache
+
+# calculate expanded exonic read density
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm -cache
+
+# create bed file of accepted candidate regions
+python2.5 $ERANGEPATH/regiontobed.py RNAFAR LHCN10213.accepted.rpkm RNAFAR.bed 255,0,0
+
+# weigh multi-reads
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count -accept LHCN10213.accepted.rpkm -multi -cache 1
+
+# calculate final exonic read density
+python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm -multifraction -withGID -cache
+
--- /dev/null
+#!/bin/bash
+#
+# buildMatrix.sh
+#
+# Builds <name>.matrix.tab by folding the <prefix>.partcount file of every
+# entry in the datalist into a running matrix, one column per dataset.
+#
+# usage: buildMatrix.sh name datalist.file [truncateRPKM] [-rescale]
+#   $1  name used for the intermediate <name>.stepN files and the final matrix
+#   $2  comma-delimited datalist; the first field of each line is the prefix
+#   $3  optional value passed to buildMatrix.py as "-truncate <value>"
+#   $4  optional; when a fourth argument is present "-rescale" is also passed
+echo 'buildMatrix.sh: version 1.1'
+
+# indexes of the previous and current intermediate step files
+indexPrev=0
+indexCur=0
+
+truncateRPKM=""
+if [ $# -eq 3 ]; then
+    truncateRPKM="-truncate "$3
+fi
+
+if [ $# -eq 4 ]; then
+    truncateRPKM="-rescale -truncate "$3
+fi
+
+if [ $# -lt 2 ]; then
+    echo 'usage: buildMatrix.sh name datalist.file [truncateRPKM] [-rescale]'
+    echo
+    echo 'where the datalist file is a comma-delimited list of prefix and rds-files'
+    echo
+else
+    python $ERANGEPATH/recordLog.py buildMatrix.log buildMatrix.sh "with parameters: $1 $2 $truncateRPKM"
+    while read line
+    do
+        prefix=`echo $line | cut -f 1 -d ','`
+        filename=$prefix.partcount
+        if [ -e $filename ]; then
+            # the first existing partcount file seeds step0 with a header
+            # line followed by the first column of that file
+            if [ $indexCur -lt 1 ]; then
+                echo "building $1.step0"
+                echo -e '\t' > $1.step0
+                cut -f 1 $filename >> $1.step0
+                indexCur=1
+            fi
+            python $ERANGEPATH/buildMatrix.py $1.step$indexPrev $filename $1.step$indexCur $truncateRPKM
+            rm $1.step$indexPrev
+            let indexPrev=indexPrev+1
+            let indexCur=indexCur+1
+        else
+            echo "could not find $filename - skipping"
+            # log the file that is actually missing; this script never sets $rds
+            python $ERANGEPATH/recordLog.py buildMatrix.log buildMatrix.sh "could not find $filename - skipping"
+        fi
+    done < $2
+    mv $1.step$indexPrev $1.matrix.tab
+fi
--- /dev/null
+# an example shell script to combine multiple region calls into one partition
+#
+
+if [ -z "$1" ]; then
+ PARTNAME=comb
+else
+ PARTNAME=$1
+fi
+
+if [ -z "$2" ]; then
+ MINSIZE=400
+else
+ MINSIZE=$2
+fi
+
+N=0
+if [ $# -lt 2 ]; then
+ echo 'usage: partition.sh name minSize datalist.file'
+ echo
+ echo 'where the datalist file is a list of region files'
+ echo
+else
+ while read line
+ do
+ if [ $N -lt 1 ]; then
+ FILELIST=''
+ else
+ FILELIST=$FILELIST,
+ fi
+ FILELIST=$FILELIST$line
+ let N=N+1
+ done < $3
+ python $ERANGEPATH/partition.py $N.way $FILELIST $PARTNAME$N.part -minFeature $MINSIZE -nomerge -locid -norandom
+fi
--- /dev/null
+#!/bin/bash
+echo 'regionCounts.sh: version 1.0'
+
+cachepages=""
+if [ $# -eq 3 ]; then
+ cachepages="-cache "$3
+fi
+
+if [ $# -lt 2 ]; then
+ echo 'usage: regionCounts.sh partitionfile datalist.file [cachevalue]'
+ echo
+ echo 'where the datalist file is a comma-delimited list of prefix and rds-files'
+ echo
+else
+ arguments=$1' '$2' '$cachepages
+ python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "with parameters: $arguments"
+ while read line
+ do
+ prefix=`echo $line | cut -f 1 -d ','`
+ rds=`echo $line | cut -f 2 -d ','`
+ if [ -e $rds ]; then
+ python $ERANGEPATH/regionCounts.py $1 $rds $prefix.partcount -force -nomerge -rpkm $cachepages
+ else
+ echo "could not find $rds - skipping"
+ python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "could not find $rds - skipping"
+ fi
+ done < $2
+fi
--- /dev/null
+#!/bin/bash
+#
+# runRNAPairedAnalysis.sh
+# ENRAGE
+#
+# example: . ../commoncode/runRNAPairedAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db
+#
+# assuming that we have rds database with the prefix c2c12rna.24R and that an RNAFAR analysis has already been run.
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+ ERANGEPATH='../commoncode'
+fi
+
+echo 'runRNAPairedAnalysis.sh: version 3.7'
+
+models=""
+if [ $# -eq 5 ]; then
+ models=" -models "$5
+fi
+
+replacemodels=""
+if [ $# -eq 6 ]; then
+ replacemodels=" -models $5 -replacemodels "
+fi
+
+if [ -z "$1" ]
+then
+ echo
+ echo 'usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [-replacemodels]'
+ echo
+ echo 'where rdsprefix is the name of the rds file without the .rds extension'
+ echo 'use "none" for the repeatmaskdb if you do not have one'
+ echo
+else
+
+# log the parameters
+arguments=$1' '$2' '$3' '$models' '$5
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py rna.log runRNAPairedAnalysis.sh "with parameters: $arguments"
+
+# count the unique reads falling on the gene models ; the nomatch files are
+# mappable reads that fell outside of the Cistematic gene models and not the
+# unmappable of Eland (i.e, the "NM" reads)
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels
+
+# recount the unique reads with weights calculated during the first pass
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels
+
+# count splice reads
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -markGID -cache 1 $models $replacemodels
+
+# find new regions outside of gene models with reads piled up
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# filter out new regions that overlap repeats more than a certain fraction
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.checked -startField 1 -log rna.log -cache 1
+
+# calculate the read densities
+python $ERANGEPATH/regionCounts.py $2.newregions.checked $2.rds $2.newregions.good -markRDS -cache -log rna.log
+
+# map all candidate regions that have paired ends overlapping with known genes
+python $ERANGEPATH/rnafarPairs.py $1 $2.newregions.good $2.rds $2.candidates.txt -cache $models $replacemodels
+
+# calculate expanded exonic read density
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels
+
+# weigh multi-reads
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels
+
+# calculate final exonic read density
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+
+fi
--- /dev/null
+#!/bin/bash
+#
+# runSNPAnalysis.sh
+#
+# Usages: $ERANGEPATH/runSNPAnalysis.sh mouse rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
+# Example: /getSNPs.sh mouse /woldlab/trog/sdc/alim/24T4spike_10212/24T4spike.rds 24Tspike /woldlab/trog/data1/wlee/db/rmask.db /woldlab/trog/data1/wlee/db/dbSNP128.db 5 0.75 ~/proj/c2c12rna24R/c2c12rna.24R.final.rpkm 5000000
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+    ERANGEPATH='../commoncode'
+fi
+
+echo 'runSNPAnalysis.sh: version 3.1'
+
+# The 9th argument is the cache size. Honour it whenever it is present,
+# including when a 10th argument (which switches on -nosplices) follows it;
+# an exact "-eq 9" test silently dropped the cache setting in that case.
+cachepages=""
+if [ $# -ge 9 ]; then
+    cachepages="-cache "$9
+fi
+
+# a 10th argument of any value adds -nosplices to the getSNPs.py call
+nosplices=""
+if [ $# -eq 10 ]; then
+    nosplices=" -nosplices "
+fi
+
+if [ $# -lt 8 ]; then
+    echo 'runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]'
+    echo 'where for each position S:'
+    echo '     uniqStartMin = # independent reads supporting base change at S'
+    echo '     totalRatio = total # reads supporting base change at S / total # reads that pass through S'
+else
+# log the parameters
+arguments=$1' '$2' '$3' '$4' '$5' '$6' '$7' '$8' '$cachepages$nosplices
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py snp.log runSNPAnalysis.sh "with parameters: $arguments"
+
+# get all SNPs by extracting it from the RDS
+python $ERANGEPATH/getSNPs.py $2 $6 $7 $3.snps.txt -enforceChr $cachepages $nosplices
+
+# get SNPs in non-repeat regions only
+python $ERANGEPATH/chkSNPrmask.py $4 $3.snps.txt $3.nr_snps.txt $cachepages
+
+# Check to see if SNPs are found in dbSNP
+# if dbSNP128.db is not built yet, build it by running buildsnpdb.py - build snp database using the dbSNP database file downloaded from UCSC
+# usage: python2.5 buildsnpdb.py snpdbdir snpdbname
+# the database flat file must be in the snpdbdir directory
+# To build dbSNP database file, run the following command
+# python2.5 buildsnpdb.py snp128.txt dbSNP128
+
+# get dbSNP info for SNPs that are found in the dbSNP database
+python $ERANGEPATH/chksnp.py $5 $3.nr_snps.txt $3.nr_dbsnp.txt $cachepages
+
+# get gene info for the snps found in dbSNP
+python $ERANGEPATH/getSNPGeneInfo.py $1 $3.nr_dbsnp.txt $8 $3.nr_dbsnp_geneinfo.txt $cachepages
+
+# get gene info for snps that are not found in dbSNP
+python $ERANGEPATH/getNovelSNPs.py $1 $3.nr_dbsnp_geneinfo.txt $3.nr.final.txt
+
+# make bed file for displaying the snps on UCSC genome browser
+python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed
+fi
\ No newline at end of file
--- /dev/null
+#!/bin/bash
+#
+# runStandardAnalysis.sh
+# ENRAGE
+#
+# example: . $ERANGEPATH/runStandardAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db 20000
+#
+# assuming that we have rds database with the prefix c2c12rna.24R and that an RNAFAR analysis has already been run.
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+ ERANGEPATH='../commoncode'
+fi
+
+echo 'runStandardAnalysis.sh: version 4.2'
+
+models=""
+if [ $# -eq 5 ]; then
+ models=" -models "$5
+fi
+
+replacemodels=""
+if [ $# -eq 6 ]; then
+ replacemodels=" -models $5 -replacemodels "
+fi
+
+if [ -z "$1" ]
+then
+ echo
+ echo 'usage:runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [-replacemodels]'
+ echo
+ echo 'where rdsprefix is the name of the rds file without the .rds extension'
+ echo 'use "none" for the repeatmaskdb if you do not have one'
+ echo
+else
+
+# log the parameters
+arguments=$1' '$2' '$3' '$4' '$models' '$6
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py rna.log runStandardAnalysis.sh "with parameters: $arguments"
+
+# count the unique reads falling on the gene models ; the nomatch files are
+# mappable reads that fell outside of the Cistematic gene models and not the
+# unmappable of Eland (i.e, the "NM" reads)
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels
+
+# recount the unique reads with weights calculated during the first pass
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels
+
+# count splice reads
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1 $models $replacemodels
+
+# Alternative 1: find new regions outside of gene models with reads piled up
+echo "python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1"
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
+echo "python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -cache 1"
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -log rna.log -startField 1 -cache 1
+
+# map all candidate regions that are within a given radius of a gene in bp
+echo "python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache $models $replacemodels"
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache $models $replacemodels
+
+# make sure candidates.txt file exists
+echo "touch $2.candidates.txt"
+touch $2.candidates.txt
+
+# calculate expanded exonic read density
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels
+
+# weigh multi-reads
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels
+
+# calculate final exonic read density
+echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache"
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+
+fi
\ No newline at end of file
--- /dev/null
+#!/bin/bash
+#
+# runStrandedAnalysis.sh
+# ENRAGE
+#
+# example: . ../commoncode/runStrandedAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db 20000
+#
+# assuming that we have rds database with the prefix c2c12rna.24R.
+#
+# positional arguments (see the usage message below):
+#   $1  genome
+#   $2  rdsprefix (name of the rds file without the .rds extension)
+#   $3  repeatmaskdb, or "none"
+#   $4  bpradius, passed to getallgenes.py as -radius
+#
+# NOTE: the example above sources this script, so the variables it sets
+# (ERANGEPATH, arguments) remain defined in the calling shell.
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+    ERANGEPATH='../commoncode'
+fi
+
+echo 'runStrandedAnalysis.sh: version 4.1'
+
+# without a first argument only the usage message is printed
+if [ -z "$1" ]
+then
+    echo
+    echo 'usage:runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius'
+    echo
+    echo 'where rdsprefix is the name of the rds file without the .rds extension'
+    echo 'use "none" for the repeatmaskdb if you do not have one'
+    echo
+else
+
+# log the parameters
+arguments=$1' '$2' '$3' '$4
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py rna.log runStrandedAnalysis.sh "with parameters: $arguments"
+
+# count the unique reads falling on the gene models ; the nomatch files are
+# mappable reads that fell outside of the Cistematic gene models and not the
+# unmappable of Eland (i.e, the "NM" reads)
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -stranded -markGID -cache 1
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache
+
+# recount the unique reads with weights calculated during the first pass
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -stranded -uniq -cache 1
+
+# count splice reads
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -stranded -splices -noUniqs -cache 1
+
+# find new regions outside of gene models with reads piled up
+# (one pass per strand; the minus-strand pass appends to the same output file)
+python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter plus -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter minus -log rna.log -cache 1 -append
+
+# filter out new regions that overlap repeats more than a certain fraction
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -log rna.log -cache 1
+
+# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
+#python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
+#python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
+
+# map all candidate regions that are within a given radius of a gene in bp
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -stranded -cache
+
+# calculate expanded exonic read density
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache
+
+# weigh multi-reads
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -stranded -multi -cache 1
+
+# calculate final exonic read density
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+
+fi
--- /dev/null
+#
+# farPairs.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 7/13/10.
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, time
+import optparse
+from commoncode import readDataset
+
+print "%prog: version 1.3"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--verbose", action="store_true", dest="doVerbose")
+ parser.add_option("--minDist", type="int", dest="minDist")
+ parser.add_option("--maxDist", type="int", dest="maxDist")
+ parser.add_option("--minCount", type="int", dest="minCount")
+ parser.add_option("--label", dest="label")
+ parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None,
+ minDist=1000, maxDist=500000, minCount=2, label=None)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ print "\tIs both slow and takes up large amount of RAM"
+ sys.exit(1)
+
+ rdsfile = args[0]
+ outfilename = args[1]
+ outbedname = args[2]
+
+ farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+ options.cachePages, options.minDist, options.maxDist, options.minCount,
+ options.label)
+
+
+def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+ cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
+
+ doCache = False
+ if cachePages is not None:
+ doCache = True
+ else:
+ cachePages = 0
+
+ if label is None:
+ label = rdsfile
+
+ RDS = readDataset(rdsfile, verbose=True, cache=doCache)
+ rdsChromList = RDS.getChromosomes()
+
+ if doVerbose:
+ print time.ctime()
+
+ total = 0
+ outfile = open(outfilename, "w")
+ outbed = open(outbedname, "w")
+ outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+
+ readlen = RDS.getReadSize()
+ flagDict = {}
+ for chromosome in rdsChromList:
+ if doNotProcessChromosome(chromosome):
+ continue
+
+ print chromosome
+ uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+ if doVerbose:
+ print len(uniqDict), time.ctime()
+
+ for readID in uniqDict:
+ readList = uniqDict[readID]
+ if len(readList) == 2:
+ total += 1
+ (start1, flag1, pair1) = readList[0]
+ (start2, flag2, pair2) = readList[1]
+
+ if flag1 != flag2:
+ dist = abs(start1 - start2)
+ startList = [start1, start2]
+ stopList = [start1 + readlen, start2 + readlen]
+ startList.sort()
+ stopList.sort()
+ if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
+ outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
+ outbed.write(outputLine)
+ if doVerbose:
+ print flag1, flag2, dist
+
+ try:
+ flagDict[flag1].append((flag2, start1, start2))
+ except KeyError:
+ flagDict[flag1] = [(flag2, start1, start2)]
+
+ try:
+ flagDict[flag2].append((flag1, start1, start2))
+ except KeyError:
+ flagDict[flag2] = [(flag2, start1, start2)]
+
+ print "%d connected regions" % len(flagDict)
+
+ for region in flagDict:
+ flagDict[region].sort()
+ regionConnections = {}
+ for (region2, start1, start2) in flagDict[region]:
+ try:
+ regionConnections[region2] += 1
+ except KeyError:
+ regionConnections[region2] = 1
+
+ for region2 in regionConnections:
+ if regionConnections[region2] >= minCount:
+ outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
+ if doVerbose:
+ print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+
+ outfile.close()
+ outbed.close()
+ if doVerbose:
+ print "finished: ", time.ctime()
+
+
+def doNotProcessChromosome(chrom):
+ return chrom == "chrM"
+
+
+def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
+ readSizes = "%d" % (stopList[0] - startList[0])
+ readCoords = "0"
+ leftStart = startList[0] - 1
+ rightStop = stopList[-1]
+ for index in range(1, numPieces):
+ readSizes += ",%d" % (stopList[index] - startList[index] + 1)
+ readCoords += ",%d" % (startList[index] - startList[0])
+
+ if rsense == "+":
+ senseCode = plusSense
+ else:
+ senseCode = minusSense
+
+ outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+ return outline
+
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# featureIntersects.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from cistematic.core import featuresIntersecting
+
+print "%prog: version 1.0"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s tabfile [--cistype type] [--radius radius]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--cistype", action="store_false", dest="cistype")
+ parser.add_option("--radius", type="int", dest="radius")
+ parser.set_defaults(cistype="TFBSCONSSITES", radius=100)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 1:
+ print usage
+ sys.exit(1)
+
+ tabfile = args[0]
+
+ featureIntersects(tabfile, options.cistype, options.radius)
+
+
+def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
+ tabfile = open(tabFileName)
+ previous = ""
+
+ posList = []
+ for line in tabfile:
+ fields = line.split("\t")
+ current = fields[0]
+ if previous == current:
+ continue
+
+ previous = current
+ chrom = fields[1][3:]
+ posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
+
+ feats = featuresIntersecting("human", posList, radius, cistype)
+ featkeys = feats.keys()
+ featkeys.sort()
+ for (chrom, pos) in featkeys:
+ print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# findMotifs.py
+# ENRAGE
+#
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, os, optparse
+from cistematic.experiments.fasta import Fasta
+from cistematic.programs.meme import Meme
+from cistematic.programs.cisGreedy import CisGreedy
+#TODO: cisSampler is not supported yet!
+#from cistematic.programs.cisSampler import CisSampler
+
+print "%prog: version 3.4"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog explabel regions.fsa [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--meme", action="store_true", dest="doMeme")
+ parser.add_option("--cisGreedy", action="store_true", dest="doCisGreedy")
+ parser.add_option("--logo", action="store_true", dest="saveLogo")
+ parser.add_option("--threshold", type="float", dest="threshold")
+ parser.add_option("--prefix", dest="motifPrefix")
+ parser.add_option("--numMotifs", dest="numMotifs")
+ parser.add_option("--maxWidth", type="int", dest="maxWidth")
+ parser.add_option("--maskLower", action="store_true", dest="maskLower")
+ parser.set_defaults(doMeme=False, doCisGreedy=False, saveLogo=False,
+ threshold=75., numMotifs="10", maxWidth=28, maskLower=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ print "\n\twhere at least one of the motif finders (meme or cisGreedy) must be specified\n"
+ sys.exit(1)
+
+ expbase = args[0]
+ fsafile = args[1]
+
+ doCisSampler = False
+ if "--cisSampler" in sys.argv:
+ print "cisSampler is not supported yet! avoid using it for now"
+ doCisSampler = True
+
+ findMotifs(expbase, fsafile, options.doMeme, options.doCisGreedy, options.saveLogo,
+ options.threshold, options.numMotifs, options.maxWidth, options.maskLower,
+ doCisSampler)
+
+
+def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75.,
+ numMotifs="10", maxWidth=28, maskLower=False, doCisSampler=False):
+
+ motifPrefix = expbase
+
+ #TODO: cisSampler is not supported yet!
+ #if doMeme or doCisGreedy or doCisSampler:
+ if not (doMeme or doCisGreedy):
+ print "error: must specify at least one motif finder - exiting"
+ sys.exit(1)
+
+ exp = Fasta(expbase, "%s.db" % expbase)
+
+ exp.initialize()
+ if maskLower:
+ exp.setMaskLowerCase(True)
+
+ if doMeme:
+ prog4 = Meme()
+ prog4.setMaxWidth(maxWidth)
+ prog4.setNumMotifs(numMotifs)
+ prog4.setModel("zoops")
+ exp.appendProgram(prog4)
+
+ if doCisGreedy:
+ prog5 = CisGreedy()
+ prog5.setGenExpOptions([])
+ prog5.setMaxWidth(maxWidth)
+ prog5.setNumMotifs(numMotifs)
+ exp.appendProgram(prog5)
+
+ #TODO: cisSampler is not supported yet!
+ #if doCisSampler:
+ # prog6 = CisSampler()
+ # prog6.setGenExpOptions([])
+ # prog6.setMaxWidth(maxWidth)
+ # prog6.setNumMotifs(numMotifs)
+ # exp.appendProgram(prog6)
+
+ exp.run(fsafile)
+ exp.createAnalysis()
+ exp.loadAnalysis()
+ exp.mapMotifs(threshold, verbose=False)
+ exp.exportMotifs(prefix = motifPrefix)
+ if saveLogo:
+ exp.exportLogos(prefix = motifPrefix)
+
+ exp.draw("%s.png" % expbase, maxOccurences=4000)
+ print "deleting database..."
+ del exp
+ os.remove("%s.db" % expbase)
+
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+"""
+ usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
+ [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+ [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
+ [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
+ [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
+ [--revbackground] [--pvalue self|back|none] [--nodirectionality]
+ [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
+ [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
+
+ where values in brackets are optional and label is an arbitrary string.
+
+ Use -ratio (default 4 fold) to set the minimum fold enrichment
+ over the control, -minimum (default 4) is the minimum number of reads
+ (RPM) within the region, and -spacing (default readlen) to set the maximum
+ distance between reads in the region. -listPeak lists the peak of the
+ region. Peaks must be higher than -minPeak (default 0.5 RPM).
+ Pvalues are calculated from the sample (change with -pvalue),
+ unless the -revbackground flag and a control RDS file are provided.
+
+ By default, all numbers and parameters are on a reads per
+ million (RPM) basis. -raw will treat all settings, ratios and reported
+ numbers as raw counts rather than RPM. Use -notrim to turn off region
+ trimming and -trimvalue to control trimming (default 10% of peak signal)
+
+ The peak finder uses minimal directionality information that can
+ be turned off with -nodirectionality ; the fraction of + strand reads
+ required to be to the left of the peak (default 0.3) can be set with
+ -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
+ of plus reads in a region (defaults 0.25 and 0.75, respectively).
+
+ Use -shift to shift reads either by half the expected
+ fragment length (default 0 bp) or '-shift learn ' to learn the shift
+ based on the first chromosome. If you prefer to learn the shift
+ manually, use -autoshift to calculate a per-region shift value, which
+ can be reported using -reportshift. -strandfilter should only be used
+ when explicitly calling unshifted stranded peaks from non-ChIP-seq
+ data such as directional RNA-seq. regionoutfile is written over by
+ default unless given the -append flag.
+"""
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import math
+import string
+import optparse
+from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
+
+
+# Version banner, printed when the module is loaded.  The same string is passed
+# to writeLog() and written to the "#ERANGE" header line of the output file.
+versionString = "%s: version 3.2" % sys.argv[0]
+print versionString
+
+def usage():
+ print __doc__
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = __doc__
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--control", dest="mockfile")
+ parser.add_option("--minimum", type="float", dest="minHits")
+ parser.add_option("--ratio", type="float", dest="minRatio")
+ parser.add_option("--spacing", type="int", dest="maxSpacing")
+ parser.add_option("--listPeak", action="store_true", dest="listPeak")
+ parser.add_option("--shift", dest="shift")
+ parser.add_option("--learnFold", type="float", dest="stringency")
+ parser.add_option("--noshift", action="store_true", dest="noShift")
+ parser.add_option("--autoshift", action="store_true", dest="autoshift")
+ parser.add_option("--reportshift", action="store_true", dest="reportshift")
+ parser.add_option("--nomulti", action="store_true", dest="noMulti")
+ parser.add_option("--minPlus", type="float", dest="minPlusRatio")
+ parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
+ parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
+ parser.add_option("--minPeak", type="float", dest="minPeak")
+ parser.add_option("--raw", action="store_false", dest="normalize")
+ parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
+ parser.add_option("--pvalue", dest="ptype")
+ parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
+ parser.add_option("--strandfilter", dest="strandfilter")
+ parser.add_option("--trimvalue", type="float", dest="trimValue")
+ parser.add_option("--notrim", action="store_false", dest="doTrim")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--log", dest="logfilename")
+ parser.add_option("--flag", dest="withFlag")
+ parser.add_option("--append", action="store_true", dest="doAppend")
+ parser.add_option("--RNA", action="store_true", dest="rnaSettings")
+ parser.add_option("--combine5p", action="store_true", dest="combine5p")
+ parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
+ stringency=4.0, noshift=False, autoshift=False, reportshift=False,
+ minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
+ normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
+ trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
+ cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
+ strandfilter=None, combine5p=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ usage()
+ sys.exit(2)
+
+ factor = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
+ options.stringency, options.noshift, options.autoshift, options.reportshift,
+ options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
+ options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
+ options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
+ options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+ options.strandfilter, options.combine5p)
+
+
+def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
+ stringency=4.0, noshift=False, autoshift=False, reportshift=False,
+ minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
+ normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
+ trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
+ cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
+ strandfilter=None, combine5p=False):
+
+ shiftValue = 0
+ if autoshift:
+ shiftValue = "auto"
+
+ if shift is not None:
+ try:
+ shiftValue = int(shift)
+ except ValueError:
+ if shift == "learn":
+ shiftValue = "learn"
+ print "Will try to learn shift"
+
+ if noshift:
+ shiftValue = 0
+
+ if trimValue is not None:
+ trimValue = float(trimValue) / 100.
+ trimString = "%2.1f%s" % ((100. * trimValue), "%")
+ else:
+ trimValue = 0.1
+ trimString = "10%"
+
+ if not doTrim:
+ trimString = "none"
+
+ if doRevBackground:
+ print "Swapping IP and background to calculate FDR"
+ pValueType = "back"
+
+ doControl = False
+ if mockfile is not None:
+ doControl = True
+
+ doPvalue = True
+ if ptype is not None:
+ ptype = ptype.upper()
+ if ptype == "NONE":
+ doPvalue = False
+ pValueType = "none"
+ p = 1
+ poissonmean = 0
+ elif ptype == "SELF":
+ pValueType = "self"
+ elif ptype == "BACK":
+ if doControl and doRevBackground:
+ pValueType = "back"
+ else:
+ print "must have a control dataset and -revbackground for pValue type 'back'"
+ else:
+ print "could not use pValue type : %s" % ptype
+ else:
+ pValueType = "self"
+
+ if cachePages is not None:
+ doCache = True
+ else:
+ doCache = False
+ cachePages = -1
+
+ if withFlag != "":
+ print "restrict to flag = %s" % withFlag
+
+ useMulti = True
+ if noMulti:
+ print "using unique reads only"
+ useMulti = False
+
+ if rnaSettings:
+ print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
+ shiftValue = 0
+ doTrim = False
+ doDirectionality = False
+
+ stranded = ""
+ if strandfilter is not None:
+ if strandfilter == "plus":
+ stranded = "+"
+ minPlusRatio = 0.9
+ maxPlusRatio = 1.0
+ print "only analyzing reads on the plus strand"
+ elif strandfilter == "minus":
+ stranded = "-"
+ minPlusRatio = 0.0
+ maxPlusRatio = 0.1
+ print "only analyzing reads on the minus strand"
+
+ stringency = max(stringency, 1.0)
+ writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+ if doControl:
+ print "\ncontrol:"
+ mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
+
+ if cachePages > mockRDS.getDefaultCacheSize():
+ mockRDS.setDBcache(cachePages)
+
+ print "\nsample:"
+ hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ readlen = hitRDS.getReadSize()
+ if rnaSettings:
+ maxSpacing = readlen
+
+ print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
+ print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
+ print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+
+ if cachePages > hitRDS.getDefaultCacheSize():
+ hitRDS.setDBcache(cachePages)
+
+ hitRDSsize = len(hitRDS) / 1000000.
+ if doControl:
+ mockRDSsize = len(mockRDS) / 1000000.
+
+ if normalize:
+ if doControl:
+ mockSampleSize = mockRDSsize
+
+ hitSampleSize = hitRDSsize
+
+ if doAppend:
+ outfile = open(outfilename, "a")
+ else:
+ outfile = open(outfilename, "w")
+
+ outfile.write("#ERANGE %s\n" % versionString)
+ if doControl:
+ outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
+ else:
+ outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
+
+ if withFlag != "":
+ outfile.write("#restrict to Flag = %s\n" % withFlag)
+
+ outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
+ outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
+ outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
+ if normalize:
+ print "Normalizing to RPM"
+ countLabel = "RPM"
+ else:
+ countLabel = "COUNT"
+
+ headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
+ if doDirectionality:
+ headerList.append("plus%\tleftPlus%")
+
+ if listPeak:
+ headerList.append("peakPos\tpeakHeight")
+
+ if reportshift:
+ headerList.append("readShift")
+
+ if doPvalue:
+ headerList.append("pValue")
+
+ headline = string.join(headerList, "\t")
+ print >> outfile, headline
+
+ statistics = {"index": 0,
+ "total": 0,
+ "mIndex": 0,
+ "mTotal": 0,
+ "failed": 0
+ }
+
+ if minRatio < minPeak:
+ minPeak = minRatio
+
+ hitChromList = hitRDS.getChromosomes()
+ if doControl:
+ mockChromList = mockRDS.getChromosomes()
+
+ hitChromList.sort()
+
+ for chromosome in hitChromList:
+ if doNotProcessChromosome(chromosome, doControl, mockChromList):
+ continue
+
+ print "chromosome %s" % (chromosome)
+ hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
+ maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+ if shiftValue == "learn":
+ shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
+ stringency, readlen, minHits, logfilename, outfile, outfilename)
+
+ regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
+ normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+ shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
+ noMulti, doControl, factor, trimValue, outputRegionList=True)
+
+ statistics["index"] += regionStats["index"]
+ statistics["total"] += regionStats["total"]
+ statistics["failed"] += regionStats["failed"]
+ if not doRevBackground:
+ if doPvalue:
+ p, poissonmean = calculatePValue(allRegionWeights)
+
+ print headline
+ shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+ continue
+
+ #now do background swapping the two samples around
+ print "calculating background..."
+ backgroundTrimValue = 1/20.
+ backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
+ normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+ shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
+ noMulti, doControl, factor, backgroundTrimValue)
+
+ statistics["mIndex"] += backgroundRegionStats["index"]
+ statistics["mTotal"] += backgroundRegionStats["total"]
+ statistics["failed"] += backgroundRegionStats["failed"]
+ print statistics["mIndex"], statistics["mTotal"]
+ if doPvalue:
+ if pValueType == "self":
+ p, poissonmean = calculatePValue(allRegionWeights)
+ else:
+ p, poissonmean = calculatePValue(backgroundRegionWeights)
+
+ print headline
+ shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+
+ footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
+ print footer
+ outfile.write(footer)
+ outfile.close()
+
+ writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
+
+
+def doNotProcessChromosome(chromosome, doControl, mockChromList):
+ skipChromosome = False
+ if chromosome == "chrM":
+ skipChromosome = True
+
+ if doControl and (chromosome not in mockChromList):
+ skipChromosome = True
+
+ return skipChromosome
+
+
+def calculatePValue(dataList):
+ dataList.sort()
+ listSize = float(len(dataList))
+ try:
+ poissonmean = sum(dataList) / listSize
+ except ZeroDivisionError:
+ poissonmean = 0
+
+ print "Poisson n=%d, p=%f" % (listSize, poissonmean)
+ p = math.exp(-poissonmean)
+
+ return p, poissonmean
+
+
+def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
+ stringency, readlen, minHits, logfilename, outfile, outfilename):
+
+ print "learning shift.... will need at least 30 training sites"
+ previousHit = -1 * maxSpacing
+ hitList = [-1]
+ weightList = [0]
+ readList = []
+ shiftDict = {}
+ count = 0
+ numStarts = 0
+ for (pos, sense, weight) in hitDict[chrom]:
+ if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
+ sumAll = sum(weightList)
+ if normalize:
+ sumAll /= hitSampleSize
+
+ regionStart = hitList[0]
+ regionStop = hitList[-1]
+ regionLength = regionStop - regionStart
+ # we're going to require stringent settings
+ if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
+ foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
+
+ if foldRatio >= minRatio:
+ localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+ try:
+ shiftDict[localshift] += 1
+ except KeyError:
+ shiftDict[localshift] = 1
+
+ count += 1
+
+ hitList = []
+ weightList = []
+ readList = []
+ numStarts = 0
+
+ if pos not in hitList:
+ numStarts += 1
+
+ hitList.append(pos)
+ weightList.append(weight)
+ readList.append((pos, sense, weight))
+ previousHit = pos
+
+ bestShift = 0
+ bestCount = 0
+ outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
+ print outline
+ writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+ if count < 30:
+ outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
+ print outline
+ writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+ shiftValue = 0
+ else:
+ for shift in sorted(shiftDict):
+ if shiftDict[shift] > bestCount:
+ bestShift = shift
+ bestCount = shiftDict[shift]
+
+ shiftValue = bestShift
+ print shiftDict
+
+ outline = "#picked shiftValue to be %d" % shiftValue
+ print outline
+ print >> outfile, outline
+ writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+
+ return shiftValue
+
+
+def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
+ if doControl:
+ foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
+ else:
+ foldRatio = minRatio
+
+ return foldRatio
+
+
+def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
+ numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+ if normalize:
+ numMock /= sampleSize
+
+ foldRatio = sumAll / numMock
+
+ return foldRatio
+
+
+def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
+ normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+ shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
+ noMulti, doControl, factor, trimValue, outputRegionList=False):
+ # Scan the reads of one chromosome of rds and collect candidate enriched regions.
+ #
+ # Reads come from rds.getReadsDict().  A candidate region is closed when the
+ # next read lies more than maxSpacing from the previous one or sits at the
+ # chromosome's maximum coordinate.  Candidates are filtered on summed weight
+ # (minHits), number of distinct start positions (compared against minRatio),
+ # length (readlen), fold ratio against referenceRDS, peak height (minPeak)
+ # and the plus-strand fractions.
+ #
+ # Returns (statistics, regionWeights), plus outregions as a third item when
+ # outputRegionList is true.  statistics has the keys "index" (regions
+ # accepted), "total" (their summed signal) and "failed" (regions rejected by
+ # the leftPlusRatio test).
+
+ index = 0
+ total = 0
+ failedCounter = 0
+ previousHit = - 1 * maxSpacing
+ currentHitList = [-1]
+ currentWeightList = [0]
+ currentReadList = []
+ regionWeights = []
+ outregions = []
+ numStarts = 0
+ hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
+ maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
+ for (pos, sense, weight) in hitDict[chrom]:
+ if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
+ sumAll = sum(currentWeightList)
+ if normalize:
+ sumAll /= rdsSampleSize
+
+ regionStart = currentHitList[0]
+ regionStop = currentHitList[-1]
+ # every closed candidate contributes its (truncated) signal, accepted or not
+ regionWeights.append(int(sumAll))
+ if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
+ sumMulti = 0.
+ #first pass uses getFoldRatio on mockRDS as there may not be control
+ # NOTE(review): the call below is getFoldRatioFromRDS, not getFoldRatio, so
+ # doControl is not consulted and referenceRDS.getCounts() is always used -
+ # confirm this works when no control dataset is supplied.
+ foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+ if foldRatio >= minRatio:
+ # first pass, with absolute numbers
+ if doDirectionality:
+ (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
+ else:
+ (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
+
+ bestPos = topPos[0]
+ peakScore = smoothArray[bestPos]
+ if normalize:
+ peakScore /= rdsSampleSize
+
+ if doTrim:
+ # move both ends inwards until the smoothed signal reaches trimValue * peak
+ # height, or the peak position itself is reached
+ minSignalThresh = trimValue * peakScore
+ start = 0
+ stop = regionStop - regionStart - 1
+ startFound = False
+ while not startFound:
+ if smoothArray[start] >= minSignalThresh or start == bestPos:
+ startFound = True
+ else:
+ start += 1
+
+ stopFound = False
+ while not stopFound:
+ if smoothArray[stop] >= minSignalThresh or stop == bestPos:
+ stopFound = True
+ else:
+ stop -= 1
+
+ regionStop = regionStart + stop
+ regionStart += start
+ try:
+ if doDirectionality:
+ (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
+ else:
+ (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
+ except:
+ # NOTE(review): presumably this continues the read loop, which would also
+ # skip the list reset and the append of the current read below - confirm.
+ continue
+
+ if normalize:
+ sumAll /= rdsSampleSize
+
+ foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+ if outputRegionList:
+ sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
+ # just in case it changed, use latest data
+ try:
+ bestPos = topPos[0]
+ peakScore = smoothArray[bestPos]
+ except:
+ continue
+
+ # normalize to RPM
+ if normalize:
+ peakScore /= rdsSampleSize
+
+ elif outputRegionList:
+ # without trimming: summed weight minus the number of weight-1.0 reads
+ sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
+
+ if outputRegionList:
+ # normalize to RPM
+ if normalize:
+ sumMulti /= rdsSampleSize
+
+ try:
+ multiP = 100. * (sumMulti / sumAll)
+ except:
+ # NOTE(review): break ends the scan of the whole chromosome on a failed
+ # division - confirm that skipping just this region was not intended.
+ break
+
+ if noMulti:
+ multiP = 0.
+
+ # check that we still pass threshold
+ if sumAll >= minHits and foldRatio >= minRatio and (regionStop - regionStart) > readlen:
+ plusRatio = float(numPlus)/numHits
+ if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
+ if outputRegionList:
+ peak = ""
+ if listPeak:
+ peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
+
+ if doDirectionality:
+ if leftPlusRatio < numLeft / numPlus:
+ index += 1
+ if outputRegionList:
+ plusP = plusRatio * 100.
+ leftP = 100. * numLeft / numPlus
+ # we have a region that passes all criteria
+ outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
+
+ total += sumAll
+ else:
+ failedCounter += 1
+ else:
+ # we have a region, but didn't check for directionality
+ index += 1
+ total += sumAll
+ if outputRegionList:
+ outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
+
+ currentHitList = []
+ currentWeightList = []
+ currentReadList = []
+ numStarts = 0
+
+ # count distinct start positions within the region being collected
+ if pos not in currentHitList:
+ numStarts += 1
+
+ currentHitList.append(pos)
+ currentWeightList.append(weight)
+ currentReadList.append((pos, sense, weight))
+ previousHit = pos
+
+ statistics = {"index": index,
+ "total": total,
+ "failed": failedCounter
+ }
+
+ if outputRegionList:
+ return statistics, regionWeights, outregions
+ else:
+ return statistics, regionWeights
+
+
+def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
+ bestShift = 0
+ shiftDict = {}
+ for region in outregions:
+ # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+ if doPvalue:
+ sumAll = int(region[5])
+ for i in xrange(sumAll):
+ pValue *= poissonmean
+ pValue /= i+1
+
+ if shiftValue == "auto" and reportshift:
+ try:
+ shiftDict[region[-1]] += 1
+ except KeyError:
+ shiftDict[region[-1]] = 1
+
+ try:
+ if reportshift:
+ outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
+ else:
+ outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
+ except:
+ if reportshift:
+ outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
+ else:
+ outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
+
+ if doPvalue:
+ outputList.append("%1.2g" % pValue)
+
+ outline = string.join(outputList, "\t")
+ print outline
+ print >> outfile, outline
+
+ if shiftValue == "auto" and reportshift:
+ bestCount = 0
+ for shift in sorted(shiftDict):
+ if shiftDict[shift] > bestCount:
+ bestShift = shift
+ bestCount = shiftDict[shift]
+
+ return bestShift
+
+
+def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
+ footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
+ if doDirectionality:
+ footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
+
+ if doRevBackground:
+ try:
+ percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
+ except (ValueError, ZeroDivisionError):
+ percent = 0.
+
+ footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
+
+ if shiftValue == "auto" and reportshift:
+ footerList.append("#mode of shift values: %d" % shiftModeValue)
+
+ footer = string.join(footerList, "\n")
+
+ return footer
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# fraction.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+from random import random
+import sys
+
+# Version banner; as a module-level statement it runs whenever the module is loaded.
+print "%s: version 1.0" % sys.argv[0]
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ if len(sys.argv) < 4:
+ print "usage: python %s fraction infile outfile" % sys.argv[0]
+ sys.exit(1)
+
+ fraction = float(sys.argv[1])
+ infile = sys.argv[2]
+ outfile = argv[3]
+
+ doFraction(fraction, infile, outfile)
+
+
+def doFraction(fraction, inFileName, outFileName):
+ infile = open(inFileName)
+ outfile = open(outFileName, "w")
+
+ totalIndex = 0
+ fractionIndex = 0
+ for line in infile:
+ totalIndex += 1
+ if random() <= fraction:
+ outfile.write(line)
+ fractionIndex += 1
+
+ infile.close()
+ outfile.close()
+
+ print "%d / %d = %.2f" % (fractionIndex, totalIndex, float(fractionIndex) / totalIndex)
+
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# geneDownstreamBins.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+# originally from version 1.3 of geneDnaDownstreamCounts.py
+import sys, optparse
+from commoncode import readDataset
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 2.0"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--max", type="int", dest="standardMinDist",
+ help="maximum region in bp")
+ parser.set_defaults(standardMinDist=3000)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
+
+
+def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
+ bins = 10
+ standardMinThresh = standardMinDist / bins
+
+ hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ normalizationFactor = 1.0
+ if normalize:
+ hitDictSize = len(hitRDS)
+ normalizationFactor = hitDictSize / 1000000.
+
+ hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+ hg = Genome(genome)
+ idb = geneinfoDB(cache=True)
+
+ geneinfoDict = idb.getallGeneInfo(genome)
+ featuresDict = hg.getallGeneFeatures()
+
+ outfile = open(outfilename, "w")
+
+ gidList = hg.allGIDs()
+ gidList.sort()
+ for gid in gidList:
+ symbol = "LOC" + gid
+ geneinfo = ""
+ featureList = []
+ try:
+ geneinfo = geneinfoDict[gid]
+ featureList = featuresDict[gid]
+ symbol = geneinfo[0][0]
+ except:
+ print gid
+
+ if len(featureList) == 0:
+ continue
+
+ newfeatureList = []
+ for (ftype, chrom, start, stop, fsense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+
+ if chrom not in hitDict:
+ continue
+
+ newfeatureList.sort()
+ if len(newfeatureList) < 1:
+ continue
+
+ glen = standardMinDist
+ if fsense == "F":
+ nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
+ if nextGene < glen * 2:
+ glen = nextGene / 2
+
+ if glen < 1:
+ glen = 1
+
+ gstart = newfeatureList[-1][1]
+ else:
+ nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
+ if nextGene < glen * 2:
+ glen = nextGene / 2
+
+ if glen < 1:
+ glen = 1
+
+ gstart = newfeatureList[0][0] - glen
+ if gstart < 0:
+ gstart = 0
+
+ tagCount = 0
+ if glen < standardMinDist:
+ continue
+
+ binList = [0.] * bins
+ for (tagStart, sense, weight) in hitDict[chrom]:
+ tagStart -= gstart
+ if tagStart >= glen:
+ break
+
+ if tagStart > 0:
+ tagCount += weight
+ if fsense == "F":
+ # we are relying on python's integer division quirk
+ binID = tagStart / standardMinThresh
+ binList[binID] += weight
+ else:
+ rdist = glen - tagStart
+ binID = rdist / standardMinThresh
+ binList[binID] += weight
+
+ if tagCount < 2:
+ continue
+
+ tagCount *= normalizationFactor
+ print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
+ outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
+ for binAmount in binList:
+ outfile.write("\t%.2f" % binAmount)
+
+ outfile.write("\n")
+
+ outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# geneLocusBins.py
+# ENRAGE
+#
+
+# originally from version 1.3 of geneDownstreamBins.py
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print '%s: version 2.1' % sys.argv[0]
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--bins", type="int", dest="bins",
+ help="number of bins to use [default: 10]")
+ parser.add_option("--flank", type="int", dest="flankBP",
+ help="number of flanking BP on both upstream and downstream [default: 0]")
+ parser.add_option("--upstream", type="int", dest="upstreamBP",
+ help="number of upstream flanking BP [default: 0]")
+ parser.add_option("--downstream", type="int", dest="downstreamBP",
+ help="number of downstream flanking BP [default: 0]")
+ parser.add_option("--nocds", action="store_false", dest="doCDS",
+ help="do not CDS")
+ parser.add_option("--raw", action="store_false", dest="normalizeBins",
+ help="do not normalize results")
+ parser.add_option("--force", action="store_false", dest="limitNeighbor",
+ help="limit neighbor region")
+ parser.add_option("--regions", dest="acceptfile")
+ parser.add_option("--cache", action="store_true", dest="doCache",
+ help="use cache")
+ parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ upstreamBp = 0
+ downstreamBp = 0
+ doFlank = False
+ if options.flankBP is not None:
+ upstreamBp = options.flankBP
+ downstreamBp = options.flankBP
+ doFlank = True
+
+ if options.upstreamBP is not None:
+ upstreamBp = options.upstreamBP
+ doFlank = True
+
+ if options.downstreamBP is not None:
+ downstreamBp = options.downstreamBP
+ doFlank = True
+
+ geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
+
+
+def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None):
+ if acceptfile is None:
+ acceptDict = {}
+ else:
+ acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+ hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ readlen = hitRDS.getReadSize()
+ normalizationFactor = 1.0
+ if normalizeBins:
+ totalCount = len(hitRDS)
+ normalizationFactor = totalCount / 1000000.
+
+ hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+ hg = Genome(genome)
+ idb = geneinfoDB(cache=doCache)
+
+ geneinfoDict = idb.getallGeneInfo(genome)
+ if doFlank:
+ locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
+ else:
+ locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
+
+ gidList = hg.allGIDs()
+ gidList.sort()
+ for chrom in acceptDict:
+ for (label, start, stop, length) in acceptDict[chrom]:
+ if label not in gidList:
+ gidList.append(label)
+
+ (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+
+ outfile = open(outfilename,'w')
+
+ for gid in gidList:
+ if 'FAR' not in gid:
+ symbol = 'LOC' + gid
+ geneinfo = ''
+ try:
+ geneinfo = geneinfoDict[gid]
+ symbol = geneinfo[0][0]
+ except:
+ pass
+ else:
+ symbol = gid
+ if gid in gidBins and gid in gidLen:
+ tagCount = 0.
+ for binAmount in gidBins[gid]:
+ tagCount += binAmount
+ outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+ for binAmount in gidBins[gid]:
+ if normalizeBins:
+ if tagCount == 0:
+ tagCount = 1
+ outfile.write('\t%.1f' % (100. * binAmount / tagCount))
+ else:
+ outfile.write('\t%.1f' % binAmount)
+ outfile.write('\n')
+ outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# geneLocusCounts.py
+# ENRAGE
+#
+""" usage: python geneLocusCounts genome readDB outfilename [upstream] [downstream] [--noCDS] [--spanTSS] [--locusLength bplength] [--regions acceptfile] [--noUniqs] [--multi] [--splices]
+ where upstream and downstream are in bp and are optional.
+ Using --noCDS requires either upstream or downstream (but not both)
+ to be nonzero. Using --locusLength will report the first bplength
+ or the last bplength of the gene region depending on whether it
+ is positive or negative.
+ By default only the uniq reads are counted (use --noUniqs to turn off),
+ but multi and splice reads can also be counted given the appropriate flags.
+"""
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print '%s: version 3.0' % sys.argv[0]
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome readDB outfilename [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
+ help="do not count unique reads")
+ parser.add_option("--multi", action="store_true", dest="doUniqs",
+ help="count multi reads")
+ parser.add_option("--splices", action="store_true", dest="doUniqs",
+ help="count splice reads")
+ parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
+ parser.add_option("--regions", dest="acceptfile")
+ parser.add_option("--noCDS", action="store_false", dest="useCDS")
+ parser.add_option("--locusLength", type="int", dest="bplength",
+ help="number of bases to report")
+ parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print __doc__
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ upstream = 0
+ downstream = 0
+ try:
+ upstream = int(args[3])
+ except ValueError:
+ pass
+ except IndexError:
+ pass
+
+ try:
+ if "-" not in args[3]:
+ downstream = int(args[4])
+ except ValueError:
+ pass
+
+ geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
+
+
+def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
+ print 'returning only up to %d bp from gene locus' % bplength
+ print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
+
+ if acceptfile:
+ acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+ hitRDS = readDataset(hitfile, verbose = True)
+
+ totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+ hg = Genome(genome)
+ idb = geneinfoDB(cache=True)
+
+ gidCount = {}
+ gidList = []
+ gidLen = {}
+ geneinfoDict = idb.getallGeneInfo(genome)
+ locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
+
+ locusChroms = locusByChromDict.keys()
+ chromList = hitRDS.getChromosomes(fullChrom=False)
+ chromList.sort()
+ for chrom in chromList:
+ if chrom == 'M' or chrom not in locusChroms:
+ continue
+
+ print 'chr' + chrom
+ fullchrom = 'chr' + chrom
+ hitRDS.memSync(fullchrom, index=True)
+ for (start, stop, gid, length) in locusByChromDict[chrom]:
+ if gid not in gidList:
+ gidList.append(gid)
+ gidCount[gid] = 0
+ gidLen[gid] = length
+
+ gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+ outfile = open(outfilename,'w')
+
+ totalCount /= 1000000.
+
+ outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
+ gidList.sort()
+ for gid in gidList:
+ if 'FAR' not in gid:
+ symbol = 'LOC' + gid
+ geneinfo = ''
+ try:
+ geneinfo = geneinfoDict[gid]
+ symbol = geneinfo[0][0]
+ except:
+ pass
+ else:
+ symbol = gid
+
+ if gid in gidCount and gid in gidLen:
+ rpm = gidCount[gid] / totalCount
+ rpkm = 1000. * rpm / gidLen[gid]
+ outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
+
+ outfile.close()
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# geneLocusPeaks.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+from commoncode import readDataset, getMergedRegions, findPeak, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+import sys, optparse
+
+print "%prog: version 2.0"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--up", type="int", dest="upstream")
+ parser.add_option("--down", type="int", dest="downstream")
+ parser.add_option("--regions", dest="acceptfile")
+ parser.add_option("--raw", action="store_false", dest="normalize")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.set_defaults(upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ print "\twhere upstream and downstream are in bp and and optional"
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
+
+
+def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
+ acceptDict = {}
+
+ if acceptfile:
+ acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+ print "upstream = %d downstream = %d" % (upstream, downstream)
+
+ hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ readlen = hitRDS.getReadSize()
+ normalizationFactor = 1.0
+ if normalize:
+ totalCount = len(hitRDS)
+ normalizationFactor = totalCount / 1000000.
+
+ hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+ hg = Genome(genome)
+ idb = geneinfoDB(cache=True)
+
+ gidCount = {}
+ gidPos = {}
+ geneinfoDict = idb.getallGeneInfo(genome)
+ locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
+
+ gidList = hg.allGIDs()
+ gidList.sort()
+ for chrom in acceptDict:
+ for (label, start, stop, length) in acceptDict[chrom]:
+ if label not in gidList:
+ gidList.append(label)
+
+ for gid in gidList:
+ gidCount[gid] = 0
+
+ for chrom in hitDict:
+ if chrom not in locusByChromDict:
+ continue
+
+ print chrom
+ for (start, stop, gid, glen) in locusByChromDict[chrom]:
+ gidCount[gid] = 0.
+ (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[chrom], start, glen, readlen)
+ if len(topPos) > 0:
+ gidCount[gid] = smoothArray[topPos[0]]
+ gidPos[gid] = (chrom, start + topPos[0])
+ else:
+ gidPos[gid] = (chrom, start)
+
+ outfile = open(outfilename, "w")
+
+ for gid in gidList:
+ if "FAR" not in gid:
+ symbol = "LOC" + gid
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[gid]
+ symbol = geneinfo[0][0]
+ except:
+ pass
+ else:
+ symbol = gid
+
+ if gid in gidCount and gid in gidPos:
+ (chrom, pos) = gidPos[gid]
+ outfile.write("%s\t%s\tchr%s\t%d\t%.2f\n" % (gid, symbol, chrom, pos, gidCount[gid]))
+
+ outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ print "psyco not running"
+
+import sys
+import optparse
+from commoncode import readDataset, getFeaturesByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%s: version 5.1" % sys.argv[0]
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome rdsfile outfilename [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--stranded", action="store_true", dest="trackStrand")
+ parser.add_option("--splices", action="store_true", dest="doSplices")
+ parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
+ parser.add_option("--multi", action="store_true", dest="doMulti")
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+ parser.add_option("--searchGID", action="store_true", dest="searchGID")
+ parser.add_option("--countfeatures", action="store_true", dest="countFeats")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--markGID", action="store_true", dest="markGID")
+ parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
+ extendGenome="", replaceModels=False, searchGID=False,
+ countFeats=False, cachePages=None, markGID=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ genomeName = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
+ options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
+ options.searchGID, options.countFeats, options.cachePages, options.markGID)
+
+
+def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+ doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
+ searchGID=False, countFeats=False, cachePages=None, markGID=False):
+ # Count the reads falling in each gene feature, sum them per gene ID and
+ # write one "gid, symbol, count" line per gene through writeOutputFile().
+ #
+ # trackStrand   -- count only reads whose sense matches the feature
+ # doSplices / doUniqs / doMulti -- read classes passed to getCounts()
+ # extendGenome  -- if non-empty, passed to genome.extendFeatures();
+ #                  replaceModels is forwarded as replace= and is forced
+ #                  to False when extendGenome is empty
+ # searchGID     -- forwarded to writeOutputFile()
+ # countFeats    -- also collect and report the distinct features visited
+ # cachePages    -- not None turns the dataset cache on
+ # markGID       -- flag all reads "NM", then flag the reads in each
+ #                  counted region via flagReads()
+
+ if trackStrand:
+ print "will track strandedness"
+ doStranded = "track"
+ else:
+ doStranded = "both"
+
+ if extendGenome:
+ if replaceModels:
+ print "will replace gene models with %s" % extendGenome
+ else:
+ print "will extend gene models with %s" % extendGenome
+ else:
+ replaceModels = False
+
+ # caching is enabled only when a page count was given explicitly
+ if cachePages is not None:
+ doCache = True
+ else:
+ cachePages = 100000
+ doCache = False
+
+ hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ if cachePages > hitRDS.getDefaultCacheSize():
+ hitRDS.setDBcache(cachePages)
+
+ genome = Genome(genomeName, inRAM=True)
+ if extendGenome != "":
+ genome.extendFeatures(extendGenome, replace=replaceModels)
+
+ print "getting gene features...."
+ featuresByChromDict = getFeaturesByChromDict(genome)
+
+ seenFeaturesByChromDict = {}
+ print "getting geneIDs...."
+ gidList = genome.allGIDs()
+ gidList.sort()
+ gidCount = {}
+ for gid in gidList:
+ gidCount[gid] = 0
+
+ chromList = hitRDS.getChromosomes(fullChrom=False)
+ # fall back to the splices table when the default query returns nothing
+ if len(chromList) == 0 and doSplices:
+ chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
+
+ if markGID:
+ print "Flagging all reads as NM"
+ hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+ for chrom in chromList:
+ if chrom not in featuresByChromDict:
+ continue
+
+ if countFeats:
+ seenFeaturesByChromDict[chrom] = []
+
+ print "\nchr%s" % chrom
+ fullchrom = "chr%s" % chrom
+ # regions of this chromosome, later handed to flagReads() when marking
+ regionList = []
+ print "counting GIDs"
+ for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
+ # NOTE(review): the bare except below reports any failure for this
+ # feature as "problem with <gid>" and moves on
+ try:
+ if doStranded == "track":
+ # feature sense "R" maps to read sense "-", anything else to "+"
+ checkSense = "+"
+ if featureSense == "R":
+ checkSense = "-"
+
+ regionList.append((gid, fullchrom, start, stop, checkSense))
+ count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+ else:
+ regionList.append((gid, fullchrom, start, stop))
+ count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+ if count != 0:
+ print count
+
+ gidCount[gid] += count
+ if countFeats:
+ if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
+ seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
+ except:
+ print "problem with %s - skipping" % gid
+
+ if markGID:
+ print "marking GIDs"
+ hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
+ print "finished marking"
+
+ print " "
+ if countFeats:
+ numFeatures = countFeatures(seenFeaturesByChromDict)
+ print "saw %d features" % numFeatures
+
+ writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
+ # the flags only need saving when the dataset was cached
+ if markGID and doCache:
+ hitRDS.saveCacheDB(hitfile)
+
+
+def countFeatures(seenFeaturesByChromDict):
+ count = 0
+ for chrom in seenFeaturesByChromDict.keys():
+ try:
+ count += len(seenFeaturesByChromDict[chrom])
+ except TypeError:
+ pass
+
+ return count
+
+
+def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+ geneAnnotDict = genome.allAnnotInfo()
+ genomeName = genome.genome
+ outfile = open(outfilename, "w")
+ idb = geneinfoDB(cache=True)
+ geneInfoDict = idb.getallGeneInfo(genomeName)
+ for gid in gidList:
+ symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
+ if gid in gidCount:
+ outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
+ else:
+ outfile.write("%s\t%s\t0\n" % (gid, symbol))
+
+ outfile.close()
+
+
+def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
+ lookupGID = gid
+ if searchGID and gid not in geneInfoDict:
+ actualGeneID = idb.getGeneID(genomeName, gid)
+ if len(actualGeneID) > 0:
+ lookupGID = actualGeneID[1]
+
+ try:
+ geneinfo = geneInfoDict[lookupGID]
+ symbol = geneinfo[0][0]
+ except (KeyError, IndexError):
+ try:
+ symbol = geneAnnotDict[(genomeName, gid)][0]
+ except (KeyError, IndexError):
+ symbol = "LOC%s" % gid
+
+ return symbol
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+
+print '%s: version 4.1' % sys.argv[0]
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--stranded", action="store_false", dest="ignoreSense")
+ parser.add_option("--uniq", action="store_true", dest="withUniqs")
+ parser.add_option("--multi", action="store_true", dest="withMulti")
+ parser.add_option("--record", action="store_true", dest="recording",
+ help="ignored with uniq reads")
+ parser.add_option("--accept", dest="acceptfile")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--verbose", action="store_true", dest="doVerbose")
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+ parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
+ acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
+ replaceModels=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 4:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ countfile = args[2]
+ outfilename = args[3]
+
+ geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+ options.withUniqs, options.withMulti, options.recording,
+ options.acceptfile, options.cachePages, options.doVerbose,
+ options.extendGenome, options.replaceModels)
+
+
+def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+ withUniqs=False, withMulti=False, recording=False, acceptfile=None,
+ cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
+ # Assign reads to the gene features they start in and write, per gene ID,
+ # a weighted read count to outfilename ("gid<TAB>symbol<TAB>count").
+ #
+ # A read that hits several genes is split between them in proportion to
+ # the per-gene values loaded from countfile (each increased by 1).
+ #
+ # ignoreSense  -- when False, a read only matches features of its own sense
+ # withUniqs / withMulti -- exactly one must be set, otherwise the
+ #                 function prints a message and exits with status 1
+ # recording    -- reset to False with withUniqs; not referenced again here
+ # acceptfile   -- optional extra regions merged into the feature set
+ # cachePages   -- not None enables the gene DB and dataset caches
+ # extendGenome / replaceModels -- passed to hg.extendFeatures()
+
+ if (not withUniqs and not withMulti) or (withUniqs and withMulti):
+ print "must have either one of -uniq or -multi set. Exiting"
+ sys.exit(1)
+
+ if cachePages is not None:
+ cacheGeneDB(genome)
+ hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+ idb = geneinfoDB(cache=True)
+ print "%s cached" % genome
+ doCache = True
+ else:
+ doCache = False
+ cachePages = 0
+ hg = Genome(genome, inRAM=True)
+ idb = geneinfoDB()
+
+ if acceptfile is not None:
+ acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+ else:
+ acceptDict = {}
+
+ if recording and withUniqs:
+ recording = False
+
+ if extendGenome:
+ if replaceModels:
+ print "will replace gene models with %s" % extendGenome
+ else:
+ print "will extend gene models with %s" % extendGenome
+ else:
+ replaceModels = False
+
+ if extendGenome != "":
+ hg.extendFeatures(extendGenome, replace = replaceModels)
+
+ hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ if cachePages > hitRDS.getDefaultCacheSize():
+ hitRDS.setDBcache(cachePages)
+
+ readlen = hitRDS.getReadSize()
+
+ geneinfoDict = idb.getallGeneInfo(genome)
+ geneannotDict = hg.allAnnotInfo()
+ # gidCount: gid -> number of read hits; gidReadDict: gid -> read IDs
+ gidCount = {}
+ gidReadDict = {}
+
+ featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
+ gidList = hg.allGIDs()
+
+ gidList.sort()
+ # region labels from the accept file are appended after the sorted IDs
+ for chrom in acceptDict:
+ for (label, start, stop, length) in acceptDict[chrom]:
+ if label not in gidList:
+ gidList.append(label)
+
+ for gid in gidList:
+ gidCount[gid] = 0
+ gidReadDict[gid] = []
+
+ # uniqueCountDict: first field of each countfile line -> last field + 1
+ # read2GidDict: read ID -> list of the distinct gids it was assigned to
+ uniqueCountDict = {}
+ read2GidDict = {}
+
+ uniquecounts = open(countfile)
+ for line in uniquecounts:
+ fields = line.strip().split()
+ # add a pseudo-count here to ease calculations below
+ uniqueCountDict[fields[0]] = float(fields[-1]) + 1
+
+ uniquecounts.close()
+
+ outfile = open(outfilename, "w")
+
+ index = 0
+ if withMulti and not withUniqs:
+ chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
+ else:
+ chromList = hitRDS.getChromosomes(fullChrom=False)
+
+ for achrom in chromList:
+ if achrom not in featuresByChromDict:
+ continue
+
+ print "\n" + achrom + " ",
+ # startFeature is the index in featList where the scan for the next
+ # read begins
+ startFeature = 0
+ fullchrom = "chr" + achrom
+ hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
+ featList = featuresByChromDict[achrom]
+ if ignoreSense:
+ # sense-blind pass: reads arrive as (start, readID)
+ for (tagStart, tagReadID) in hitDict[fullchrom]:
+ index += 1
+ if index % 100000 == 0:
+ print "read %d" % index,
+
+ stopPoint = tagStart + readlen
+ if startFeature < 0:
+ startFeature = 0
+
+ for (start, stop, gid, sense, ftype) in featList[startFeature:]:
+ # features ending before the read are skipped for later reads too
+ if tagStart > stop:
+ startFeature += 1
+ continue
+
+ # past the read: back the start index off by 100 features
+ # before moving to the next read
+ if start > stopPoint:
+ startFeature -= 100
+ break
+
+ if start <= tagStart <= stop:
+ try:
+ gidReadDict[gid].append(tagReadID)
+ if tagReadID in read2GidDict:
+ if gid not in read2GidDict[tagReadID]:
+ read2GidDict[tagReadID].append(gid)
+ else:
+ read2GidDict[tagReadID] = [gid]
+
+ gidCount[gid] += 1
+ except:
+ print "gid %s not in gidReadDict" % gid
+
+ stopPoint = stop
+ else:
+ # stranded pass: reads arrive as (start, sense, readID)
+ for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
+ index += 1
+ if index % 100000 == 0:
+ print "read %d" % index,
+
+ stopPoint = tagStart + readlen
+ if startFeature < 0:
+ startFeature = 0
+
+ for (start, stop, gid, sense, ftype) in featList[startFeature:]:
+ if tagStart > stop:
+ startFeature += 1
+ continue
+
+ if start > stopPoint:
+ startFeature -= 100
+ break
+
+ # translate feature sense ("R" / other) to read sense ("-" / "+")
+ if sense == "R":
+ sense = "-"
+ else:
+ sense = "+"
+
+ if start <= tagStart <= stop and sense == tSense:
+ try:
+ gidReadDict[gid].append(tagReadID)
+ if tagReadID in read2GidDict:
+ if gid not in read2GidDict[tagReadID]:
+ read2GidDict[tagReadID].append(gid)
+ else:
+ read2GidDict[tagReadID] = [gid]
+
+ gidCount[gid] += 1
+ except:
+ print "gid %s not in gidReadDict" % gid
+
+ stopPoint = stop
+
+ for gid in gidList:
+ if "FAR" not in gid:
+ symbol = "LOC" + gid
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[gid]
+ # celegans takes the second entry of the gene info record
+ if genome == "celegans":
+ symbol = geneinfo[0][1]
+ else:
+ symbol = geneinfo[0][0]
+ except:
+ try:
+ symbol = geneannotDict[(genome, gid)][0]
+ except:
+ symbol = "LOC" + gid
+ else:
+ symbol = gid
+
+ # each read contributes (value of this gid) / (sum of the values of
+ # all gids the read was assigned to); unknown gids count as 1
+ tagCount = 0.
+ for readID in gidReadDict[gid]:
+ try:
+ tagValue = uniqueCountDict[gid]
+ except:
+ tagValue = 1
+
+ tagDenom = 0.
+ for aGid in read2GidDict[readID]:
+ try:
+ tagDenom += uniqueCountDict[aGid]
+ except:
+ tagDenom += 1
+
+ # NOTE(review): a zero denominator resets the whole running total
+ try:
+ tagCount += tagValue / tagDenom
+ except ZeroDivisionError:
+ tagCount = 0
+
+ if doVerbose:
+ print "%s %s %f" % (gid, symbol, tagCount)
+
+ # %d truncates the weighted (float) count to an integer
+ outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
+
+ outfile.close()
+
+ if doCache:
+ uncacheGeneDB(genome)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# geneNeighbors.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import getMergedRegions, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 2.4"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome outfilename [--regions acceptfile] [--downstream bp] [--upstream bp] [--mindist bp] [--minlocus bp] [--maxlocus bp] [--samesense]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--regions", dest="acceptFile")
+ parser.add_option("--downstream", type="int", dest="downMax")
+ parser.add_option("--upstream", type="int", dest="upMax")
+ parser.add_option("--mindist", type="int", dest="minDist")
+ parser.add_option("--minlocus", type="int", dest="minLocus")
+ parser.add_option("--maxlocus", type="int", dest="maxLocus")
+ parser.add_option("--samesense", action="store_true", dest="checkSense")
+ parser.set_defaults(acceptfile="", checkSense=False, downMax=10000000,
+ upMax=10000000, minDist=0, minLocus=-1, maxLocus=10000000)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ outfilename = args[1]
+
+ index = geneNeighbors(genome, outfilename, options.acceptFile, options.checkSense,
+ options.downMax, options.upMax, options.minDist, options.minLocus,
+ options.maxLocus)
+
+ print "\n%d genes matched" % index
+
+
+def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False,
+ downMax=10000000, upMax=10000000, minDist=0, minLocus=-1,
+ maxLocus=10000000):
+
+ acceptDict = {}
+ if acceptfile:
+ acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+ hg = Genome(genome)
+ idb = geneinfoDB(cache=True)
+
+ geneinfoDict = idb.getallGeneInfo(genome)
+ locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
+
+ gidList = hg.allGIDs()
+ gidList.sort()
+ for chrom in acceptDict:
+ for (label, start, stop, length) in acceptDict[chrom]:
+ if label not in gidList:
+ gidList.append(label)
+
+ index = 0
+ outfile = open(outfilename,"w")
+ chromList = locusByChromDict.keys()
+ chromList.sort()
+ for chrom in chromList:
+ if len(locusByChromDict[chrom]) < 3 or "NT" in chrom or "MT" in chrom:
+ continue
+
+ print chrom + " ",
+
+ prevStop = locusByChromDict[chrom][0][1]
+ prevGID = locusByChromDict[chrom][0][2]
+ if "FAR" not in prevGID:
+ symbol = "LOC" + prevGID
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[prevGID]
+ symbol = geneinfo[0][0]
+ except:
+ pass
+ else:
+ symbol = prevGID
+
+ prevGID = symbol
+ prevSense = locusByChromDict[chrom][0][4]
+
+ currentStart = locusByChromDict[chrom][1][0]
+ currentStop = locusByChromDict[chrom][1][1]
+ currentGID = locusByChromDict[chrom][1][2]
+ if "FAR" not in currentGID:
+ symbol = "LOC" + currentGID
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[currentGID]
+ symbol = geneinfo[0][0]
+ except:
+ pass
+ else:
+ symbol = currentGID
+
+ currentGID = symbol
+ currentGlen = locusByChromDict[chrom][1][3]
+ currentSense = locusByChromDict[chrom][1][4]
+
+ for (nextStart, nextStop, nextGID, nextGlen, nextSense) in locusByChromDict[chrom][2:]:
+ if "FAR" not in nextGID:
+ symbol = "LOC" + nextGID
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[nextGID]
+ symbol = geneinfo[0][0]
+ except:
+ pass
+ else:
+ symbol = nextGID
+
+ nextGID = symbol
+ leftDist = currentStart - prevStop
+ rightDist = nextStart - currentStop
+ if (currentSense == "F" and minDist < leftDist < upMax and minDist < rightDist < downMax) or (currentSense == "R" and minDist < rightDist < upMax and minDist < leftDist < downMax):
+ if not checkSense or currentSense == nextSense:
+ if minLocus <= currentGlen <= maxLocus:
+ outfile.write("%s\t%s\t%s\t%s\t%d\t%s\t%s\t%d\n" % (currentGID, currentSense, prevGID, prevSense, leftDist, nextGID, nextSense, rightDist))
+ index += 1
+
+ prevStop = currentStop
+ prevGID = currentGID
+ prevSense = currentSense
+ currentStart = nextStart
+ currentStop = nextStop
+ currentGID = nextGID
+ currentGlen = nextGlen
+ currentSense = nextSense
+
+ outfile.close()
+ return index
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+#
+# geneStallingBins.py
+# ENRAGE
+#
+
+# originally from geneLocusBins.py
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 1.3"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--upstream", type="int", dest="upstreamBp")
+ parser.add_option("--downstream", type="int", dest="downstreamBp")
+ parser.add_option("--regions", dest="acceptfile")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--normalize", action="store_true", dest="normalize")
+ parser.add_option("--tagCount", action="store_true", dest="doTagCount")
+ parser.add_option("--bins", type="int", dest="bins")
+ parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="",
+ doCache=False, normalize=False, doTagCount=False, bins=4)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 4:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ controlfile = args[2]
+ outfilename = args[3]
+
+ geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp,
+ options.downstreamBp, options.acceptfile, options.doCache,
+ options.normalize, options.doTagCount, options.bins)
+
+
+def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
+ downstreamBp=0, acceptfile="", doCache=False, normalize=False,
+ doTagCount=False, bins=4):
+
+ acceptDict = {}
+ if acceptfile:
+ acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+ doCDS = True
+ limitNeighbor = False
+
+ hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ readlen = hitRDS.getReadSize()
+ hitNormalizationFactor = 1.0
+ if normalize:
+ hitDictSize = len(hitRDS)
+ hitNormalizationFactor = hitDictSize / 1000000.
+
+ controlRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ controlNormalizationFactor = 1.0
+ if normalize:
+ controlDictSize = len(hitRDS)
+ controlNormalizationFactor = controlDictSize / 1000000.
+
+ hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+ controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+ hg = Genome(genome)
+ idb = geneinfoDB(cache=doCache)
+
+ geneinfoDict = idb.getallGeneInfo(genome)
+ locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+
+ gidList = hg.allGIDs()
+ gidList.sort()
+ for chrom in acceptDict:
+ for (label, start, stop, length) in acceptDict[chrom]:
+ if label not in gidList:
+ gidList.append(label)
+
+ (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
+ (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
+
+ outfile = open(outfilename, "w")
+
+ for gid in gidList:
+ if "FAR" not in gid:
+ symbol = "LOC" + gid
+ geneinfo = ""
+ try:
+ geneinfo = geneinfoDict[gid]
+ symbol = geneinfo[0][0]
+ except:
+ pass
+ else:
+ symbol = gid
+
+ if gid in gidBins and gid in gidLen:
+ tagCount = 0.
+ controlCount = 0.
+ for binAmount in gidBins[gid]:
+ tagCount += binAmount
+
+ for binAmount in controlBins[gid]:
+ controlCount += abs(binAmount)
+
+ diffCount = tagCount + controlCount
+ if diffCount < 0:
+ diffCount = 0
+
+ outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid]))
+ if (gidLen[gid] - 3 * upstreamBp) < upstreamBp:
+ outfile.write("\tshort\n")
+ continue
+
+ TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.)
+ finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.)
+ if finalbin <= 0.:
+ finalbin = 0.01
+
+ if TSSbins < 0:
+ TSSbins = 0
+
+ ratio = float(TSSbins)/float(finalbin)
+ for binAmount in gidBins[gid]:
+ if doTagCount:
+ binAmount = binAmount * tagCount / 100.
+
+ if normalize:
+ if tagCount == 0:
+ tagCount = 1
+
+ outfile.write("\t%.1f" % (100. * binAmount / tagCount))
+ else:
+ outfile.write("\t%.1f" % binAmount)
+
+ outfile.write("\t%.2f\n" % ratio)
+
+ outfile.close()
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# geneStartBins.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+# originally from version 1.3 of geneDownstreamBins.py
+from commoncode import *
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+import sys
+
+print '%s: version 2.0' % sys.argv[0]
+if len(sys.argv) < 4:
+ print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
+ print '\n\twhere regionSize is the optional maximum region in bp\n'
+ sys.exit(1)
+
+genome = sys.argv[1]
+hitfile = sys.argv[2]
+outfilename = sys.argv[3]
+
+standardMinDist = 3000
+if '-max' in sys.argv:
+ standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
+
+if '-raw' in sys.argv:
+ normalize = False
+ normalizeBins = False
+else:
+ normalize = True
+ normalizeBins = True
+
+doCache = False
+if '-cache' in sys.argv:
+ doCache = True
+
+bins = 10
+standardMinThresh = standardMinDist / bins
+
+hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+readlen = hitRDS.getReadSize()
+normalizationFactor = 1.0
+if normalize:
+ totalCount = len(hitRDS)
+ normalizationFactor = totalCount / 1000000.
+
+hg = Genome(genome)
+idb = geneinfoDB(cache=True)
+
+gidDict = {}
+geneinfoDict = idb.getallGeneInfo(genome)
+featuresDict = hg.getallGeneFeatures()
+
+#infile = open(infilename)
+outfile = open(outfilename,'w')
+
+gidList = hg.allGIDs()
+gidList.sort()
+for gid in gidList:
+ symbol = 'LOC' + gid
+ geneinfo = ''
+ featureList = []
+ try:
+ geneinfo = geneinfoDict[gid]
+ featureList = featuresDict[gid]
+ symbol = geneinfo[0][0]
+ except:
+ print geneinfo
+ newfeatureList = []
+ if len(featureList) == 0:
+ continue
+ for (ftype, chrom, start, stop, fsense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+ if chrom not in hitDict:
+ continue
+ newfeatureList.sort()
+ if len(newfeatureList) < 1:
+ #print '%s %s %d' % (gid, symbol, -1)
+ #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1))
+ continue
+ glen = standardMinDist / 2
+ if fsense == 'F':
+ nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
+ if nextGene < glen * 2:
+ glen = nextGene / 2
+ if glen < 1:
+ glen = 1
+ gstart = newfeatureList[0][0] - glen
+ if gstart < 0:
+ gstart = 0
+ gstop = newfeatureList[0][0] + glen
+ else:
+ nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
+ if nextGene < glen * 2:
+ glen = nextGene / 2
+ if glen < 1:
+ glen = 1
+ gstart = newfeatureList[-1][1] - glen
+ gstop = newfeatureList[-1][1] + glen
+ tagCount = 0
+ if glen < standardMinDist / 2:
+ continue
+ binList = [0] * bins
+ for (tagStart, sense, weight) in hitDict[chrom]:
+ tagStart -= gstart
+ if tagStart >= 2 * glen:
+ break
+ if tagStart > 0:
+ tagCount += weight
+ if fsense == 'R':
+ # we are relying on python's integer division quirk
+ binID = tagStart / standardMinThresh
+ binList[binID] += weight
+ else:
+ rdist = 2 * glen - tagStart
+ binID = rdist / standardMinThresh
+ binList[binID] += weight
+ if tagCount < 2:
+ continue
+ print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
+ outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
+ for binAmount in binList:
+ outfile.write('\t%d' % binAmount)
+ outfile.write('\n')
+#infile.close()
+outfile.close()
+
--- /dev/null
+#
+# geneUpstreamBins.py
+# ENRAGE
+#
+# originally from version 1.3 of geneDownstreamBins.py
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 2.0"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--raw", action="store_false", dest="normalize",
+ help="maximum region in bp")
+ parser.add_option("--max", type="int", dest="standardMinDist")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ outfilename = args[3]
+
+ geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
+
+
+def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
+ """Write per-gene tag counts for a window next to each gene, split into 10 bins.
+
+ For every gene id of the genome, the reads on the gene's chromosome are
+ scanned over a window of standardMinDist bp and their weights are added
+ to one of 10 equal-width bins.  Genes whose window is shortened by a
+ neighbouring gene, or whose weighted tag count is below 2, are skipped.
+ Each reported gene is printed and written to outfilename as a
+ tab-delimited row: gid, symbol, scaled tag count, window length and the
+ 10 bin values.
+ """
+ bins = 10
+ # width of one bin in bp
+ standardMinThresh = standardMinDist / bins
+
+ hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ normalizationFactor = 1.0
+ if normalize:
+ totalCount = len(hitRDS)
+ normalizationFactor = totalCount / 1000000.
+
+ hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+ hg = Genome(genome)
+ idb = geneinfoDB(cache=True)
+
+ geneinfoDict = idb.getallGeneInfo(genome)
+ featuresDict = hg.getallGeneFeatures()
+
+ outfile = open(outfilename,"w")
+
+ gidList = hg.allGIDs()
+ gidList.sort()
+ for gid in gidList:
+ # default symbol is "LOC<gid>" unless gene info supplies one
+ symbol = "LOC" + gid
+ geneinfo = ""
+ featureList = []
+ try:
+ geneinfo = geneinfoDict[gid]
+ featureList = featuresDict[gid]
+ symbol = geneinfo[0][0]
+ except:
+ # any lookup failure keeps the defaults and echoes what was found
+ print geneinfo
+
+ newfeatureList = []
+ if len(featureList) == 0:
+ continue
+
+ # collect the distinct (start, stop) pairs of the gene's features
+ for (ftype, chrom, start, stop, fsense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+
+ # chrom and fsense hold the values of the last feature iterated above
+ if chrom not in hitDict:
+ continue
+
+ newfeatureList.sort()
+ if len(newfeatureList) < 1:
+ continue
+
+ # window length; halved distance to the neighbouring gene when that is closer
+ glen = standardMinDist
+ if fsense == "F":
+ nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
+ if nextGene < glen * 2:
+ glen = nextGene / 2
+
+ if glen < 1:
+ glen = 1
+
+ # window starts glen bp before the smallest feature start, clamped at 0
+ gstart = newfeatureList[0][0] - glen
+ if gstart < 0:
+ gstart = 0
+
+ else:
+ nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
+ if nextGene < glen * 2:
+ glen = nextGene / 2
+
+ if glen < 1:
+ glen = 1
+
+ # window starts at the largest feature stop
+ gstart = newfeatureList[-1][1]
+
+ tagCount = 0
+ # only genes that keep the full-sized window are reported
+ if glen < standardMinDist:
+ continue
+
+ binList = [0] * bins
+ for (tagStart, sense, weight) in hitDict[chrom]:
+ # position of the tag relative to the window start
+ tagStart -= gstart
+ # presumably hitDict[chrom] is sorted by start, so later tags are out of range too - verify against readDataset
+ if tagStart >= glen:
+ break
+
+ if tagStart > 0:
+ tagCount += weight
+ if fsense == "R":
+ # we are relying on python's integer division quirk
+ binID = tagStart / standardMinThresh
+ binList[binID] += weight
+ else:
+ # for "F" genes bins are indexed by distance from the window end
+ rdist = glen - tagStart
+ binID = rdist / standardMinThresh
+ binList[binID] += weight
+
+ if tagCount < 2:
+ continue
+
+ # NOTE(review): the count is multiplied by (dataset size / 1e6), not divided - confirm this scaling is intended
+ print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
+ outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
+ for binAmount in binList:
+ outfile.write("\t%d" % binAmount)
+ outfile.write("\n")
+
+ outfile.close()
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys, optparse
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 3.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s genome GOID1 [GOID2 ....] [--outfile outfilename] [--append] [--restrict genefile]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--outfile", dest="outfilename")
+ parser.add_option("--append", action="store_true", dest="append")
+ parser.add_option("--restrict", dest="restrictfilename")
+ parser.set_defaults(outfilename=None, restrictfilename=None, append=False)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+
+ GOIDlist = []
+ for arg in args:
+ if "GO:" in arg:
+ GOIDlist.append(arg)
+
+ getGOgenes(genome, GOIDlist, options.outfilename, options.restrictfilename, options.append)
+
+
+def getGOgenes(genome, GOIDlist, outfilename=None, restrictfilename=None, append=False):
+ writeOut = False
+ if outfilename is not None:
+ writeOut = True
+
+ restrict = False
+ if restrictfilename is not None:
+ restrict = True
+
+ hg = Genome(genome)
+ idb = geneinfoDB()
+
+ print sys.argv
+ print GOIDlist
+
+ firstGeneList = []
+ for GOID in GOIDlist:
+ testList = hg.allGIDsbyGOID(GOID)
+ print "GOID: %s (%d)" % (GOID, len(testList))
+ firstGeneList += testList
+
+ geneDict = {}
+ for gid in firstGeneList:
+ geneDict[gid] = 1
+
+ geneList = geneDict.keys()
+ print len(geneList)
+ geneInfoList = idb.getallGeneInfo(genome)
+
+ if writeOut:
+ if append:
+ outfile = open(outfilename, "a")
+ else:
+ outfile = open(outfilename, "w")
+
+ for GOID in GOIDlist:
+ outfile.write("#%s\n" % GOID)
+
+ restrictList = []
+ restrictDict = {}
+ if restrict:
+ restrictFile = open(restrictfilename)
+ for line in restrictFile:
+ fields = line.strip().split()
+ restrictList.append(fields[0])
+ restrictDict[fields[0]] = line
+
+ outList = []
+ symbolDict = {}
+ for gid in geneList:
+ symbol = "LOC" + gid
+ if restrict and gid not in restrictList:
+ continue
+
+ try:
+ symbol = geneInfoList[gid][0][0]
+ except:
+ pass
+
+ if restrict:
+ symbolDict[symbol] = restrictDict[gid]
+
+ outList.append(symbol)
+
+ outList.sort()
+ for symbol in outList:
+ if writeOut:
+ if restrict:
+ outfile.write(symbolDict[symbol])
+ else:
+ outfile.write(symbol + "\n")
+ else:
+ print symbol
+
+ if writeOut:
+ outfile.close()
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# getNovelSNPs.py
+# ENRAGE
+#
+# This script attempts to annotate the novel sncs/snps from the snp summary file
+# Written by: Wendy Lee
+# Written on: Aug 7th, 2008
+
+import sys
+import string
+from cistematic.genomes import Genome
+from commoncode import writeLog
+
+print "%prog: version 1.5"
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s genome snpsfile nondbsnp_geneinfo_outfile" % argv[0]
+
+ if len(argv) < 4:
+ print usage
+ sys.exit(2)
+
+ genome = argv[1]
+ snpfile = argv[2]
+ outfilename = argv[3]
+
+ getNovelSNPsFromFile(genome, snpfile, outfilename)
+
+
+def getNovelSNPsFromFile(genome, snpfile, outfilename):
+ infile = file(snpfile, "r")
+ writeNovelSNPFile(genome, infile, outfilename)
+ writeLog("snp.log", sys.argv[0], "outputfile: %s" % outfilename)
+ infile.close()
+
+
+def writeNovelSNPFile(genome, snpPropertiesList, outfilename):
+ hg = Genome(genome)
+ outString = ""
+ outfile = open(outfilename, "w")
+ outfile.write("#Sl\tCl\tchrom\tmis pos\t\tmatch\tuniq_mis\ttot_mis\tbase_chg\tknown_snp\tfunction\tgene\tgeneId\trpkm\n")
+ for line in snpPropertiesList:
+ if doNotProcessLine(line):
+ continue
+
+ outString = getNovelSNPInfo(genome, line, hg)
+ if outString == line:
+ outfile.write(outString)
+ else:
+ outfile.write("%s\n" % outString)
+
+ outfile.close()
+
+
+def doNotProcessLine(line):
+ return line[0] == "#"
+
+
+def getNovelSNPInfo(genome, snpEntry, hg):
+ fields = snpEntry.split()
+ #TODO: refactor naming. is fields[8] rpkm?
+ if fields[8].find("N\A") == -1:
+ return snpEntry
+ else:
+ snpInfo = ""
+ gid = fields[11]
+ snc_start = int(fields[3])
+ featuresList = hg.getGeneFeatures((genome, gid))
+ func = "N\A"
+ for (ftype, chromosome, start, stop, orientation) in featuresList:
+ if int(start) <= snc_start <= int(stop):
+ func = ftype
+ break
+
+ for i in range (0, 9):
+ snpInfo = string.join([snpInfo, fields[i]], "\t")
+
+ snpInfo = string.join([snpInfo, func], "\t")
+ for i in range (10, 13):
+ snpInfo = string.join([snpInfo, fields[i]], "\t")
+
+ return snpInfo
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# getSNPGeneInfo.py
+# ENRAGE
+#
+# This script look for the gene info and expression level for the snps.
+# Written by: Wendy Lee
+# Written on: August 7th, 2008
+
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+import sys
+import optparse
+import string
+from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 4.5"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--cache", action="store_true", dest="cachePages")
+ parser.add_option("--withoutsense", action="store_false", dest="withSense")
+ parser.add_option("--flank", type="int", dest="flankBP")
+ parser.set_defaults(doCache=False, withSense=True, flankBP=0)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 4:
+ print usage
+ sys.exit(2)
+
+ genome = args[0]
+ infilename = args[1]
+ rpkmfilename = args[2]
+ outfilename = args [3]
+
+ writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
+
+
+def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
+
+ outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
+ outfile = open(outfilename, "w")
+
+ for outputLine in outList:
+ outfile.write("%s\n" % outputLine)
+
+ outfile.close()
+
+
+def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=True, flankBP=0):
+
+ rpkmDict = {}
+ rpkmField = 3
+ if rpkmfilename != "NONE":
+ rpkmfile = open(rpkmfilename, "r")
+ for line in rpkmfile:
+ lineFields = line.split()
+ rpkmDict[lineFields[0]] = lineFields[rpkmField]
+
+ rpkmfile.close()
+
+ infile = open(infilename)
+ snpPositionList = []
+ snpDict = {}
+
+ for line in infile:
+ if doNotProcessLine(line):
+ continue
+
+ fields = line.split("\t")
+ chrom = fields[2][3:]
+ start = int(fields[3])
+ chromosomePosition = (chrom, start)
+ snpPositionList.append(chromosomePosition)
+ snpDict[chromosomePosition] = line
+
+ if doCache:
+ cacheGeneDB(genome)
+ idb = geneinfoDB(cache=True)
+ print "cached %s" % genome
+ else:
+ idb = geneinfoDB()
+
+ geneinfoDict = idb.getallGeneInfo(genome)
+ geneDict = {}
+
+ if flankBP > 0:
+ matchingGenesDict = genesIntersecting(genome, snpPositionList, flank=flankBP)
+ else:
+ matchingGenesDict = genesIntersecting(genome, snpPositionList)
+
+ for pos in matchingGenesDict:
+ geneID = matchingGenesDict[pos][0][0]
+ try:
+ symbol = geneinfoDict[geneID][0][0]
+ except:
+ symbol = "LOC%s" % geneID
+
+ geneDescriptor = (symbol, geneID)
+ if geneDict.has_key(geneDescriptor):
+ geneDict[geneDescriptor]["position"].append(pos)
+ else:
+ geneDict[geneDescriptor] = {"position": [pos],
+ "sense": matchingGenesDict[pos][0][-1]}
+
+ if doCache:
+ uncacheGeneDB(genome)
+
+ return getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense)
+
+
+def doNotProcessLine(line):
+ return line[0] == "#"
+
+
+def getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense):
+ snpGeneOutputList = []
+ snpGeneInfoList = getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense)
+
+ for snpEntry in snpGeneInfoList:
+ outputItems = [snpEntry["snpDescription"], snpEntry["symbol"], snpEntry["geneID"], snpEntry["rpkm"]]
+ if withSense:
+ outputItems.append(snpEntry["sense"])
+
+ line = string.join(outputItems, "\t")
+ snpGeneOutputList.append(line)
+
+ snpGeneOutputList.sort(reverse=True)
+
+ return snpGeneOutputList
+
+
+def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense):
+
+ snpGeneInfoList = []
+
+ for geneDescriptor in geneDict.keys():
+ alreadyDoneList = []
+ (symbol, geneID) = geneDescriptor
+ genePositionList = geneDict[geneDescriptor]["position"]
+ genePositionList.sort()
+
+ for position in genePositionList:
+ if snpDict[position] in alreadyDoneList:
+ continue
+
+ snpGeneInfoDict = {"symbol": symbol,
+ "geneID": geneID}
+
+ rpkm = "N\A"
+ if rpkmDict.has_key(geneID):
+ rpkm = str(rpkmDict[geneID])
+
+ snpGeneInfoDict["rpkm"] = rpkm
+ snpGeneInfoDict["snpDescription"] = snpDict[position][:-1]
+ if withSense:
+ snpGeneInfoDict["sense"] = geneDict[geneDescriptor]["sense"]
+
+ alreadyDoneList.append(snpDict[position])
+ snpGeneInfoList.append(snpGeneInfoDict)
+
+ snpGeneInfoList.sort(reverse=True)
+
+ return snpGeneInfoList
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# getSNPs.py
+# ENRAGE
+#
+# Originally written by: Wendy Lee
+# Last modified: May 11th, 2009 by Ali Mortazavi
+
+"""
+ Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and and totalRatio (Cl).
+ For each mismatch, choose the base change that occur most frequently (ie: has the highest number
+ of independent reads)
+ Threshold of Sl and Cl are from user input
+ Sl = # of independent reads supporting a base change at position S
+ Cl = total # of all reads supporting a base change at position S / # of all # reads that pass through position S
+
+ usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where
+
+ uniqStartMin = # of independent reads supporting a base change at position S
+ totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
+"""
+
+import sys, optparse
+from commoncode import readDataset, writeLog
+
+print "%prog: version 3.5"
+
+try:
+ import psyco
+ psyco.full()
+except:
+ print "psyco is not running"
+ pass
+
+def usage():
+ """Print the module docstring, which serves as the usage message."""
+ print __doc__
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = __doc__
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--nosplices", action="store_false", dest="doSplices")
+ parser.add_option("--enforceChr", action="store_true", dest="forceChr")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 4:
+ usage()
+ sys.exit(2)
+
+ hitfile = args[0]
+ uniqStartMin = float(args[1])
+ totalRatioMin = float(args[2])
+ outfilename = args[3]
+
+ if options.cachePages > 0:
+ doCache = True
+ else:
+ doCache = False
+
+ writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
+
+
+def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
+ writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
+
+ outfile = open(outfilename, "w")
+ header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange"
+ outfile.write(header + "\n")
+
+ snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
+ for snpEntry in snpPropertiesList:
+ outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
+ print outline
+ outfile.write(outline + "\n")
+ outfile.flush()
+
+ outfile.close()
+
+ writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList))
+
+
+def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
+ """Return the candidate SNPs of the dataset hitfile as a list of tuples.
+
+ Each tuple is (Sl, Cl, chrom, position, matchCount, highestCount,
+ highestTotalCount, highestBaseChange).  At every mismatch position the
+ base change with the largest total count is selected; the position is
+ kept when that change's unique count is at least uniqStartMin and the
+ ratio Cl is at least totalRatioMin.
+ """
+
+ hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ # the database cache size is only changed for requests above 20000 pages
+ if cachePages > 20000:
+ hitRDS.setDBcache(cachePages)
+
+ snpPropertiesList = []
+ readLength = hitRDS.getReadSize()
+ chromList = hitRDS.getChromosomes()
+
+ for chrom in chromList:
+ if doNotProcessChromosome(forceChr, chrom):
+ continue
+
+ matchDict = getMatchDict(hitRDS, chrom, doSplices)
+ print "got match dict for %s " % chrom
+ mismatchDict = getMismatchDict(hitRDS, chrom, doSplices)
+ print "got mismatch dict for %s " % chrom
+ mismatchPositions = mismatchDict.keys()
+ mismatchPositions.sort()
+ for position in mismatchPositions:
+ totalCount = mismatchDict[position]["totalCount"]
+ uniqBaseDict = mismatchDict[position]["uniqBaseDict"]
+ totalBaseDict = mismatchDict[position]["totalBaseDict"]
+ highestCount = 0
+ highestBaseChange = "N-N"
+ highestTotalCount = 0
+ # pick the base change with the largest total count at this position
+ for baseChange in uniqBaseDict:
+ if totalBaseDict[baseChange] > highestTotalCount:
+ highestBaseChange = baseChange
+ highestCount = uniqBaseDict[baseChange]
+ highestTotalCount = totalBaseDict[baseChange]
+
+ Cl = 0.
+ matchCount = 0
+ if highestCount >= uniqStartMin:
+ # count reads starting within one read length before the position whose stop reaches it
+ for matchpos in xrange(position - readLength + 1, position + 1):
+ try:
+ matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop])
+ except:
+ # no read starts at matchpos
+ pass
+
+ # remove the mismatching reads from the covering-read count, floored at 0
+ matchCount -= totalCount
+ if matchCount < 0:
+ matchCount = 0
+
+ Sl = highestCount/float(readLength)
+ Cl = highestTotalCount/float(highestTotalCount + matchCount)
+ if Cl >= totalRatioMin:
+ snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange)
+ snpPropertiesList.append(snpProperties)
+
+ return snpPropertiesList
+
+
+def doNotProcessChromosome(forceChr, chromosome):
+ if forceChr:
+ if chromosome[:3] != "chr":
+ return True
+ else:
+ return False
+
+
+def getMatchDict(rds, chrom, withSplices=True):
+ spliceDict = {}
+ readDict = {}
+ finalDict = {}
+
+ try:
+ readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom)
+ except:
+ readDict[chrom] = []
+
+ for (start, stop) in readDict[chrom]:
+ if finalDict.has_key(start):
+ finalDict[start].append(stop)
+ else:
+ finalDict[start] = [stop]
+
+ if withSplices:
+ try:
+ spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True)
+ except:
+ spliceDict[chrom] = []
+
+ for (start, stop) in spliceDict[chrom]:
+ if finalDict.has_key(start):
+ finalDict[start].append(stop)
+ else:
+ finalDict[start] = [stop]
+
+ return finalDict
+
+
+def getMismatchDict(rds, chrom, withSplices=True):
+ mismatchDict = {}
+ spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices)
+ for (start, change_at, change_base, change_from) in spliceDict[chrom]:
+ change = "%s-%s" % (change_base, change_from)
+ uniqueReadCount = 1
+ totalCount = 1
+ back = "%s:%s" % (str(start), change)
+ uniqBaseDict = {change: 1}
+ totalBaseDict = {change: 1}
+ if mismatchDict.has_key(change_at):
+ (uniqueReadCount, totalCount, back, uniqBaseDict, totalBaseDict) = mismatchDict[change_at]
+ pos = "%s:%s" % (str(start), change)
+ totalCount += 1
+ if totalBaseDict.has_key(change):
+ totalBaseDict[change] += 1
+
+ if pos not in back:
+ uniqueReadCount += 1
+ if uniqBaseDict.has_key(change):
+ uniqBaseDict[change] += 1 # dict contains total unique read counts
+
+ back = "%s,%s" % (back, pos)
+
+ mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount,
+ "totalCount": totalCount,
+ "back": back,
+ "uniqBaseDict": uniqBaseDict,
+ "totalBaseDict": totalBaseDict
+ }
+
+ return mismatchDict
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys, optparse
+
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+from cistematic.core import complement
+from cistematic.core.motif import Motif
+from cistematic.genomes import Genome
+from commoncode import readDataset, getMergedRegions, findPeak
+from pylab import *
+import matplotlib
+
+print '%s: version 3.4' % sys.argv[0]
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s genome regionfile siteOutfile [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--dataset", dest="chipfilename")
+ parser.add_option("--min", type="float", dest="minHeight")
+ parser.add_option("--minfraction", type="float", dest="minFraction")
+ parser.add_option("--plot", dest="plotname")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--raw", action="store_false", dest="normalize")
+ parser.add_option("--verbose", action="store_true", dest="doVerbose")
+ parser.add_option("--markov1", action="store_true", dest="doMarkov1")
+ parser.add_option("--peakdist", type="int", dest="maxpeakdist")
+ parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
+ parser.add_option("--motifdir", dest="motifDir")
+ parser.set_defaults(chipfilename="", minHeight=-2., minFraction=-2., plotname="",
+ doCache=False, normalize=True, doVerbose=False, doMarkov1=False,
+ maxpeakdist=None, fullOnly=False, motifDir="./")
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ genome = argv[0]
+ infilename = args[1]
+ outfilename = args[2]
+
+ getallNRSE(genome, infilename, outfilename, options.chipfilename,
+ options.minHeight, options.minFraction, options.plotname,
+ options.doCache, options.normalize, options.doVerbose,
+ options.doMarkov1, options.maxpeakdist, options.fullOnly,
+ options.motifDir)
+
+
+def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
+ minFraction=-2., plotname="", doCache=False, normalize=True,
+ doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
+ motifDir="./"):
+
+ doPlot = False
+ if plotname:
+ matplotlib.use("Agg")
+ doPlot = True
+
+ if motifDir[-1] != "/":
+ motifDir += "/"
+
+ doDataset = False
+ normalizeBy = 1
+ if chipfilename:
+ hitRDS = readDataset(chipfilename, verbose=doVerbose, cache=doCache)
+ doDataset = True
+ if normalize:
+ normalizeBy = len(hitRDS) / 1000000.
+
+ if minFraction > 1.:
+ minFraction /= 100.
+ print "scaling minFraction to %.2f" % minFraction
+
+ if maxpeakdist is not None:
+ enforcePeakDist = True
+ else:
+ enforcePeakDist = False
+ maxpeakdist = 101
+
+ mot = Motif("", motifFile="%sNRSE3.mot" % motifDir)
+ motL = Motif("", motifFile="%sNRSE3left.mot" % motifDir)
+ motR = Motif("", motifFile="%sNRSE3right.mot" % motifDir)
+ bestScore = mot.bestConsensusScore()
+ bestLeft = motL.bestConsensusScore()
+ bestRight = motR.bestConsensusScore()
+
+ hg = Genome(genome)
+
+ regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False)
+
+ outfile = open(outfilename, "w")
+ outfile.write("#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n" % (chipfilename, infilename, normalize, doMarkov1))
+ outfile.write("#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n" % (enforcePeakDist, maxpeakdist, fullOnly))
+ outfile.write("#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n")
+
+ index = 0
+ regionList = []
+
+ for rchrom in regions:
+ if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
+ continue
+
+ for (start, stop, length) in regions[rchrom]:
+ regionList.append((rchrom, start, length))
+
+ notFoundIndex = 0
+ currentChrom = ""
+ for (rchrom, start, length) in regionList:
+ seq = hg.sequence(rchrom, start, length)
+ if doDataset:
+ if rchrom != currentChrom:
+ fullchrom = "chr" + rchrom
+ hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
+ currentChrom = rchrom
+
+ (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True)
+ if len(topPos) == 0:
+ print "topPos error"
+
+ peakpos = topPos[0]
+ peakscore = smoothArray[peakpos]
+ if peakscore == 0.:
+ peakscore = -1.
+
+ if normalize:
+ numHits /= normalizeBy
+ peakscore /= normalizeBy
+ else:
+ peakpos = length
+ peakscore = -1
+ numHits = 0
+ smoothArray = [0.] * length
+
+ found = []
+ if doMarkov1:
+ lefts = motL.locateMarkov1(seq, 3.)
+ rights = motR.locateMarkov1(seq, 3.)
+ else:
+ lefts = motL.locateMotif(seq, 70)
+ rights = motR.locateMotif(seq, 70)
+
+ allhalfs = [(v0, v1, "L") for (v0, v1) in lefts] + [(v0, v1, "R") for (v0, v1) in rights]
+ allhalfs.sort()
+
+ # look for canonicals and non-canonicals
+ if len(allhalfs) > 1:
+ (firstpos, firstsense, firsttype) = allhalfs[0]
+ for (secondpos, secondsense, secondtype) in allhalfs[1:]:
+ if enforcePeakDist:
+ withinDistance = False
+ for aPos in topPos:
+ if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
+ withinDistance = True
+ if not withinDistance:
+ firstpos = secondpos
+ firstsense = secondsense
+ firsttype = secondtype
+ continue
+
+ if firsttype == "L":
+ dist = secondpos - firstpos + 2
+ else:
+ dist = secondpos - firstpos -1
+
+ if firstsense == secondsense and dist in [9, 10, 11, 16, 17, 18, 19]:
+ if (firsttype == "L" and secondtype == "R" and secondsense == "F"):
+ found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
+
+ if (firsttype == "R" and secondtype == "L" and secondsense == "R"):
+ found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
+
+ firstpos = secondpos
+ firstsense = secondsense
+ firsttype = secondtype
+
+ # did we miss any 70%+ matches ?
+ if doMarkov1:
+ matches = mot.locateMarkov1(seq, 3.5)
+ else:
+ matches = mot.locateMotif(seq, 70)
+
+ for (pos, sense) in matches:
+ alreadyFound = False
+ for (fpos, fpeakdist, fdist) in found:
+ if pos + start == fpos:
+ alreadyFound = True
+
+ if not alreadyFound:
+ if enforcePeakDist:
+ withinDistance = False
+ for aPos in topPos:
+ if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
+ withinDistance = True
+ thePos = aPos
+
+ if withinDistance:
+ found.append((start + pos, pos - thePos + 10, 11))
+
+ else:
+ found.append((start + pos, pos - peakpos + 10, 11))
+
+ # we'll now accept half-sites within maxpeakdist bp of peak if using a dataset, else all
+ if len(found) == 0 and not fullOnly:
+ bestone = -1
+ if not doDataset:
+ bestdist = maxpeakdist
+ else:
+ bestdist = length
+
+ index = 0
+ for (pos, sense, type) in allhalfs:
+ if doDataset:
+ for aPos in topPos:
+ if abs(pos - aPos) < bestdist:
+ bestdist = abs(pos - aPos)
+ bestone = index
+ peakpos = aPos
+ else:
+ found.append((start + allhalfs[index][0], allhalfs[index][0] + 5 - peakpos, 0))
+
+ index += 1
+
+ if (doDataset and bestdist < 101):
+ try:
+ found.append((start + allhalfs[bestone][0], allhalfs[bestone][0] + 5 - peakpos, 0))
+ except:
+ continue
+
+ # see if we found an acceptable match
+ foundValue = False
+ for (foundpos, posdist, dist) in found:
+ # get a score for 21-mer, report
+ seq = hg.sequence(rchrom, foundpos, 21)
+ # height will be measured from the center of the motif
+ height = -2.
+ for pos in range(10 + dist):
+ try:
+ currentHeight = smoothArray[int(peakpos + posdist + pos)]
+ except:
+ pass
+
+ if currentHeight > height:
+ height = currentHeight
+
+ if normalize:
+ height /= normalizeBy
+
+ fractionHeight = height / peakscore
+ if height < minHeight or fractionHeight < minFraction:
+ continue
+
+ foundValue = True
+ (front, back) = mot.scoreMotif(seq)
+ sense = "+"
+ if front > back:
+ score = int(100 * front / bestScore)
+ theseq = hg.sequence(rchrom, foundpos, 10 + dist)
+ else:
+ score = int(100 * back / bestScore)
+ theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
+ sense = "-"
+ foundpos + 1
+
+ leftScore = -1.
+ rightScore = -1.
+ leftseq = ""
+ rightseq = ""
+ if dist > 0:
+ testseq = hg.sequence(rchrom, foundpos, 10 + dist)
+ if sense == "-":
+ testseq = complement(testseq)
+
+ leftseq = testseq[:9]
+ rightseq = testseq[dist-2:]
+ elif dist == 0:
+ testseq = hg.sequence(rchrom, foundpos, 12)
+ if sense == "-":
+ testseq = complement(testseq)
+ leftseq = testseq[3:]
+ else:
+ leftseq = testseq[:9]
+
+ rightseq = testseq
+
+ (lfront, lback) = motL.scoreMotif(leftseq)
+ (rfront, rback) = motR.scoreMotif(rightseq)
+ if lfront > lback:
+ leftScore = int(100 * lfront) / bestLeft
+ leftSense = "+"
+ else:
+ leftScore = int(100 * lback) / bestLeft
+ leftSense = "-"
+
+ if rfront > rback:
+ rightScore = int(100 * rfront) / bestRight
+ rightSense = "+"
+ else:
+ rightScore = int(100 * rback) / bestRight
+ rightSense = "-"
+
+ if dist != 11:
+ if rightScore > leftScore:
+ sense = rightSense
+ else:
+ sense = leftSense
+
+ if sense == "-" and dist > 0:
+ theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
+
+ outline = "chr%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%.2f\tchr%s:%d-%d\t%s\t%s" % (rchrom, foundpos, foundpos + 9 + dist, score, leftScore, rightScore, numHits, posdist, dist, height, fractionHeight, rchrom, start, start + length, sense, theseq)
+ if doVerbose:
+ print outline
+
+ outfile.write(outline + "/n")
+
+ # we didn't find a site - draw region
+ if not foundValue and doVerbose:
+ outline = "#no predictions for %s:%d-%d %d %.2f" % (rchrom, start, start + length, numHits, peakscore)
+ print outline
+ outfile.write(outline + "\n")
+
+ if not foundValue and doPlot:
+ drawarray = [val + notFoundIndex for val in smoothArray]
+ drawpos = [drawarray[val] for val in topPos]
+ plot(drawarray, "b")
+ plot(topPos, drawpos, "r.")
+ goodmatches = mot.locateMotif(seq, 75)
+ if len(goodmatches) > 0:
+ print topPos
+ print goodmatches
+ drawgood = []
+ drawgoody = []
+ for (mstart, sense) in goodmatches:
+ drawgood.append(mstart)
+ drawgoody.append(drawarray[mstart])
+
+ plot(drawgood, drawgoody, "g.")
+
+ notFoundIndex -= 30
+
+ outfile.close()
+ if doPlot:
+ savefig(plotname)
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+import sys, optparse
+from cistematic.core import genesIntersecting, featuresIntersecting, cacheGeneDB, uncacheGeneDB
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+
+print "%prog: version 5.5"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--radius", type="int", dest="maxRadius")
+ parser.add_option("--nomatch", dest="nomatchfilename")
+ parser.add_option("--trackfar", action="store_true", dest="trackFar")
+ parser.add_option("--stranded", action="store_true", dest="trackStrand")
+ parser.add_option("--cache", action="store_true", dest="cachePages")
+ parser.add_option("--compact", action="store_true", dest="compact")
+ parser.add_option("--step", type="int", dest="step")
+ parser.add_option("--startField", type="int", dest="colID")
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+ parser.set_defaults(maxRadius=20002, nomatchfilename="", step=None, trackFar=False,
+ trackStrand=False, compact=False, colID=1, doCache=False,
+ extendGenome="", replaceModels=False)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(2)
+
+ genome = args[0]
+ infilename = args[1]
+ outfilename = args[2]
+
+ getallgenes(genome, infilename, outfilename, options.maxRadius,
+ options.nomatchfilename, options.step, options.trackFar,
+ options.trackStrand, options.compact, options.colID,
+ options.doCache, options.extendgenome, options.replaceModels)
+
+
+def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
+ step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
+ doCache=False, extendGenome="", replaceModels=False):
+
+ if doCache:
+ idb = geneinfoDB(cache=True)
+ else:
+ idb = geneinfoDB()
+
+ if not step:
+ step = maxRadius - 2
+
+ if extendGenome and replaceModels:
+ replaceModels = True
+ else:
+ replaceModels = False
+
+ infile = open(infilename)
+ outfile = open(outfilename,"w")
+
+ if genome == "dmelanogaster":
+ geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
+ else:
+ geneinfoDict = idb.getallGeneInfo(genome)
+
+ posList = []
+ altPosDict = {}
+ altPosRevDict = {}
+ posLine = {}
+ posStrand = {}
+ altPosList = []
+
+ for line in infile:
+ if line[0] == "#":
+ continue
+
+ fields = line.split("\t")
+ if compact:
+ (chrom, pos) = fields[colID].split(":")
+ chrom = chrom[3:]
+ (start, stop) = pos.split("-")
+ pos = (chrom, int(start))
+ altPos = (chrom, int(stop))
+ else:
+ try:
+ chrom = fields[colID][3:]
+ except:
+ print line
+ continue
+
+ pos = (chrom, int(fields[colID + 1]))
+ altPos = (chrom, int(fields[colID + 2]))
+
+ altPosDict[pos] = altPos
+ altPosRevDict[altPos] = pos
+ posList.append(pos)
+ posList.append(altPos)
+ altPosList.append(altPos)
+ posLine[pos] = line
+ if trackStrand:
+ if "RNAFARP" in line:
+ posStrand[pos] = "+"
+ posStrand[altPos] = "+"
+ else:
+ posStrand[pos] = "-"
+ posStrand[altPos] = "-"
+
+ geneList = []
+ geneDict = {}
+ if maxRadius < step:
+ step = maxRadius - 2
+
+ hg = Genome(genome, inRAM=True)
+ if extendGenome != "":
+ hg.extendFeatures(extendGenome, replace = replaceModels)
+
+ geneannotDict = hg.allAnnotInfo()
+
+ for radius in range(1, maxRadius, step):
+ print "radius %d" % radius
+ print len(posList)
+ if radius == 1:
+ posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels)
+ else:
+ posDict = featuresIntersecting(genome, posList, radius, "CDS", extendGen=extendGenome, replaceMod=replaceModels)
+ posDict2 = featuresIntersecting(genome, posList, radius, "UTR", extendGen=extendGenome, replaceMod=replaceModels)
+ for apos in posDict2:
+ try:
+ posDict[apos] += posDict2[apos]
+ posDict[apos].sort()
+ except:
+ posDict[apos] = posDict2[apos]
+
+ for pos in posDict:
+ geneID = ""
+ if len(posDict[pos]) == 1:
+ if trackStrand:
+ if posStrand[pos] == posDict[pos][0][-1]:
+ geneID = posDict[pos][0][0]
+ else:
+ geneID = posDict[pos][0][0]
+ elif len(posDict[pos]) > 1 and not trackStrand:
+ (chrom, loc) = pos
+ bestres = posDict[pos][0]
+ dist1 = abs(bestres[3] - loc)
+ dist2 = abs(bestres[4] - loc)
+ if dist1 < dist2:
+ bestdist = dist1
+ else:
+ bestdist = dist2
+
+ for testres in posDict[pos]:
+ testdist1 = abs(testres[3] - loc)
+ testdist2 = abs(testres[4] - loc)
+ if testdist1 < testdist2:
+ testdist = testdist1
+ else:
+ testdist = testdist2
+
+ if testdist < bestdist:
+ bestdist = testdist
+ bestres = testres
+
+ geneID = bestres[0]
+ elif len(posDict[pos]) > 1:
+ (chrom, loc) = pos
+ bestres = posDict[pos][0]
+ dist1 = abs(bestres[3] - loc)
+ dist2 = abs(bestres[4] - loc)
+ bestStrand = posDict[pos][-1]
+ if dist1 < dist2:
+ bestdist = dist1
+ else:
+ bestdist = dist2
+
+ for testres in posDict[pos]:
+ testdist1 = abs(testres[3] - loc)
+ testdist2 = abs(testres[4] - loc)
+ testStrand = testres[-1]
+ if testdist1 < testdist2:
+ testdist = testdist1
+ else:
+ testdist = testdist2
+
+ if bestStrand != posStrand[pos] and testStrand == posStrand[pos]:
+ bestdist = testdist
+ bestres = testres
+ bestStrand = testStrand
+ elif testdist < bestdist:
+ bestdist = testdist
+ bestres = testres
+
+ if bestStrand == posStrand[pos]:
+ geneID = bestres[0]
+
+ if geneID != "":
+ try:
+ if genome == "dmelanogaster":
+ symbol = geneinfoDict["Dmel_" + geneID][0][0]
+ else:
+ symbol = geneinfoDict[geneID][0][0]
+ except:
+ try:
+ symbol = geneannotDict[(genome, geneID)][0]
+ except:
+ symbol = "LOC" + geneID
+ else:
+ continue
+
+ if pos in altPosList and pos in posList:
+ posList.remove(pos)
+ if pos not in altPosRevDict:
+ continue
+
+ if altPosRevDict[pos] in posList:
+ posList.remove(altPosRevDict[pos])
+
+ pos = altPosRevDict[pos]
+ elif pos in posList:
+ posList.remove(pos)
+ if pos not in altPosDict:
+ print pos
+ continue
+
+ if altPosDict[pos] in posList:
+ posList.remove(altPosDict[pos])
+ else:
+ continue
+
+ if (symbol, geneID) not in geneList:
+ geneList.append((symbol, geneID))
+ geneDict[(symbol, geneID)] = []
+
+ if pos not in geneDict[(symbol, geneID)]:
+ geneDict[(symbol, geneID)].append(pos)
+
+ for (symbol, geneID) in geneList:
+ geneDict[(symbol, geneID)].sort()
+ seenLine = []
+ for pos in geneDict[(symbol, geneID)]:
+ if pos in altPosRevDict:
+ pos = altPosRevDict[pos]
+
+ if posLine[pos] in seenLine:
+ continue
+
+ if "\t" in symbol:
+ symbol = symbol.replace("\t","|")
+
+ if " " in symbol:
+ symbol = symbol.replace(" ","_")
+
+ line = "%s %s %s" % (symbol, geneID, posLine[pos])
+ seenLine.append(posLine[pos])
+ outfile.write(line)
+
+ matchIndex = 0
+ if nomatchfilename != "":
+ nomatchfile = open(nomatchfilename, "w")
+
+ prevStart = 0
+ prevChrom = ""
+ farIndex = 0
+ start = 0
+ for pos in posList:
+ if pos not in altPosList:
+ if nomatchfilename != "":
+ nomatchfile.write(posLine[pos])
+
+ matchIndex += 1
+ # need to add strand tracking here.....
+ if trackFar:
+ (chrom, start) = pos
+ if chrom != prevChrom:
+ farIndex += 1
+ prevChrom = chrom
+ elif abs(int(start) - prevStart) > maxRadius:
+ farIndex += 1
+
+ line = "FAR%d %d %s" % (farIndex, -1 * farIndex, posLine[pos])
+ outfile.write(line)
+ prevStart = int(start)
+
+ if nomatchfilename != "":
+ nomatchfile.close()
+
+ print "%d sites without a gene within radius of %d" % (matchIndex, radius)
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys, optparse
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+from cistematic.core.motif import Motif, hasMotifExtension
+from cistematic.core import complement
+from cistematic.genomes import Genome
+from commoncode import readDataset, getMergedRegions, findPeak
+
+print "%prog: version 2.4"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--dataset", dest="chipfilename")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--best", action="store_true", dest="bestOnly",
+ help="only report the best position for each region")
+ parser.add_option("--usepeak", action="store_true", dest="usePeak",
+ help="use peak position and height from regions file")
+ parser.add_option("--printseq", action="store_true", dest="printSeq")
+ parser.add_option("--nomerge", action="store_true", dest="noMerge")
+ parser.add_option("--markov1", action="store_true", dest="doMarkov1")
+ parser.add_option("--rank", type="int", dest="useRank",
+ help="return region ranking based on peak height ranking [requires --usepeak]")
+ parser.set_defaults(chipfilename="", doCache=False, bestOnly=False, usePeak=False,
+ printSeq=False, doMarkov1=False, useRank=False, noMerge=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 5:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ motfilename = args[1]
+ motThreshold = float(args[2])
+ infilename = args[3]
+ outfilename = args[4]
+
+ getallsites(genome, motfilename, motThreshold, infilename, outfilename, options.chipfilename,
+ options.doCache, options.bestOnly, options.usePeak, options.printSeq, options.doMarkov1,
+ options.useRank, options.noMerge)
+
+
+def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
+ doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
+ useRank=False, noMerge=False):
+
+ if motThreshold < 1.0 and doMarkov1:
+ print "motThreshold should be between 1.0 and 10.0 for markov1"
+ sys.exit(1)
+ elif motThreshold < 55.0 and not doMarkov1:
+ print "motThreshold should be between 55 and 99 for a regular PSFM"
+ sys.exit(1)
+
+ if hasMotifExtension:
+ print "will use cistematic.core.motif C-extension to speed up motif search"
+
+ if useRank and usePeak:
+ print "will return region ranking based on peak height ranking"
+ useRank = True
+ else:
+ print "ignoring '-rank': can only use ranking when using a region file with peak position and height"
+ useRank = False
+
+ mot = Motif("", motifFile=motfilename)
+ motLen = len(mot)
+ bestScore = mot.bestConsensusScore()
+
+ hg = Genome(genome)
+
+ # minHits=-1 will force regions to be used regardless
+ # maxDist= 0 prevents merging of non-overlapping regions
+ if noMerge:
+ regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak)
+ else:
+ regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, keepPeak=usePeak)
+
+ doRDS = False
+ if chipfilename:
+ doRDS = True
+
+ if doRDS:
+ hitRDS = readDataset(chipfilename, verbose = True, cache=doCache)
+
+ outfile = open(outfilename, "w")
+
+ regionList = []
+
+ for chrom in regions:
+ if "rand" in chrom or "M" in chrom:
+ continue
+
+ if usePeak:
+ for (start, stop, length, peakPos, peakHeight) in regions[chrom]:
+ regionList.append((peakHeight, chrom, start, length, peakPos))
+ else:
+ for (start, stop, length) in regions[chrom]:
+ regionList.append((chrom, start, length))
+
+ if usePeak:
+ regionList.sort()
+ regionList.reverse()
+
+ notFoundIndex = 0
+ currentChrom = ""
+ count = 0
+ for tuple in regionList:
+ if usePeak:
+ (rpeakheight, rchrom, start, length, rpeakpos) = tuple
+ else:
+ (rchrom, start, length) = tuple
+
+ try:
+ seq = hg.sequence(rchrom, start, length)
+ except:
+ print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length)
+ continue
+
+ count += 1
+ numHits = -1
+ if usePeak:
+ peakpos = rpeakpos
+ if useRank:
+ numHits = count
+ else:
+ numHits = rpeakheight
+ elif doRDS:
+ if rchrom != currentChrom:
+ fullchrom = "chr" + rchrom
+ hitDict = hitRDS.getReadsDict(chrom=fullchrom)
+ currentChrom = rchrom
+
+ (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length)
+ if len(topPos) == 0:
+ print "topPos error"
+
+ peakpos = topPos[0]
+
+ found = []
+ if doMarkov1:
+ matches = mot.locateMarkov1(seq, motThreshold)
+ else:
+ matches = mot.locateMotif(seq, motThreshold)
+
+ for (pos, sense) in matches:
+ alreadyFound = False
+ for (fpos, fdist) in found:
+ if pos + start == fpos:
+ alreadyFound = True
+
+ if not alreadyFound:
+ if usePeak:
+ found.append((start + pos, start + pos + motLen/2 - peakpos))
+ elif doRDS:
+ found.append((start + pos, pos + motLen/2 - peakpos))
+ else:
+ found.append((start + pos, -1))
+
+ foundValue = False
+ bestList = []
+ for (foundpos, peakdist) in found:
+ seq = hg.sequence(rchrom, foundpos, motLen)
+ foundValue = True
+ (front, back) = mot.scoreMotif(seq)
+ sense = "+"
+ if front >= back:
+ score = int(100 * front / bestScore)
+ else:
+ score = int(100 * back / bestScore)
+ sense = "-"
+ seq = complement(seq)
+
+ if printSeq:
+ print seq
+
+ outline = "chr%s:%d-%d\t%d\t%d\t%d\tchr%s:%d-%d\t%s\n" % (rchrom, foundpos, foundpos + motLen - 1, score, numHits, peakdist, rchrom, start, start + length, sense)
+ if bestOnly:
+ bestList.append((abs(peakdist), outline))
+ else:
+ outfile.write(outline)
+
+ if bestOnly and foundValue:
+ bestList.sort()
+ outfile.write(bestList[0][1])
+
+ if not foundValue:
+ if printSeq:
+ print "could not find a %s site for %s:%d-%d" % (mot.tagID, rchrom, start, start+ length)
+
+ notFoundIndex += 1
+ if (count % 10000) == 0 and not printSeq:
+ print count
+
+ outfile.close()
+ print "did not find motif in %d regions" % notFoundIndex
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+ main(sys.argv)
--- /dev/null
+#
+# getfasta.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, findPeak
+from cistematic.genomes import Genome
+
+# Announce the script name and version when the module is loaded.
+print "%s: version 3.4" % sys.argv[0]
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s genome regionfile outfilename [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--seqradius", type="int", dest="seqsize")
+ parser.add_option("--minreads", type="int", dest="minHitThresh")
+ parser.add_option("--returnTop", type="int", dest="topRegions")
+ parser.add_option("--maxsize", type="int", dest="maxsize")
+ parser.add_option("--usepeak", action="store_true", dest="usePeaks")
+ parser.add_option("--dataset", dest="hitfile")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--compact", action="store_true", dest="doCompact")
+ parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
+ usePeaks=False, hitfile=None, doCache=False, doCompact=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ regionfile = args[1]
+ outfilename = args[2]
+
+ getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
+ options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
+ options.doCache, options.doCompact)
+
+
+def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
+ maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
+ doDataset = False
+ if hitfile is not None:
+ if usePeaks:
+ print "ignoring dataset and relying on peak data"
+ else:
+ doDataset = True
+
+ if doCompact:
+ mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
+ chromField=0, compact=True, keepPeak=usePeaks,
+ returnTop=topRegions)
+ else:
+ mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
+ keepPeak=usePeaks, returnTop=topRegions)
+
+ if usePeaks:
+ ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
+ elif doDataset:
+ hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
+ else:
+ ncregions = getDefaultRegion(mergedRegions, maxsize)
+
+ writeFastaFile(ncregions, genome, outfilename, seqsize)
+
+
+def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
+ hg = Genome(genome)
+ outfile = open(outfilename, "w")
+ for chrom in ncregions:
+ for regionDict in ncregions[chrom]:
+ rstart = regionDict["start"]
+ rlen = regionDict["length"]
+ topPos = regionDict["topPos"]
+ if topPos[0] >= 0:
+ newrstart = rstart + topPos[0] - seqsize
+ newrlen = 2 * seqsize + 1
+ else:
+ newrstart = rstart
+ newrlen = rlen
+
+ seq2 = hg.sequence(chrom, newrstart, newrlen)
+ outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2))
+
+ outfile.close()
+
+
+def getDefaultRegion(regionDict, maxsize):
+ ncregions = {}
+ for chrom in regionDict:
+ ncregions[chrom] = []
+
+ for achrom in regionDict:
+ print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
+ for region in regionDict[achrom]:
+ (rstart, rstop, rlen) = region
+
+ if rlen > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ continue
+
+ resultDict = {"start": rstart,
+ "length": rlen,
+ "topPos": [-1]
+ }
+ ncregions[achrom].append(resultDict)
+
+ return ncregions
+
+
+def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
+
+ ncregions = {}
+ for chrom in regionDict:
+ ncregions[chrom] = []
+
+ for achrom in regionDict:
+ print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
+ for region in regionDict[achrom]:
+ (rstart, rstop, rlen, peakPos, peakHeight) = region
+
+ if rlen > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ continue
+
+ topPos = peakPos - rstart
+ if peakHeight > minHitThresh:
+ resultDict = {"start": rstart,
+ "length": rlen,
+ "topPos": [topPos]
+ }
+ ncregions[achrom].append(resultDict)
+
+ return ncregions
+
+
+def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
+
+ readlen = hitRDS.getReadSize()
+
+ ncregions = {}
+ for chrom in regionDict:
+ ncregions[chrom] = []
+
+ for achrom in regionDict:
+ print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
+ for region in regionDict[achrom]:
+ (rstart, rstop, rlen) = region
+
+ if rlen > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ continue
+
+ thechrom = "chr%s" % achrom
+ print "."
+ hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
+ print "hitDict length: %d", len(hitDict[thechrom])
+ (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
+ if numHits > minHitThresh:
+ resultDict = {"start": rstart,
+ "length": rlen,
+ "topPos": topPos
+ }
+ ncregions[achrom].append(resultDict)
+
+ return ncregions
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+from cistematic.genomes import Genome
+from math import log
+import os.path
+import sys
+import optparse
+import matplotlib
+from pylab import *
+
+print "%prog: version 2.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog genome outimage gofileroot1 title1 cohortsize1 [gofileroot2 title2 cohortsize2 ...] [--fontsize pts] [--length in] [--width in]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--fontsize", type="int", dest="fontSize")
+ parser.add_option("--length", type="int", dest="length")
+ parser.add_option("--width", type="int", dest="width")
+ parser.set_defaults(fontSize=5, length=10, width=7)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 5:
+ print usage
+ sys.exit(1)
+
+ genome = args[0]
+ imagename = args[1]
+
+ conditionList = args[2:]
+ conditions = len(conditionList) / 3
+ fileroots = []
+ titles = []
+ for index in range(conditions):
+ conditionIndex = index * 3
+ fileroots.append(conditionList[conditionIndex])
+ titles.append((conditionList[conditionIndex + 1], "(%s)" % conditionList[conditionIndex + 2]))
+
+ getgosig(genome, imagename, fileroots, titles, options.fontSize, options.length, options.width)
+
+
+def getgosig(genome, imagename, fileroots=[], titles=[], fontSize=5, length=10, width=7):
+ hg = Genome(genome)
+ allgodesc = hg.allGOterms()
+ godesc = []
+
+ matplotlib.use("Agg")
+
+ doGray = False
+
+ rootdir = "./"
+ htmlname = imagename[:-4] + ".html"
+
+ ceiling = 40.0
+ goterms = []
+ goscores = {}
+ numgenes = {}
+ possiblegenes = {}
+ flatArray = []
+
+ highestPval = 0.0
+ lowestPval = 1.0
+ for sigfile in fileroots:
+ infile = open(rootdir + sigfile + ".gosig", "r")
+ for line in infile:
+ if "depleted" in line:
+ continue
+
+ fields = line.split("\t")
+ if fields[0] not in goterms:
+ goterms.append(fields[0])
+ goscores[fields[0]] = []
+ numgenes[fields[0]] = []
+ possiblegenes[fields[0]] = 0
+
+ if float(fields[3]) > highestPval:
+ highestPval = float(fields[3])
+
+ if float(fields[3]) < lowestPval:
+ lowestPval = float(fields[3])
+
+ print highestPval
+ print lowestPval
+
+ boundaryScore = score = -1 * log(highestPval) / (2.0 * ceiling) + 0.49
+ print boundaryScore
+
+ cdict = {"red": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.1), (1.0, 1.0, 1.0)),
+ "green": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.1), (1.0, 1.0, 1.0)),
+ "blue": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.75), (1.0, 0.0, 0.0))
+ }
+
+ mymap = matplotlib.colors.LinearSegmentedColormap("my_colormap", cdict, 1024)
+
+ goindex = 0
+ for zfile in fileroots:
+ infile = open(rootdir + zfile + ".gozscore", "r")
+ for line in infile:
+ fields = line.split()
+ goindex += 1
+ if fields[0] not in goterms:
+ continue
+
+ score = -1 * log(float(fields[7])) / (2.0 * ceiling)
+ if score < -0.5:
+ score = -0.5
+
+ if score > 0.5:
+ score = 0.5
+
+ score += 0.5
+ if doGray:
+ score = 1 - score
+
+ goscores[fields[0]].append(score)
+ numgenes[fields[0]].append(fields[1])
+ possiblegenes[fields[0]] = int(fields[4])
+
+ goindex /= len(fileroots)
+
+ gokeys = goscores.keys()
+ gosortarray = []
+ for term in gokeys:
+ gosortarray.append(goscores[term] + [term])
+
+ gosortarray.sort()
+
+ htmlfile = open(htmlname, "w")
+ htmlfile.write('<html><head><title>GO Analysis</title></head><body><table border="1">')
+ htmlfile.write("<tr><th>Description</th><th>possible</th>")
+ for entry in titles:
+ htmlfile.write("<th>%s<br>%s</th>" % entry)
+
+ htmlfile.write("</tr>\n")
+ tableLines = []
+
+ for entry in gosortarray:
+ term = entry[-1]
+ outline = "%s:\t" % term
+ for entry in goscores[term]:
+ outline += str(round(entry, 4)) + "\t"
+
+ print outline
+ htmlLine = "<tr><th>%s</th><th>%d</th>" % (allgodesc[term], possiblegenes[term])
+ index = 0
+ for fileroot in fileroots:
+ gofile = fileroot + "." + term[3:]
+ ngene = numgenes[term][index]
+ if os.path.exists(gofile):
+ htmlLine += '<td><a href="%s">%s</a></td>' % (gofile, ngene)
+ else:
+ htmlLine += "<td>%s</td>" % (ngene)
+
+ index += 1
+
+ tableLines.append(htmlLine + "</tr>\n")
+ flatArray.append(goscores[term])
+ godesc.append(allgodesc[term])
+
+ tableLines.reverse()
+ for line in tableLines:
+ htmlfile.write(line)
+
+ htmlfile.write("<tr><th>Cohort Size:</th>")
+ htmlfile.write("</tr>\n")
+ htmlfile.write("</table></body></html>")
+
+ figure(figsize=(length, width))
+ myaxe = axes([0.3, 0.1, 0.55, 0.75])
+
+ Z = array(flatArray)
+ print Z.shape
+ if doGray:
+ c = pcolor(Z, cmap=cm.gray, vmin=0.0, vmax=1.0)
+ else:
+ c = pcolor(Z, cmap=mymap, vmin=0.0, vmax=1.0)
+
+ c.set_linewidth(0.1)
+ clim(0.0, 1.0)
+
+ ind = arange(len(fileroots))
+ width = 0.5
+
+ coordy = 0.1
+ deltaX = 1.0
+ deltaY = 1.0
+
+ pcolorAxes = c.get_axes()
+ for entry in gosortarray:
+ term = entry[-1]
+ coordx = 0.4
+ for genenum in numgenes[term]:
+ if len(genenum) == 1:
+ genenum = " " + genenum
+ elif len(genenum) == 2:
+ genenum = " " + genenum
+
+ pcolorAxes.text(coordx, coordy, genenum, fontsize=fontSize)
+ coordx += deltaX
+
+ coordy += deltaY
+
+ coordx = 0
+ for (line1,line2) in titles:
+ pcolorAxes.text(coordx + 0.1, coordy + 3 * deltaY + 0.5, line1, fontsize=int(fontSize*1.5))
+ pcolorAxes.text(coordx + 0.1, coordy + deltaY, line2, fontsize=int(fontSize*1.5))
+ coordx += deltaX
+
+ setp(gca(), "xticks", [])
+ setp(gca(), "xticklabels", [])
+ setp(gca(), "yticks", arange(len(godesc)))
+ setp(gca(), "yticklabels", godesc)
+ locs, labels = yticks()
+ setp(labels, fontsize=fontSize)
+ setp(labels, verticalalignment="bottom")
+ setp(gca(), "ylim", [0, len(godesc)])
+
+ figtext(0.3,0.02, str(goindex - len(gokeys)) + " additional GO Terms below threshold of significance", fontsize=fontSize*2)
+
+ d = colorbar(orientation="vertical", drawedges=False)
+ for t in d.ax.get_yticklabels():
+ t.set_fontsize(0)
+
+ locs, labels = yticks()
+ setp(labels, fontsize=5)
+ pcolorAxes.text(conditions + 1,len(godesc), str(lowestPval), fontsize=fontSize*2)
+ pcolorAxes.text(conditions + 1,boundaryScore * len(godesc), str(highestPval), fontsize=fontSize*2)
+
+ savefig(imagename, dpi=250)
+ show()
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# getmers.py
+# ENRAGE
+#
+
+import sys
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+from cistematic.genomes import Genome
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ print '%s: version 1.1' % argv[0]
+
+ if len(sys.argv) < 5:
+ print 'usage: python %s genome merlen chrAny:start-stop outfile' % argv[0]
+ exit(1)
+
+ genome = argv[1]
+ merlen = int(argv[2])
+ location = argv[3]
+ outfilename = argv[4]
+
+ getmers(genome, merlen, location, outfilename)
+
+
+def getmers(genome, merlen, location, outfilename):
+ (chrom, pos) = location.split(':')
+ chrom = chrom[3:]
+ (start, stop) = pos.split('-')
+ start = int(start)
+ regionlength = int(stop) - start + 1
+
+ hg = Genome(genome)
+
+ seq = hg.sequence(chrom, start, regionlength)
+
+ outfile = open(outfilename,'w')
+ print 'writing %d %d-mers' % (regionlength - merlen, merlen)
+ for index in range(regionlength - merlen):
+ outfile.write(seq[index:index + merlen].upper() + '\n')
+
+ outfile.close()
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+import optparse
+import string
+try:
+ import psyco
+ psyco.full()
+except:
+ print "psyco not running"
+from cistematic.core import complement
+from cistematic.genomes import Genome
+
+
+def main(argv=None):
+    """Parse the command line and build the splice-junction FASTA file.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Positional arguments are genome, ucscModels,
+    outfilename and maxBorder (an integer).  Exits with status 1 when a
+    positional argument is missing or maxBorder is not an integer.
+    """
+    if not argv:
+        argv = sys.argv
+
+    print("%s: version 1.0" % argv[0])
+    delimiter = "|"
+
+    # "%%prog" survives the delimiter substitution as "%prog", which optparse
+    # expands itself.  A bare "%prog" here makes the % operator raise
+    # ValueError (unsupported format character 'p') before anything runs.
+    usage = ("usage: python %%prog genome ucscModels outfilename maxBorder [--verbose] [--spacer num]"
+             "\n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)"
+             "\n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--verbose", action="store_true", dest="doVerbose",
+                      help="show verbose messages [default: False]")
+    parser.add_option("--spacer", type="int", dest="spacer",
+                      help="number of spacer NTs to use [default: 2]")
+    parser.set_defaults(doVerbose=False, spacer=2)
+    (options, args) = parser.parse_args(argv[1:])
+
+    try:
+        genome = args[0]
+        datafilename = args[1]
+        outfilename = args[2]
+        # used in coordinate arithmetic downstream, so it must be a number
+        maxBorder = int(args[3])
+    except (IndexError, ValueError):
+        parser.print_usage()
+        sys.exit(1)
+
+    getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)
+
+
+def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
+    """Write one FASTA record per distinct splice junction of the gene models.
+
+    Each whitespace-separated line of datafilename is read as: name (field
+    0), chromosome with a three-character prefix (field 1), exon count
+    (field 7), and comma-terminated exon start / stop lists (fields 8, 9).
+    For every junction not yet seen on its chromosome, maxBorder bases
+    before the upstream exon stop and maxBorder bases from the downstream
+    exon start are fetched, upper-cased, and written between two runs of
+    `spacer` N characters.  The record header is name, junction index and
+    region start joined by `delimiter`.
+
+    maxBorder may be an int or a numeric string.  Progress and summary
+    counts are printed to stdout.
+    """
+    # callers have historically passed the raw command-line string
+    maxBorder = int(maxBorder)
+    spacerseq = "N" * spacer
+
+    hg = Genome(genome)
+
+    spliceCountDict = {}
+    exonStartDict = {}
+    exonStopDict = {}
+    exonLengthDict = {}
+    nameToChromDict = {}
+    # chromosome -> set of (exon stop, next exon start) pairs already written
+    alreadySeen = {}
+    counter = 0
+
+    datafile = open(datafilename)
+    try:
+        for line in datafile:
+            fields = line.split()
+            name = fields[0]
+            spliceCount = int(fields[7]) - 1
+            if spliceCount < 1:
+                continue
+
+            counter += spliceCount
+            spliceCountDict[name] = spliceCount
+            chrom = fields[1][3:]
+            # the prefix was stripped on the line above, so the mitochondrial
+            # chromosome reads "M" here; comparing with "chrM" never matched
+            if chrom == "M":
+                continue
+
+            nameToChromDict[name] = chrom
+            if chrom not in alreadySeen:
+                alreadySeen[chrom] = set()
+
+            exonStarts = [int(val) for val in fields[8].split(",")[:-1]]
+            exonStops = [int(val) for val in fields[9].split(",")[:-1]]
+            exonStartDict[name] = exonStarts
+            exonStopDict[name] = exonStops
+            exonLengthDict[name] = [exonStops[index] - exonStarts[index]
+                                    for index in range(spliceCount + 1)]
+    finally:
+        datafile.close()
+
+    print(len(spliceCountDict))
+    print(counter)
+
+    missedCount = 0
+    depressedCount = 0
+    splicefileindex = 1
+    spliceCounter = 0
+    outfile = open(outfilename, "w")
+    try:
+        for name in nameToChromDict:
+            spliceCount = spliceCountDict[name]
+            exonStarts = exonStartDict[name]
+            exonStops = exonStopDict[name]
+            exonLengths = exonLengthDict[name]
+            chrom = nameToChromDict[name]
+            seenJunctions = alreadySeen[chrom]
+            for index in range(spliceCount):
+                junction = (exonStops[index], exonStarts[index + 1])
+                if junction in seenJunctions:
+                    continue
+
+                seenJunctions.add(junction)
+                regionstart = exonStops[index] - maxBorder
+                flankingLength = exonLengths[index] + exonLengths[index + 1]
+                if flankingLength < maxBorder + spacer:
+                    missedCount += 1
+                    continue
+
+                if flankingLength < 2 * maxBorder:
+                    depressedCount += 1
+
+                # Both flanks are always fetched at the full maxBorder width,
+                # even when an exon is shorter than that.
+                try:
+                    beforeSplice = hg.sequence(chrom, regionstart, maxBorder)
+                    afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder)
+                except Exception:
+                    if doVerbose:
+                        print("could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1]))
+
+                    continue
+
+                sequenceHeader = "".join([name, delimiter, str(index), delimiter, str(regionstart)])
+                spliceJunctionSequence = "".join([spacerseq, beforeSplice.upper(), afterSplice.upper(), spacerseq])
+                outfile.write(">%s\n%s\n" % (sequenceHeader, spliceJunctionSequence))
+
+            splicefileindex += 1
+            spliceCounter += 1
+            if spliceCounter > 10000:
+                print("%d genes" % splicefileindex)
+                spliceCounter = 0
+    finally:
+        outfile.close()
+
+    print("%d splices too short to be seen" % missedCount)
+    print("%d splices will be under-reported" % depressedCount)
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+
+print("%s: version 1.0" % sys.argv[0])
+if len(sys.argv) < 3:
+    print("usage: python %s infile.gff outfile.cis\n" % sys.argv[0])
+    print("\tTHIS SCRIPT WILL MOST LIKELY NEED TO BE EDITED FOR YOUR GFF FILE\n")
+    sys.exit(1)
+
+index = 1
+# Cistematic just wants a set of exons labeled "CDS", "5UTR", and "3UTR":
+# put the corresponding feature type from your GFF file as the key of the
+# key:value pairs in the ftypeDict below.
+ftypeDict = {"CDS": "CDS",
+             "mRNA": "mRNA",
+             "five_prime_utr": "5UTR",
+             "three_prime_utr": "3UTR"
+}
+
+chrom = ""
+idfields = ""
+gene = ""
+# geneid and source are only assigned by mRNA records; the defaults keep a
+# feature that precedes the first mRNA line from hitting an unbound name.
+geneid = ""
+source = ""
+sense = ""
+start = 0
+stop = 0
+ftype = ""
+
+infile = open(sys.argv[1])
+outfile = open(sys.argv[2], "w")
+try:
+    for line in infile:
+        if line[0] == "#":
+            continue
+
+        fields = line.strip().split()
+        # Blank or truncated lines have no feature-type column; skip them
+        # instead of aborting the whole conversion.
+        if len(fields) > 2 and fields[2] in ftypeDict:
+            try:
+                # this part of the code will need to be customized, most likely
+                # how does the annotation define the gene, geneid, and chromosome
+                # for example, for Anopheles Gambiae we have
+                #chrX VectorBase mRNA 582 16387 . - . ID=vectorbase|AGAP000002-RA; stable_id=AGAP000002-RA.1; Parent=vectorbase|AGAP000002;
+                if fields[2] == "mRNA":
+                    chrom = fields[0][3:]
+                    source = fields[1]
+                    idfields = fields[9].split(";")
+                    geneid = idfields[0].split("=")[1]
+                    sense = fields[6]
+                else:
+                    start = int(fields[3])
+                    stop = int(fields[4])
+                    ftype = ftypeDict[fields[2]]
+                    outline = "%s\t%s%d\t%s\t%d\t%d\t%s\t%s\n" % (geneid, source, index, chrom, start, stop, sense, ftype)
+                    outfile.write(outline)
+            except (IndexError, ValueError):
+                # A malformed record used to end the run silently with exit
+                # status 0; report it so truncated output is not mistaken
+                # for a complete conversion.
+                sys.stderr.write("could not parse GFF line: %s\n" % line.rstrip())
+                sys.exit(1)
+
+        index += 1
+finally:
+    infile.close()
+    outfile.close()
--- /dev/null
+#
+# gointersects.py
+# ENRAGE
+#
+
+import sys
+
+print "%s: version 1.0" % sys.argv[0]
+
+def main(argv=None):
+    """Read the three file names from the command line and run gointersects.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Prints the usage line and exits with status 1
+    when fewer than three file names are supplied.
+    """
+    arguments = argv or sys.argv
+
+    if len(arguments) < 4:
+        print("usage: python %s gogidfile gidfile outfile" % arguments[0])
+        sys.exit(1)
+
+    gointersects(arguments[1], arguments[2], arguments[3])
+
+
+def gointersects(gogidfilename, gidfilename, outfilename):
+    """Copy the lines of gidfilename whose first field occurs in gogidfilename.
+
+    The first whitespace-separated field of every line in gogidfilename forms
+    the set of accepted identifiers.  Lines of gidfilename whose first field
+    is in that set are written unchanged, in their original order, to
+    outfilename.  Blank lines in either input are ignored.
+    """
+    # a set keeps the membership test O(1) per line of the second file
+    acceptedIDs = set()
+    gogidfile = open(gogidfilename)
+    try:
+        for line in gogidfile:
+            fields = line.split()
+            if fields:
+                acceptedIDs.add(fields[0])
+    finally:
+        gogidfile.close()
+
+    gidfile = open(gidfilename)
+    try:
+        outfile = open(outfilename, "w")
+        try:
+            for line in gidfile:
+                fields = line.split()
+                if fields and fields[0] in acceptedIDs:
+                    outfile.write(line)
+        finally:
+            outfile.close()
+    finally:
+        gidfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# intersects.py
+# ENRAGE
+#
+
+import sys, optparse
+
+print 'version 2.0'
+
+def main(argv=None):
+    """Parse the command line and intersect two or three candidate files.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Positional arguments are infile1, infile2 and
+    outfile.  Prints the usage string and exits with status 1 when fewer
+    than three are given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog infile1 infile2 outfile [options]"
+
+    # optparse only accepts "-x" or "--word" option strings, so the original
+    # "-reject1" / "-trackGID" registrations raised OptionError.  Keep the old
+    # single-dash spellings usable by rewriting them before parsing.
+    legacySpellings = {"-reject1": "--reject1", "-trackGID": "--trackGID"}
+    commandLine = [legacySpellings.get(arg, arg) for arg in argv[1:]]
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("-d", dest="delimiter")
+    parser.add_option("--file3", dest="infile3")
+    # each dest matches the attribute read below, so the option value is no
+    # longer dropped in favour of the default
+    parser.add_option("-1", type="int", dest="matchField1")
+    parser.add_option("-2", type="int", dest="matchField2")
+    parser.add_option("-3", type="int", dest="matchField3")
+    parser.add_option("--reject1", dest="rejectFileName")
+    parser.add_option("--trackGID", action="store_true", dest="trackGID")
+    parser.set_defaults(delimiter="\t", infile3=None, matchField1=0, matchField2=0,
+                        matchField3=0, rejectFileName="", trackGID=False)
+    (options, args) = parser.parse_args(commandLine)
+
+    if len(args) < 3:
+        print(usage)
+        sys.exit(1)
+
+    infile1 = args[0]
+    infile2 = args[1]
+    outfile = args[2]
+
+    intersects(infile1, infile2, outfile, options.delimiter, options.infile3,
+               options.matchField1, options.matchField2, options.matchField3,
+               options.rejectFileName, options.trackGID)
+
+
+def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None,
+               matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False):
+    """Write the candidates common to two (or three) files to outfileName.
+
+    Candidates are the matchFieldN-th delimiter-separated column of each
+    input.  With two files the output holds the candidates of file 1 that
+    also occur in file 2; with a third file it holds those present in all
+    three, and the pairwise-only overlap counts are printed as well.  Output
+    order follows file 1.
+
+    rejectFileName, when non-empty, receives every candidate of file 1 that
+    matched no other file.  With trackGID the column after the match field
+    is appended to each written candidate.
+    """
+    doFile3 = infile3Name is not None
+
+    gidDict = {}
+    list3 = []
+    if trackGID:
+        list1, fileGIDDict = getCandidatesAndGIDFromFile(infile1Name, delimiter, matchField1, gidDict.keys())
+        gidDict.update(fileGIDDict)
+
+        list2, fileGIDDict = getCandidatesAndGIDFromFile(infile2Name, delimiter, matchField2, gidDict.keys())
+        gidDict.update(fileGIDDict)
+
+        if doFile3:
+            list3, fileGIDDict = getCandidatesAndGIDFromFile(infile3Name, delimiter, matchField3, gidDict.keys())
+            gidDict.update(fileGIDDict)
+    else:
+        list1 = getCandidateListFromFile(infile1Name, delimiter, matchField1)
+        list2 = getCandidateListFromFile(infile2Name, delimiter, matchField2)
+        if doFile3:
+            list3 = getCandidateListFromFile(infile3Name, delimiter, matchField3)
+
+    # sets make each membership test O(1); the lists still drive the order
+    members1 = set(list1)
+    members2 = set(list2)
+    members3 = set(list3)
+
+    matchedList = []
+    matchedList12 = []
+    matchedList13 = []
+    rejectedList = []
+    for candidate in list1:
+        inFile2 = candidate in members2
+        inFile3 = doFile3 and candidate in members3
+        if inFile2 and (inFile3 or not doFile3):
+            matchedList.append(candidate)
+        elif inFile3:
+            matchedList13.append(candidate)
+        elif inFile2:
+            matchedList12.append(candidate)
+        else:
+            rejectedList.append(candidate)
+
+    matchedList23 = []
+    if doFile3:
+        matchedList23 = [candidate for candidate in list2
+                         if candidate not in members1 and candidate in members3]
+
+    if rejectFileName:
+        # the reject file is an output: it must be opened for writing
+        reject1file = open(rejectFileName, "w")
+        try:
+            for candidate in rejectedList:
+                if trackGID:
+                    reject1file.write("%s%s%s\n" % (candidate, delimiter, gidDict[candidate]))
+                else:
+                    reject1file.write("%s\n" % candidate)
+        finally:
+            reject1file.close()
+
+    # list3 only exists as real data when a third file was supplied
+    if doFile3:
+        print("%d %d %d" % (len(list1), len(list2), len(list3)))
+        print("%d %d %d" % (len(matchedList12), len(matchedList13), len(matchedList23)))
+    else:
+        print("%d %d" % (len(list1), len(list2)))
+
+    print(len(matchedList))
+
+    outfile = open(outfileName, "w")
+    try:
+        for match in matchedList:
+            if trackGID:
+                outfile.write("%s%s%s\n" % (match, delimiter, gidDict[match]))
+            else:
+                outfile.write("%s\n" % match)
+    finally:
+        outfile.close()
+
+
+def getCandidatesFromFile(filename, delimiter, matchField, trackGID=False, gidList=None):
+    """Collect the distinct candidates found in one column of a text file.
+
+    Lines starting with "#" are skipped.  Each remaining line is stripped and
+    split on delimiter; the matchField-th column is the candidate.
+
+    Returns (candidateList, gidDict).  candidateList holds each candidate
+    once, in first-seen order.  gidDict is filled only when trackGID is
+    true: it maps each candidate not already in gidList to the column after
+    matchField (the last occurrence in the file wins).
+    """
+    knownGIDs = set(gidList or ())
+    candidateList = []
+    # companion set so the duplicate check does not rescan candidateList
+    seenCandidates = set()
+    gidDict = {}
+
+    infile = open(filename)
+    try:
+        for line in infile:
+            if line[0] == "#":
+                continue
+
+            fields = line.strip().split(delimiter)
+            candidate = fields[matchField]
+            if candidate not in seenCandidates:
+                seenCandidates.add(candidate)
+                candidateList.append(candidate)
+
+            if trackGID and candidate not in knownGIDs:
+                gidDict[candidate] = fields[matchField + 1]
+    finally:
+        infile.close()
+
+    return candidateList, gidDict
+
+
+def getCandidatesAndGIDFromFile(filename, delimiter, matchField, gidList=None):
+    """Return (candidateList, gidDict) for filename with GID tracking enabled.
+
+    gidList names the candidates whose GID is already known; it is forwarded
+    to getCandidatesFromFile so those candidates are left out of the returned
+    gidDict.  Previously the argument was accepted but an empty list was
+    always passed on.
+    """
+    if gidList is None:
+        gidList = []
+
+    return getCandidatesFromFile(filename, delimiter, matchField, trackGID=True, gidList=gidList)
+
+
+def getCandidateListFromFile(filename, delimiter, matchField):
+    """Return only the candidate list that getCandidatesFromFile produces."""
+    return getCandidatesFromFile(filename, delimiter, matchField)[0]
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# listGeneFeatures.py
+# ENRAGE
+#
+
+import sys
+from cistematic.genomes import Genome
+from commoncode import getMergedRegions, getFeaturesByChromDict
+
+print "%s: version 1.1" % sys.argv[0]
+
+
+def main(argv=None):
+    """Parse the command line and list the features of one gene.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Accepted forms are
+    "genome gid outfile" and "genome acceptFile gid outfile".  Prints the
+    usage line and exits with status 1 when fewer than three arguments
+    follow the program name.
+    """
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 4:
+        print("usage: python %s genome [acceptFile] gid outfile\n" % argv[0])
+        sys.exit(1)
+
+    genome = argv[1]
+
+    # The accept file is optional.  Without this default the three-argument
+    # form reached the call below with acceptFileName unbound.
+    acceptFileName = ""
+    if len(argv) == 4:
+        gid = argv[2]
+        outfile = argv[3]
+    else:
+        acceptFileName = argv[2]
+        gid = argv[3]
+        outfile = argv[4]
+
+    listGeneFeatures(genome, gid, outfile, acceptFileName)
+
+
+def listGeneFeatures(genome, gid, outFileName, acceptFileName=""):
+    """Write the features of gene gid as a track file named outFileName.
+
+    The features come from getFeaturesByChromDict restricted to gid, merged
+    with the regions of acceptFileName when one is given.  The output starts
+    with a 'track name="LOC<gid>"' line followed by one tab-separated line
+    per feature: chr<chrom>, start, stop, feature type, 0, strand symbol.
+    """
+    hg = Genome(genome)
+    outfile = open(outFileName, "w")
+    additionalDict = {}
+    if acceptFileName:
+        additionalDict = getMergedRegions(acceptFileName, maxDist = 0, keepLabel = True, verbose = True)
+
+    featuresDict = getFeaturesByChromDict(hg, additionalDict, restrictList=[gid])
+    outfile.write('track name="LOC%s"\n' % gid)
+
+    # both sense spellings map onto the strand symbols written out
+    strandSymbol = {"F": "+", "R": "-", "+": "+", "-": "-"}
+
+    for chrom in featuresDict:
+        for feature in featuresDict[chrom]:
+            (start, stop, fgid, sense, ftype) = feature
+            fields = (chrom, start, stop, ftype, strandSymbol[sense])
+            outfile.write("chr%s\t%d\t%d\t%s\t0\t%s\n" % fields)
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys, os
+
+
+def getEdges(nodeList, shorten=False):
+    """Build an undirected adjacency dictionary from tab-separated edge lines.
+
+    Every entry of nodeList is expected to read "node1<TAB>node2<TAB>count";
+    entries that do not split into exactly three fields are ignored.  With
+    shorten, a node name containing "_" is replaced by the text between its
+    first and second underscore.
+
+    Returns a dict mapping each node to a list of (neighbour, count) tuples,
+    without duplicates, in the order first encountered.
+    """
+    edgeDict = {}
+
+    for nodeEntry in nodeList:
+        pieces = nodeEntry.strip().split("\t")
+        if len(pieces) != 3:
+            continue
+
+        node1, node2, count = pieces
+        if shorten:
+            parts1 = node1.split("_")
+            if len(parts1) > 1:
+                node1 = parts1[1]
+
+            parts2 = node2.split("_")
+            if len(parts2) > 1:
+                node2 = parts2[1]
+
+        weight = int(count)
+        # record the edge in both directions
+        for origin, neighbour in ((node1, node2), (node2, node1)):
+            detail = (neighbour, weight)
+            neighbours = edgeDict.setdefault(origin, [])
+            if detail not in neighbours:
+                neighbours.append(detail)
+
+    return edgeDict
+
+
+def getEdgesFromFile(inFileName, shorten=False):
+    """Open inFileName, pass its lines to getEdges, and return the edge dict."""
+    edgeSource = open(inFileName)
+    edges = getEdges(edgeSource, shorten)
+    edgeSource.close()
+
+    return edges
+
+
+def getOutputLine(currentNode, node, nodeCount):
+    """Format one Graphviz edge statement between currentNode and node.
+
+    The edge is labelled with nodeCount; counts above 2 are also used as the
+    pen width.
+    """
+    if not nodeCount > 2:
+        return '\t"%s" -- "%s" [ label = "%d", color="red", constraint=false] ; \n' % (currentNode, node, nodeCount)
+
+    return '\t"%s" -- "%s" [ label = "%d", penwidth=%d, color="red", constraint=false] ; \n' % (currentNode, node, nodeCount, nodeCount)
+
+
+# Positional arguments: the edge list to read and the prefix of every
+# generated file.
+infilename, outprefix = sys.argv[1], sys.argv[2]
+
+# "-shorten" anywhere on the command line turns on node-name shortening
+shorten = "-shorten" in sys.argv
+
+edgeDict = getEdgesFromFile(infilename, shorten)
+
+# State shared with visitNodes() while each connected tree is walked
+nodeList = edgeDict.keys()
+seenNodeDict = {}
+seenEdgeDict = {}
+currentNodeList = []
+currentEdgeList = []
+treeList = []
+localCount = []
+
+# One summary line per tree is appended to <outprefix>.stats
+outstat = open("%s.stats" % outprefix,"w")
+outstat.write("#gID\tnodes\tedges\tweight\n")
+
+def visitNodes(currentNode):
+    """Depth-first walk from currentNode, recording every new node and edge.
+
+    Works on the module-level edgeDict, seenNodeDict, seenEdgeDict,
+    currentNodeList, currentEdgeList and localCount: unseen neighbours are
+    appended to currentNodeList, one Graphviz line per unseen edge is
+    appended to currentEdgeList, and the edge count is added to
+    localCount[0].  Any exception raised by the recursive call is swallowed.
+    """
+    if currentNode in seenNodeDict:
+        return
+
+    seenNodeDict[currentNode] = []
+    for (node, nodeCount) in edgeDict[currentNode]:
+        # the sorted pair gives both directions of an edge the same key
+        edgeKey = str(sorted([node, currentNode]))
+        if edgeKey in seenEdgeDict:
+            continue
+
+        if node not in currentNodeList:
+            currentNodeList.append(node)
+
+        currentEdgeList.append(getOutputLine(currentNode, node, nodeCount))
+        seenEdgeDict[edgeKey] = 0
+        localCount[0] += nodeCount
+        try:
+            visitNodes(node)
+        except:
+            pass
+
+import subprocess
+
+# Walk every connected tree once, writing <outprefix>.<root>.gv for each and
+# one summary line per tree to the stats file.
+print("getting trees")
+for node in nodeList:
+    if node not in seenNodeDict:
+        currentNodeList = [node]
+        currentEdgeList = []
+        localCount = [0]
+        outfile = open("%s.%s.gv" % (outprefix, node), "w")
+        treeList.append(node)
+        outfile.write("graph g%s {\n" % node)
+        visitNodes(node)
+        currentNodeList.sort()
+        outfile.write('subgraph G0 {\n\t"%s" ' % currentNodeList[0])
+        for anode in currentNodeList[1:]:
+            outfile.write('-- "%s" ' % anode)
+
+        outfile.write(" [ weight = 100 ] ;\n\tordering = out ;\n}\n")
+        for line in currentEdgeList:
+            outfile.write(line)
+
+        outfile.write("}\n")
+        outfile.close()
+        outstat.write("%s\t%d\t%d\t%d\n" % (node, len(currentNodeList), len(currentEdgeList), localCount[0]))
+
+# Render each tree with Graphviz.  An argument list (no shell) keeps prefixes
+# and node names containing spaces or shell metacharacters from being
+# interpreted, and call() waits for dot so failures can be reported.
+print("generating pngs")
+for node in treeList:
+    gvName = "%s.%s.gv" % (outprefix, node)
+    pngName = "%s.%s.png" % (outprefix, node)
+    try:
+        status = subprocess.call(["dot", "-Tpng", gvName, "-o", pngName])
+    except OSError:
+        sys.stderr.write("could not run dot - is Graphviz installed?\n")
+        break
+
+    if status != 0:
+        sys.stderr.write("dot exited with status %d for %s\n" % (status, gvName))
+
+outstat.close()
\ No newline at end of file
--- /dev/null
+#
+# makeSNPtrack.py
+# ENRAGE
+#
+# This script maps all the qualified SNC sites on to the genome browser
+# Output format: bed
+# Written by: Wendy Lee
+# Written on: August 18th, 2008
+# Last Modified: December 14th, 2008 by Ali Mortazavi
+
+import sys
+
+def main(argv=None):
+    """Read snpfile, trackname and trackoutfile from argv and build the track.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Prints the usage line and exits with status 1
+    when fewer than three arguments follow the program name.
+    """
+    arguments = argv or sys.argv
+
+    print("%s: version 1.2" % arguments[0])
+
+    if len(arguments) < 4:
+        print("usage: python %s snpfile trackname trackoutfile" % arguments[0])
+        sys.exit(1)
+
+    makeSNPtrack(arguments[1], arguments[2], arguments[3])
+
+
+def makeSNPtrack(snpfilename, track, outfilename):
+    """Convert the SNP table in snpfilename into a BED track file."""
+    snpLines = open(snpfilename, "r")
+    writeSNPsBedfile(snpLines, track, outfilename)
+    snpLines.close()
+
+
+def writeSNPsBedfile(snpPropertiesList, track, outfilename):
+    """Write a track header plus one BED line per SNP record to outfilename.
+
+    snpPropertiesList is any iterable of text lines; lines rejected by
+    doNotProcessLine are skipped, the others are split on whitespace and
+    formatted by getBedOutputLine.
+    """
+    bedfile = open(outfilename, "w")
+    bedfile.write(getHeader(track))
+
+    for record in snpPropertiesList:
+        if not doNotProcessLine(record):
+            bedfile.write(getBedOutputLine(record.strip().split()))
+
+    bedfile.close()
+
+
+def getHeader(track):
+    """Return the browser track line, using track as both name and description."""
+    return "track name=%s description=%s visibility=2 itemRgb=\"On\"\n" % (track, track)
+
+
+def doNotProcessLine(line):
+    """Return True when line is a comment, i.e. starts with "#".
+
+    startswith() is used instead of indexing so that an empty string is
+    reported as a processable line rather than raising IndexError.
+    """
+    return line.startswith("#")
+
+
+def getBedOutputLine(snpPropertiesList):
+    """Format one SNP record (a list of column strings) as a BED line.
+
+    Column 2 is the chromosome, column 3 the position (the written start is
+    one less, the stop one base after the start) and column 7 the name,
+    which also selects the colour.  Score is always "0" and strand "+".
+    """
+    chromosome = snpPropertiesList[2]
+    zeroBasedStart = int(snpPropertiesList[3]) - 1
+    readName = snpPropertiesList[7]
+    columns = (chromosome, zeroBasedStart, zeroBasedStart + 1, readName, "0", "+", getSNPColor(readName))
+
+    return "%s\t%d\t%d\t%s\t%s\t%s\t-\t-\t\t%s\n" % columns
+
+
+def getSNPColor(readName):
+    """Return the "R, G, B" colour string used for a SNP name.
+
+    "A-G" and "T-C" have their own colours; every other name is coloured by
+    its last character, and the same default is used when that character is
+    not A, T, C or G or the name is empty.
+    """
+    defaultColor = "200, 0, 255"
+    specialColors = {"A-G": "255, 0, 0",
+                     "T-C": "0, 0, 255"
+    }
+    baseColor = {"A": "200, 0, 255",
+                 "T": "200, 0, 255",
+                 "C": "200, 0, 255",
+                 "G": "200, 0, 255"
+    }
+
+    if readName in specialColors:
+        return specialColors[readName]
+
+    # the slice yields "" for an empty name, which falls through to the default
+    return baseColor.get(readName[-1:], defaultColor)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# makebedfromrds.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 7/19/08.
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset
+
+PLUS_COLOR = "0,0,255"
+MINUS_COLOR = "255,0,0"
+MULTI_PLUS_COLOR = "64,64,64"
+MULTI_MINUS_COLOR = "192,192,192"
+SPLICE_COLOR = "255,0,0"
+UNIQUE_COLOR = "0,0,0"
+MULTI_COLOR = "128,128,128"
+
+
+def main(argv=None):
+    """Parse the command line and export an RDS dataset as a BED track.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Positional arguments are the track label, the
+    RDS file and the output BED file; a message is printed and the process
+    exits with status 1 when any of them is missing.  Giving --pairs turns on
+    pair output, and any --chrom restricts output to the listed chromosomes.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # "%prog" is only expanded by optparse, so build the banner by hand
+    print("%s: version 3.1" % argv[0])
+
+    usage = "usage: %prog trackLabel rdsFile bedFile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", type="int", dest="pairDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    # stored under the name that is read below; with dest="strand" the value
+    # given on the command line never reached outputBedFromRds
+    parser.add_option("--strand", dest="senseStrand")
+    parser.add_option("-r", "--region", dest="region", type="string",
+                      help="samtools region string")
+    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, doSpliceColor=False,
+                        pairDist=None, withFlag="", useFlagLike=False, enforceChr=False,
+                        senseStrand="", allChrom=True, doCache=False, cachePages=100000,
+                        chromList=[])
+    (options, args) = parser.parse_args(argv[1:])
+
+    # one message per positional argument, reported for the first one missing
+    missingMessages = ["no track specified - see --help for usage",
+                       "no RDS file specified - see --help for usage",
+                       "no output file specified - see --help for usage"]
+    for position, message in enumerate(missingMessages):
+        if len(args) <= position:
+            print(message)
+            sys.exit(1)
+
+    trackType, rdsfile, outfilename = args[:3]
+
+    doPairs = options.pairDist is not None
+
+    if options.chromList:
+        options.allChrom = False
+
+    outputBedFromRds(trackType, rdsfile, outfilename, options.withUniqs, options.withMulti,
+                     options.doSplices, options.doSpliceColor, doPairs, options.pairDist,
+                     options.withFlag, options.useFlagLike, options.enforceChr, options.senseStrand,
+                     options.allChrom, options.doCache, options.cachePages, options.chromList)
+
+
+def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=True,
+ doSplices=False, doSpliceColor=False, doPairs=False, pairDist=1000000,
+ withFlag="", useFlagLike=False, enforceChr=False, senseStrand="",
+ allChrom=True, doCache=False, cachePages=100000, chromList=[]):
+ # Export the reads of the RDS dataset rdsfile to outfilename as a BED track
+ # named trackType.  Each output line is produced by splitReadWrite(); with
+ # doPairs, two adjacent entries of the same read ID may be merged into one
+ # multi-block line.  At least one of withUniqs, withMulti, doSplices must be set.
+
+ if not withUniqs and not withMulti and not doSplices:
+ print "must be outputing at least one of uniqs, multi, or -splices - exiting"
+ sys.exit(1)
+
+ print "\nsample:"
+ RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+
+ #check that this is better than the dataset's default cache size
+ if cachePages > RDS.getDefaultCacheSize():
+ RDS.setDBcache(cachePages)
+
+ readlength = RDS.getReadSize()
+ # Negative lower bound for the gap test in the pairing branch below
+ # (minDist < rightpos - leftstop), i.e. mates may overlap by less than one read length.
+ minDist = -1 * readlength
+
+ # With allChrom the caller's chromList is replaced by the chromosome list of
+ # the first enabled read class (uniqs, then multi, then splices).
+ if allChrom:
+ if withUniqs:
+ chromList = RDS.getChromosomes()
+ elif withMulti:
+ chromList = RDS.getChromosomes(table="multi")
+ else:
+ chromList = RDS.getChromosomes(table="splices")
+
+ chromList.sort()
+
+ outfile = open(outfilename, "w")
+ outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % (trackType))
+
+ if withUniqs or withMulti:
+ for achrom in chromList:
+ # index counts the entries written for this chromosome; it restarts at 0 each time
+ index = 0
+ if doNotOutputChromosome(achrom, enforceChr):
+ continue
+
+ print "chromosome %s" % (achrom)
+
+ if doPairs:
+ # readIDDict=True is requested and the result's keys are used as read IDs below
+ hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag,
+ withWeight=True, withPairID=True, doUniqs=withUniqs,
+ doMulti=withMulti, readIDDict=True,
+ flagLike=useFlagLike, strand=senseStrand)
+
+ readIDList = hitDict.keys()
+ if doSplices:
+ spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag,
+ withPairID=True, readIDDict=True,
+ flagLike=useFlagLike, strand=senseStrand)
+
+ spliceIDList = spliceDict.keys()
+ # combDict is used as a set: its keys are the union of both ID lists
+ combDict = {}
+ for readID in readIDList:
+ combDict[readID] = 1
+
+ for readID in spliceIDList:
+ combDict[readID] = 1
+
+ combinedIDList = combDict.keys()
+ else:
+ combinedIDList = readIDList
+
+ for readID in combinedIDList:
+ localList = []
+ try:
+ localList = hitDict[readID]
+ except:
+ pass
+
+ if doSplices:
+ try:
+ localList += spliceDict[readID]
+ except:
+ pass
+
+ localList.sort()
+ listLen = len(localList) - 1
+ localIndex = 0
+ while localIndex <= listLen:
+ # A plain read unpacks as a 4-tuple; when that unpack fails the
+ # entry is read as the 6-tuple spliced form (two blocks, weight 1.0).
+ try:
+ (leftpos, leftsense, leftweight, lPairID) = localList[localIndex]
+ leftstop = leftpos + readlength - 1
+ lpart = 1
+ startList = [leftpos]
+ stopList = [leftstop]
+ except:
+ (leftpos, LLstop, LRstart, leftstop, leftsense, lPairID) = localList[localIndex]
+ leftweight = 1.0
+ lpart = 2
+ startList = [leftpos, LRstart]
+ stopList = [LLstop, leftstop]
+
+ # Look ahead at the next entry as a candidate right-hand mate
+ if localIndex < listLen:
+ try:
+ (rightpos, rightsense, rightweight, rPairID) = localList[localIndex + 1]
+ rightstop = rightpos + readlength - 1
+ rpart = 1
+ rstartList = [rightpos]
+ rstopList = [rightstop]
+ except:
+ (rightpos, RLstop, RRstart, rightstop, rightsense, rPairID) = localList[localIndex + 1]
+ rightweight = 1.0
+ rpart = 2
+ rstartList = [rightpos, RRstart]
+ rstopList = [RLstop, rightstop]
+ else:
+ # NOTE(review): rpart, rightweight and rPairID are not assigned on this
+ # path, yet the unpaired branch below passes rpart and rightweight to
+ # getSpliceColor when doSpliceColor is set - they are then unbound or
+ # left over from an earlier iteration.  Confirm and initialise here.
+ rightsense = "+"
+ rightpos = 0
+ rstartList = []
+ rstopList = []
+
+ # Merge into one line only for a "+" entry followed by a "-" entry whose
+ # gap lies strictly between minDist and pairDist and whose pair IDs differ.
+ if leftsense == "+" and rightsense == "-" and minDist < (rightpos - leftstop) < pairDist and lPairID != rPairID:
+ if doSpliceColor:
+ plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+ elif leftweight == 1.0 or rightweight == 1.0:
+ plusSenseColor = "0,0,0"
+ minusSenseColor = MINUS_COLOR
+ else:
+ plusSenseColor = "128,128,128"
+ minusSenseColor = MULTI_MINUS_COLOR
+
+ splitReadWrite(outfile, achrom, lpart + rpart, startList + rstartList, stopList + rstopList, "+", readID, plusSenseColor, minusSenseColor)
+ localIndex += 2
+ index += 2
+ else:
+ # Unpaired entry: written on its own.  The elif and else branches
+ # below assign identical values.
+ if doSpliceColor:
+ plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight)
+ outputSense = "+"
+ elif leftweight == 1.0:
+ plusSenseColor = PLUS_COLOR
+ minusSenseColor = MINUS_COLOR
+ outputSense = leftsense
+ else:
+ plusSenseColor = PLUS_COLOR
+ minusSenseColor = MINUS_COLOR
+ outputSense = leftsense
+
+ splitReadWrite(outfile, achrom, lpart, startList, stopList, outputSense, readID, plusSenseColor, minusSenseColor)
+ localIndex += 1
+ index += 1
+ else:
+ # Without pairing the result is indexed by chromosome (hitDict[achrom])
+ hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, withWeight=True, withID=True, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, flagLike=useFlagLike)
+ try:
+ for (pos, sense, weight, readID) in hitDict[achrom]:
+ splitReadWrite(outfile, achrom, 1, [pos], [pos + readlength - 1], sense, readID, PLUS_COLOR, MINUS_COLOR)
+ index += 1
+ except:
+ pass
+
+ if doSplices:
+ spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
+ if achrom not in spliceDict:
+ continue
+ for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+ splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
+ index += 1
+
+ # Splices only: one two-block line per spliced read
+ elif doSplices:
+ for achrom in chromList:
+ index = 0
+ if doNotOutputChromosome(achrom, enforceChr):
+ continue
+
+ print "chromosome %s" % (achrom)
+
+ spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
+ if achrom not in spliceDict:
+ continue
+ for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+ splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
+ index += 1
+
+ # NOTE(review): index is reset for every chromosome, so this is a
+ # per-chromosome count rather than a grand total - confirm the intent.
+ print index
+
+ outfile.close()
+
+
+def singleReadWrite(chrom, pos, sense, weight, readID, readlength, outfile):
+    """Write one space-separated BED-style line for a single unspliced read.
+
+    The interval runs from pos to pos + readlength - 1; the colour column
+    comes from getSenseColor(sense, weight).
+    """
+    fields = (chrom, pos, pos + readlength - 1, readID, weight, sense, getSenseColor(sense, weight))
+    outfile.write("%s %d %d %s %.1f %s 0 0 %s\n" % fields)
+
+
+def getSenseColor(sense, weight):
+    """Pick the strand colour: multiread palette when weight is below 1.0."""
+    if weight < 1.0:
+        return getMultiSenseColor(sense)
+
+    return getSingleSenseColor(sense)
+
+
+def getMultiSenseColor(sense):
+    """Return MULTI_PLUS_COLOR for sense "+", MULTI_MINUS_COLOR otherwise."""
+    if sense == "+":
+        return MULTI_PLUS_COLOR
+
+    return MULTI_MINUS_COLOR
+
+
+def getSingleSenseColor(sense):
+    """Return PLUS_COLOR for sense "+", MINUS_COLOR otherwise."""
+    if sense == "+":
+        return PLUS_COLOR
+
+    return MINUS_COLOR
+
+
+def splitReadWrite(outfile, chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
+    """Write one tab-separated multi-block BED line to outfile.
+
+    The line spans startList[0] to stopList[-1] and carries numPieces blocks
+    whose sizes and offsets come from getReadSizes / getReadCoords.  The
+    colour column is plusSense when rsense is "+", minusSense otherwise.
+    """
+    readSizes = getReadSizes(numPieces, startList, stopList)
+    readCoords = getReadCoords(numPieces, startList)
+
+    senseCode = minusSense
+    if rsense == "+":
+        senseCode = plusSense
+
+    fields = (chrom, startList[0], stopList[-1], readName, rsense, senseCode, numPieces, readSizes, readCoords)
+    outfile.write("%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % fields)
+
+
+def getReadSizes(numPieces, startList, stopList):
+    """Return the comma-separated block sizes (stop - start) of a read.
+
+    The first block is always included; blocks 1 .. numPieces - 1 follow.
+    """
+    sizes = ["%d" % (stopList[0] - startList[0])]
+    for piece in range(1, numPieces):
+        sizes.append("%d" % (stopList[piece] - startList[piece]))
+
+    return ",".join(sizes)
+
+
+def getReadCoords(numPieces, startList):
+    """Return the comma-separated block offsets relative to the first start.
+
+    The first offset is always "0"; blocks 1 .. numPieces - 1 follow.
+    """
+    offsets = ["0"]
+    for piece in range(1, numPieces):
+        offsets.append("%d" % (startList[piece] - startList[0]))
+
+    return ",".join(offsets)
+
+
+def getSpliceColor(lpart, rpart, leftweight, rightweight, hackType=None):
+    """Return a (colour, colour) pair classifying a read or read pair.
+
+    With hackType "1" both sides are considered: spliced when lpart + rpart
+    exceeds 2, unique when either weight equals 1.0.  Otherwise only the
+    left side is considered: spliced when lpart exceeds 1, unique when
+    leftweight equals 1.0.  Spliced takes precedence over unique, and
+    anything else gets the multiread colour.  Both returned values are the
+    same colour.
+    """
+    if hackType == "1":
+        isSpliced = (lpart + rpart) > 2
+        isUnique = leftweight == 1.0 or rightweight == 1.0
+    else:
+        isSpliced = lpart > 1
+        isUnique = leftweight == 1.0
+
+    if isSpliced:
+        color = SPLICE_COLOR
+    elif isUnique:
+        color = UNIQUE_COLOR
+    else:
+        color = MULTI_COLOR
+
+    return color, color
+
+
+def doNotOutputChromosome(achrom, enforceChr):
+    """Return True when chromosome achrom must be left out of the output.
+
+    "chrM" is always excluded; with enforceChr, so is any name that does not
+    contain "chr".
+    """
+    if achrom == "chrM":
+        return True
+
+    if enforceChr and ("chr" not in achrom):
+        return True
+
+    return False
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# makerdsfrombed.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 6/21/08.
+#
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, string, optparse
+from commoncode import readDataset, writeLog
+
+# "%prog" is an optparse placeholder, not a % format: applying the % operator
+# to it raised ValueError (unsupported format character 'p') on import.
+verstring = "%s: version 2.1" % sys.argv[0]
+print(verstring)
+
+
+def main(argv=None):
+    """Parse the command line and import a BED file into an RDS dataset.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Positional arguments are label, bedfile and
+    outrdsfile; any argument containing "::" is additionally recorded as a
+    (propertyName, propertyValue) pair.  Prints the usage text and exits with
+    status 1 when fewer than three positional arguments are given.
+    """
+    argv = argv or sys.argv
+
+    usage = "usage: python %prog label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--RNA", action="store_true", dest="rnaDataType")
+    parser.set_defaults(init=True, rnaDataType=False, doIndex=False, cachePages=100000)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print(usage)
+        print("\ntreats all imported reads as uniquely mapped\n")
+        sys.exit(1)
+
+    label, filename, outdbname = args[0], args[1], args[2]
+
+    dataType = "DNA"
+    if options.rnaDataType:
+        dataType = "RNA"
+
+    propertyList = []
+    for argument in args:
+        if "::" not in argument:
+            continue
+
+        (pname, pvalue) = argument.strip().split("::")
+        propertyList.append((pname, pvalue))
+
+    makerdsfrombed(label, filename, outdbname, options.init, dataType, options.doIndex, options.cachePages, propertyList)
+
+
+def makerdsfrombed(label, filename, outdbname, init=True, dataType="DNA", doIndex=False, cachePages=100000, propertyList=[]):
+ # Import every data line of the BED file `filename` into the RDS dataset
+ # `outdbname` through rds.insertUniqs(), with weight 1.0 and read IDs of the
+ # form "<label>-<n>".  init=False appends to an existing dataset (its index is
+ # dropped first); doIndex rebuilds the index afterwards.
+ readsize = 0
+ padsize = 0
+ index = 0
+ # reads are handed to insertUniqs() in batches governed by this interval
+ insertSize = 100000
+
+ writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
+
+ # NOTE(review): infile is never closed in this function.
+ infile = open(filename,"r")
+
+ rds = readDataset(outdbname, init, dataType, verbose=True)
+ if not init:
+ rds.dropIndex()
+
+ #check that our cacheSize is better than the dataset's default cache size
+ defaultCacheSize = rds.getDefaultCacheSize()
+ if cachePages > defaultCacheSize:
+ if init:
+ rds.setDBcache(cachePages, default=True)
+ else:
+ rds.setDBcache(cachePages)
+
+ if len(propertyList) > 0:
+ rds.insertMetadata(propertyList)
+
+ insertList = []
+ for line in infile:
+ # skips any line containing the substring "track", wherever it occurs
+ if "track" in line:
+ continue
+
+ fields = line.split()
+ # the read size is derived once, from the first data line: |start - stop|,
+ # recorded in the metadata as that value plus one
+ if readsize == 0:
+ readsize = abs(int(fields[1]) - int(fields[2]))
+ if init:
+ rds.insertMetadata([("readsize", readsize+1)])
+ rds.insertMetadata([("imported_from_bed", "True")])
+
+ chrom = fields[0]
+ start = int(fields[1])
+ stop = int(fields[2])
+ sense = fields[5]
+ readID = "%s-%s" % (label, str(index))
+ insertList.append((readID, chrom, start, stop, sense, 1.0, "", ""))
+ # index is 0 for the first read, so the first flush holds a single entry
+ if index % insertSize == 0:
+ rds.insertUniqs(insertList)
+ insertList = []
+ print ".",
+ sys.stdout.flush()
+
+ index += 1
+
+ # flush whatever is left over from the last partial batch
+ if len(insertList) > 0:
+ rds.insertUniqs(insertList)
+
+ countString = "%d unique reads" % index
+ print countString
+
+ writeLog(outdbname + ".log", verstring, countString)
+
+ if doIndex:
+ print "building index...."
+ if cachePages > defaultCacheSize:
+ rds.setDBcache(cachePages)
+ rds.buildIndex(cachePages)
+ else:
+ rds.buildIndex(defaultCacheSize)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# makerdsfromblat.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 12/7/08.
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, string, optparse
+from commoncode import readDataset, writeLog
+
+verstring = "%prog: version 3.9"
+print verstring
+
+def main(argv=None):
+    """Parse the command line and import BLAT output into an RDS dataset.
+
+    argv is the full argument vector, program name first; sys.argv is used
+    when it is empty or None.  Positional arguments are label, infilename and
+    outrdsfile; any argument containing "::" is additionally recorded as a
+    (propertyName, propertyValue) pair.  The data type is "RNA" when --RNA
+    names a gene data file and "DNA" otherwise, and --flag tags reads with
+    "blat".  Prints the usage string and exits with status 1 when fewer than
+    three positional arguments are given.
+    """
+    argv = argv or sys.argv
+
+    usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
+    parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
+    parser.add_option("--flag", action="store_true", dest="flagReads")
+    parser.add_option("--strict", type="int", dest="minSpliceLength",
+                      help="min required bp on each side of a splice")
+    parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--RNA", dest="geneDataFileName")
+    parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="")
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print(usage)
+        sys.exit(1)
+
+    label, filename, outdbname = args[0], args[1], args[2]
+
+    dataType = "DNA"
+    if options.geneDataFileName:
+        dataType = "RNA"
+
+    theFlag = ""
+    if options.flagReads:
+        theFlag = "blat"
+
+    propertyList = []
+    for argument in args:
+        if "::" not in argument:
+            continue
+
+        (pname, pvalue) = argument.strip().split("::")
+        propertyList.append((pname, pvalue))
+
+    makerdsfromblat(label, filename, outdbname, dataType, options.init,
+                    options.doIndex, options.trimReadID, options.minSpliceLength,
+                    options.forceRNA, theFlag, options.spliceOnly, options.verbose,
+                    options.cachePages, options.geneDataFileName, propertyList)
+
+
+def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,  # label: prefix added to trimmed read IDs; filename: alignment text input; outdbname: dataset name passed to readDataset
+ doIndex=False,trimReadID=True, minSpliceLength=0,  # trimReadID: drop the text before the first ":" of each read ID; minSpliceLength: minimum length required of every block in a multi-block alignment
+ forceRNA=False, theFlag="", spliceOnly=False,  # forceRNA: force dataType to "RNA" without loading gene models; theFlag: string included in every inserted read tuple; spliceOnly: do not record unspliced unique reads or multireads
+ verbose=False, cachePages=100000, geneDataFileName="",  # geneDataFileName: tab-separated gene model file, parsed when dataType is "RNA" and forceRNA is off
+ propertyList=[]):  # propertyList: (name, value) pairs handed to rds.insertMetadata
+ # Reads alignment lines from `filename` (column layout presumably BLAT PSL with sequences - TODO confirm), groups consecutive lines by read ID, keeps each read's top-scoring alignments and stores them via rds.insertUniqs / insertMulti / insertSplices.
+ delimiter = "|"  # separator inside splice reference names: model|spliceID|regionStart
+ minIntron = 10  # a splice is kept only if stopL + minIntron < startR
+ maxBorder = 0
+ index = 0
+ insertSize = 100000  # pending inserts are flushed whenever the read counter is a multiple of this
+
+ if forceRNA:
+ print "forcing datatype to RNA"
+ dataType = "RNA"
+
+ if dataType == "RNA":
+ genedatafile = open(geneDataFileName)  # NOTE(review): with forceRNA set and geneDataFileName left at "", this open() is still attempted - confirm callers always pass a file
+
+ writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))  # logs the process command line (sys.argv), not this function's arguments
+
+ geneDict = {}  # model name -> (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+ mapDict = {}  # filled with empty lists below; not read anywhere else in this function
+ if dataType == "RNA" and not forceRNA:
+ for line in genedatafile:
+ fields = line.strip().split("\t")
+ blockCount = int(fields[7])
+ if blockCount < 2:  # models with fewer than two blocks cannot contribute splices
+ continue
+
+ uname = fields[0]
+ chrom = fields[1]
+ sense = fields[2]
+ chromstarts = fields[8][:-1].split(",")  # [:-1] drops the final character (presumably a trailing comma) before splitting
+ chromstops = fields[9][:-1].split(",")
+ exonLengths = []
+ totalLength = 0
+ for index in range(blockCount):  # reuses `index`; it is reset to 0 before the main read loop
+ chromstarts[index] = int(chromstarts[index])
+ chromstops[index] = int(chromstops[index])
+ exonLengths.append(chromstops[index] - chromstarts[index])
+ totalLength += exonLengths[index]
+
+ geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+ mapDict[uname] = []
+
+ genedatafile.close()
+
+ rds = readDataset(outdbname, init, dataType, verbose=True)  # verbose is hard-coded True here, independent of the `verbose` argument
+
+ #check that our cacheSize is better than the dataset's default cache size
+ defaultCacheSize = rds.getDefaultCacheSize()
+ if cachePages > defaultCacheSize:
+ if init:
+ rds.setDBcache(cachePages, default=True)
+ else:
+ rds.setDBcache(cachePages)
+
+ if not init and doIndex:  # appending and re-indexing: try to drop any existing index first
+ try:
+ if rds.hasIndex():
+ rds.dropIndex()
+ except:  # bare except: every failure here is reduced to the optional message below
+ if verbose:
+ print "couldn't drop Index"
+
+
+ if len(propertyList) > 0:
+ rds.insertMetadata(propertyList)
+
+ # make some assumptions based on first read
+ infile = open(filename, "r")
+ for arg in range(6):  # reads six lines; `line` ends up holding the sixth, i.e. the first line after the five treated as headers below
+ line = infile.readline()
+
+ fields = line.split()
+ readsize = int(fields[10])  # column 10 of the sample line is taken as the read length for the whole file
+ pairedTest = fields[9][-2:]  # last two characters of the read ID in column 9
+ paired = False
+ if pairedTest in ["/1", "/2"]:
+ print "assuming reads are paired"
+ paired = True
+
+ print "read size: %d bp" % readsize
+ if init:
+ rds.insertMetadata([("readsize", readsize)])
+ if paired:
+ rds.insertMetadata([("paired", "True")])
+
+ infile.close()
+ if "blat_mapped" not in rds.getMetadata():
+ rds.insertMetadata([("blat_mapped", "True")])
+
+ minReadScore = readsize - readsize/25 - 1  # a read is kept only if its best score exceeds this; "/" is integer division under Python 2
+ trim = -4
+ if dataType == "RNA":
+ maxBorder = readsize + trim
+
+ infile = open(filename, "r")  # NOTE(review): no close() for this second handle is visible in the function
+ prevID = ""
+ readList = []  # alignments collected for the read ID currently being accumulated
+ uInsertList = []
+ mInsertList = []
+ sInsertList = []
+ index = uIndex = mIndex = sIndex = lIndex = 0  # counters: reads started, unique reads, multireads, spliced reads, lines
+ bestScore = 0
+ # skip headers
+ for arg in range(5):
+ line = infile.readline()
+
+ for line in infile:
+ lIndex += 1
+ fields = line.strip().split()
+ readID = fields[9]
+ if trimReadID:
+ readID = string.join(readID.split(":")[1:], ":")
+
+ if readID != prevID:  # a new read ID starts: finalize what was collected for the previous one
+ newReadList = []
+ if bestScore > minReadScore:
+ for readData in readList:
+ if readData[1] == bestScore:  # keep only alignments tied for the best score
+ newReadList.append(readData)
+
+ if trimReadID:
+ prevID = label + "-" + prevID
+
+ listlen = len(newReadList)
+ if listlen == 1:
+ parts = int(newReadList[0][0])
+ if parts == 1 and not spliceOnly:
+ (part, score, sense, chrom, start, mismatches) = newReadList[0]
+ stop = start + readsize
+ uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
+ uIndex += 1
+ elif forceRNA and parts == 2:
+ (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
+ startL = int(startList[0])
+ stopL = startL + int(lengthList[0])
+ startR = int(startList[1])
+ stopR = startR + int(lengthList[1])
+ if stopL + minIntron < startR:
+ sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))  # NOTE(review): this branch unpacks `mismatchList` but appends `mismatches`, which still holds a value from earlier processing - confirm which is intended
+ sIndex += 1
+ elif parts == 2:
+ print newReadList  # NOTE(review): looks like leftover debug output
+ (part, score, sense, chrom, start, mismatches) = newReadList[0]  # NOTE(review): multi-block entries are appended to readList as 7-tuples, so this 6-name unpack looks like it would raise - confirm
+ currentSplice = chrom
+ (model, spliceID, regionStart) = currentSplice.split(delimiter)
+ if model not in geneDict:
+ print fields
+ continue  # NOTE(review): appears to jump to the next input line, skipping the reset of readList/prevID and the current line's alignment - confirm
+
+ (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+ spliceID = int(spliceID)
+ rstart = int(start) - 2
+ lefthalf = maxBorder - rstart  # number of read bases assigned to the left side of the splice
+ if lefthalf < 1 or lefthalf > maxBorder:
+ continue
+
+ righthalf = readsize - lefthalf
+ startL = int(regionStart) + rstart
+ stopL = startL + lefthalf
+ startR = chromstarts[spliceID + 1]
+ stopR = chromstarts[spliceID + 1] + righthalf
+ if stopL + minIntron < startR:
+ sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
+ sIndex += 1
+ elif listlen > 1 and not spliceOnly:
+ prevID = prevID + "::" + str(listlen)  # multiread IDs carry the number of tied alignments
+ mIndex += 1
+ # ignore multireads that can also map across splices
+ skip = False
+ for readData in newReadList:
+ if readData[0] > 1:
+ skip = True
+
+ if not skip:
+ for (part, score, sense, chrom, start, mismatches) in newReadList:
+ stop = start + readsize
+ mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))  # weight is split evenly across the tied alignments
+ else:
+ prevID = readID
+
+ if index % insertSize == 0:  # periodic flush of the pending insert lists
+ rds.insertUniqs(uInsertList)
+ rds.insertMulti(mInsertList)
+ uInsertList = []
+ mInsertList = []
+ if dataType == "RNA":
+ rds.insertSplices(sInsertList)
+ sInsertList = []
+
+ print ".",
+ sys.stdout.flush()
+
+ # start processing new read
+ readList = []
+ prevID = readID
+ bestScore = 0
+ index += 1
+
+ # add the new read
+ score = int(fields[0])
+ sense = fields[8]
+ chrom = fields[13]
+ parts = int(fields[17])  # number of alignment blocks
+ passStrict = True
+ if parts > 1:
+ lengthList = fields[18][:-1].split(",")
+ startList = fields[20][:-1].split(",")
+ listlen = len(lengthList)
+ for lpos in range(listlen):
+ if int(lengthList[lpos]) < minSpliceLength:
+ passStrict = False
+
+ # throw out deletions, for now
+ if lpos > 0:
+ if int(lengthList[lpos - 1]) == int(startList[lpos]):  # NOTE(review): compares a block length with a block start - confirm this is the intended test
+ passStrict = False
+ pass
+ else:
+ start = int(fields[15])
+
+ if passStrict:
+ if score > bestScore:
+ bestScore = score
+
+ mismatches = ""
+ if int(fields[1]) > 0:  # column 1 > 0: derive a mismatch string from the last two columns
+ try:
+ mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
+ except:  # bare except: any decoding failure yields an empty mismatch string
+ mismatches = ""
+
+ if parts == 1:
+ readList.append((parts, score, sense, chrom, start, mismatches))
+ else:
+ readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
+
+ if lIndex % 1000000 == 0:
+ print "processed %d lines" % lIndex
+
+ print "%d lines processed" % lIndex
+ # NOTE(review): alignments still held in readList for the final read ID do not appear to be processed after the loop - confirm whether dropping the last read is intended
+ if len(uInsertList) > 0:
+ rds.insertUniqs(uInsertList)
+ if len(mInsertList) > 0:
+ rds.insertMulti(mInsertList)
+ if len(sInsertList) > 0:
+ rds.insertSplices(sInsertList)
+
+ combString = "%d unique reads" % uIndex
+ combString += "\t%d multi reads" % mIndex
+ if dataType == "RNA":
+ combString += "\t%d spliced reads" % sIndex
+
+ print
+ print combString.replace("\t", "\n")
+
+ writeLog(outdbname + ".log", verstring, combString)
+
+ if doIndex:
+ print "building index...."
+ if cachePages > defaultCacheSize:
+ rds.setDBcache(cachePages)
+ rds.buildIndex(cachePages)
+ else:
+ rds.buildIndex(defaultCacheSize)
+
+
def decodeMismatches(gString, rString, rsense):
    """Describe the differences between an aligned genome and read string.

    gString: genomic text of the alignment; alignment blocks are separated
             by "," characters.
    rString: read text laid out position-for-position like gString.
    rsense:  strand of the read; accepted for interface compatibility but
             not used.

    Returns a comma-joined string of "<readNT><pos><genomeNT>" items, one per
    differing position, or "" when the strings agree. pos is 1-based (eland
    compatible) and counts nucleotides only: block separators seen so far
    are subtracted.

    Raises IndexError when rString is shorter than gString.
    """
    output = []
    partIndex = 0
    for rindex, genNT in enumerate(gString):
        # The separator test must look at the current character; comparing
        # the whole string to "," never matched, so positions after the
        # first block were shifted by one per separator.
        if genNT == ",":
            partIndex += 1

        readNT = rString[rindex]
        if genNT == readNT:
            continue

        # for eland-compatibility, we are 1-based
        output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))

    return ",".join(output)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# makerdsfrombowtie.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 10/20/08.
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, string, optparse
+from commoncode import readDataset, writeLog
+
+verstring = "%prog: version 4.1"
+print verstring
+
def main(argv=None):
    """Command-line entry point for makerdsfrombowtie.

    argv: full argument vector with the program name at index 0; when it is
          None or empty, sys.argv is used instead.

    Expects three positional arguments (label, infilename, outrdsfile) and
    exits with status 1 after printing the usage line when fewer are given.
    Any positional argument containing "::" is additionally treated as a
    propertyName::propertyValue pair and forwarded as metadata.
    """
    if not argv:
        argv = sys.argv

    usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--RNA", dest="genedatafilename")
    parser.add_option("--append", action="store_false", dest="init")
    parser.add_option("--index", action="store_true", dest="doIndex")
    parser.add_option("--spacer", type="int", dest="spacer")
    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
    parser.add_option("--forcepair", type="int", dest="forceID")
    parser.add_option("--flip", action="store_true", dest="flip")
    parser.add_option("--verbose", action="store_true", dest="verbose")
    parser.add_option("--strip", action="store_true", dest="stripSpace")
    parser.add_option("--cache", type="int", dest="cachePages")
    parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2,
                        trimReadID=True, forceID=None, flip=False, verbose=False,
                        stripSpace=False, cachePages=100000)

    (options, args) = parser.parse_args(argv[1:])

    if len(args) < 3:
        # print_usage() expands the %prog placeholder, which printing the raw
        # usage string did not.
        parser.print_usage()
        sys.exit(1)

    label = args[0]
    filename = args[1]
    outdbname = args[2]

    propertyList = []
    for arg in args:
        if "::" in arg:
            # split on the first "::" only, so a value may itself contain "::"
            (pname, pvalue) = arg.strip().split("::", 1)
            propertyList.append((pname, pvalue))

    makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
                      options.doIndex, options.spacer, options.trimReadID, options.forceID,
                      options.flip, options.verbose, options.stripSpace, options.cachePages,
                      propertyList)
+
+
+def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,  # label: prefix added to trimmed read IDs; filename: alignment text input; outdbname: dataset name passed to readDataset; genedatafilename: gene model file, its presence selects RNA mode
+ doIndex=False, spacer=2, trimReadID=True, forceID=None,  # spacer: subtracted from splice-relative starts; trimReadID: drop the text before the first ":" of each read ID; forceID: when given, reads are treated as paired and "/<forceID>" is appended to IDs
+ flip=False, verbose=False, stripSpace=False, cachePages=100000,  # flip: invert the reported strand; stripSpace: remove all space characters from each line before splitting
+ propertyList=[]):  # propertyList: (name, value) pairs handed to rds.insertMetadata
+ # Reads alignment lines from `filename` (presumably bowtie's default output - TODO confirm), groups consecutive lines by read ID and stores each read via rds.insertUniqs / insertMulti / insertSplices.
+ delimiter = "|"  # separator inside splice reference names: model|spliceID|regionStart
+
+ dataType = "DNA"
+ if genedatafilename is not None:
+ dataType = "RNA"
+ genedatafile = open(genedatafilename)
+
+
+ forcePair = False
+ if forceID is not None:
+ forcePair = True
+ else:
+ forceID = 0
+
+ maxBorder = 0
+ index = 0
+ insertSize = 100000  # pending inserts are flushed whenever the read counter is a multiple of this
+
+ writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))  # logs the process command line (sys.argv), not this function's arguments
+
+ geneDict = {}  # model name -> (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+ mapDict = {}  # filled with empty lists below; not read anywhere else in this function
+ if dataType == "RNA":
+ for line in genedatafile:
+ fields = line.strip().split("\t")
+ blockCount = int(fields[7])
+ if blockCount < 2:  # models with fewer than two blocks cannot contribute splices
+ continue
+
+ uname = fields[0]
+ chrom = fields[1]
+ sense = fields[2]
+ chromstarts = fields[8][:-1].split(",")  # [:-1] drops the final character (presumably a trailing comma) before splitting
+ chromstops = fields[9][:-1].split(",")
+ exonLengths = []
+ totalLength = 0
+ for index in range(blockCount):  # reuses `index`; it is reset to 0 before the main read loop
+ chromstarts[index] = int(chromstarts[index])
+ chromstops[index] = int(chromstops[index])
+ exonLengths.append(chromstops[index] - chromstarts[index])
+ totalLength += exonLengths[index]
+
+ geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+ mapDict[uname] = []
+
+ genedatafile.close()
+
+ rds = readDataset(outdbname, init, dataType, verbose=True)  # verbose is hard-coded True here, independent of the `verbose` argument
+
+ #check that our cacheSize is better than the dataset's default cache size
+ defaultCacheSize = rds.getDefaultCacheSize()
+ if cachePages > defaultCacheSize:
+ if init:
+ rds.setDBcache(cachePages, default=True)
+ else:
+ rds.setDBcache(cachePages)
+
+ if not init and doIndex:  # appending and re-indexing: try to drop any existing index first
+ try:
+ if rds.hasIndex():
+ rds.dropIndex()
+ except:  # bare except: every failure here is reduced to the optional message below
+ if verbose:
+ print "couldn't drop Index"
+
+ if len(propertyList) > 0:
+ rds.insertMetadata(propertyList)
+
+ # make some assumptions based on first read
+ infile = open(filename, "r")
+ line = infile.readline()
+ if stripSpace:
+ line = line.replace(" ","")
+
+ fields = line.split()
+ readsize = len(fields[5])  # length of column 5 of the first line is taken as the read length for the whole file
+ pairedTest = fields[0][-2:]  # last two characters of the read ID in column 0
+ paired = False
+ if pairedTest in ["/1", "/2"] or forcePair:
+ print "assuming reads are paired"
+ paired = True
+
+
+ print "read size: %d bp" % readsize
+ if init:
+ rds.insertMetadata([("readsize", readsize)])
+ if paired:
+ rds.insertMetadata([("paired", "True")])
+
+ if "bowtie_mapped" not in rds.getMetadata():
+ rds.insertMetadata([("bowtie_mapped", "True")])
+
+ if dataType == "RNA" and "spacer" not in rds.getMetadata():
+ rds.insertMetadata([("spacer", spacer)])
+
+ infile.close()
+
+ trim = -4
+ if dataType == "RNA":
+ maxBorder = readsize + trim
+
+ infile = open(filename, "r")  # NOTE(review): no close() for this second handle is visible in the function
+ prevID = ""
+ readList = []  # alignments collected for the read ID currently being accumulated
+ uInsertList = []
+ mInsertList = []
+ sInsertList = []
+ index = uIndex = mIndex = sIndex = lIndex = 0  # counters: reads started, unique reads, multireads, spliced reads, lines
+ for line in infile:
+ lIndex += 1
+ if stripSpace:
+ line = line.replace(" ","")
+
+ fields = line.strip().split()
+ readID = fields[0]
+ if trimReadID:
+ readID = string.join(readID.split(":")[1:], ":")
+
+ if readID != prevID:  # a new read ID starts: finalize what was collected for the previous one
+ listlen = len(readList)
+ if trimReadID:
+ prevID = "%s-%s" % (label, prevID)
+
+ if forcePair:
+ prevID += "/%d" % forceID
+
+ if listlen == 1:
+ (sense, chrom, start, mismatches) = readList[0]
+ if flip:
+ if sense == "+":
+ sense = "-"
+ else:
+ sense = "+"
+
+ if "|" not in chrom:  # reference names without "|" are stored as plain unique reads; the literal "|" is used here rather than `delimiter`
+ stop = start + readsize
+ uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
+ uIndex += 1
+ elif dataType == "RNA":
+ currentSplice = chrom
+ (model, spliceID, regionStart) = currentSplice.split(delimiter)
+ if model not in geneDict:
+ prevID = readID  # unknown model: nothing is inserted for this read
+ else:
+ (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+ spliceID = int(spliceID)
+ rstart = int(start) - spacer
+ lefthalf = maxBorder - rstart  # number of read bases assigned to the left side of the splice
+ if lefthalf < 1 or lefthalf > maxBorder:
+ prevID = readID  # out-of-range split: nothing is inserted for this read
+ else:
+ righthalf = readsize - lefthalf
+ startL = int(regionStart) + rstart
+ stopL = startL + lefthalf
+ startR = chromstarts[spliceID + 1]
+ stopR = chromstarts[spliceID + 1] + righthalf
+ sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
+ sIndex += 1
+ elif listlen > 1:
+ prevID = "%s::%s" % (prevID, str(listlen))  # multiread IDs carry the number of alignments
+ mIndex += 1
+ # ignore multireads that can also map across splices
+ skip = False
+ for (sense, chrom, start, mismatches) in readList:
+ if "|" in chrom:
+ skip = True
+
+ if not skip:
+ for (sense, chrom, start, mismatches) in readList:
+ stop = start + readsize
+ if flip:
+ if sense == "+":
+ sense = "-"
+ else:
+ sense = "+"
+
+ mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))  # weight is split evenly across the alignments
+ else:
+ prevID = readID
+
+ if index % insertSize == 0:  # periodic flush of the pending insert lists
+ rds.insertUniqs(uInsertList)
+ rds.insertMulti(mInsertList)
+ uInsertList = []
+ mInsertList = []
+ if dataType == "RNA":
+ rds.insertSplices(sInsertList)
+ sInsertList = []
+
+ print ".",
+ sys.stdout.flush()
+
+ # start processing new read
+ readList = []
+ prevID = readID
+ index += 1
+
+ # add the new read
+ sense = fields[1]
+ chrom = fields[2]
+ # for eland compat, we are 1-based
+ start = int(fields[3]) + 1
+ mismatches = ""
+ if ":" in fields[-1]:  # a ":" in the last column is taken to mean it holds mismatch descriptors
+ mismatches = decodeMismatches(fields[-1], sense)
+
+ readList.append((sense, chrom, start, mismatches))
+ if lIndex % 1000000 == 0:
+ print "processed %d lines" % lIndex
+
+ print "%d lines processed" % lIndex
+ # NOTE(review): alignments still held in readList for the final read ID do not appear to be processed after the loop - confirm whether dropping the last read is intended
+ if len(uInsertList) > 0:
+ rds.insertUniqs(uInsertList)
+
+ if len(mInsertList) > 0:
+ rds.insertMulti(mInsertList)
+
+ if len(sInsertList) > 0:
+ rds.insertSplices(sInsertList)
+
+ combString = "%d unique reads" % uIndex
+ combString += "\t%d multi reads" % mIndex
+ if dataType == "RNA":
+ combString += "\t%d spliced reads" % sIndex
+
+ print
+ print combString.replace("\t", "\n")
+
+ writeLog("%s.log" % outdbname, verstring, combString)
+
+ if doIndex:
+ print "building index...."
+ if cachePages > defaultCacheSize:
+ rds.setDBcache(cachePages)
+ rds.buildIndex(cachePages)
+ else:
+ rds.buildIndex(defaultCacheSize)
+
+
def decodeMismatches(mString, rsense,
                     _complement={"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}):
    """Convert "pos:genomeNT>readNT" mismatch descriptors to eland style.

    mString: comma-separated descriptors such as "5:A>C,12:G>T" with 0-based
             positions. Empty entries (an empty string, a trailing comma)
             are ignored.
    rsense:  strand of the read; for "-" both nucleotides of every
             descriptor are complemented.

    Returns a comma-joined string of "<readNT><pos + 1><genomeNT>" items, or
    "" when there is nothing to report.

    Raises ValueError for a malformed descriptor and KeyError when a
    nucleotide outside A/C/G/T/N has to be complemented.
    """
    output = []
    for mismatch in mString.split(","):
        if not mismatch.strip():
            continue

        (pos, change) = mismatch.split(":")
        (genNT, readNT) = change.split(">")
        if rsense == "-":
            readNT = _complement[readNT]
            genNT = _complement[genNT]

        # eland-compatible positions are 1-based
        elandCompatiblePos = int(pos) + 1
        output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))

    return ",".join(output)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# makerdsfromeland2.py
+# ENRAGE
+#
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, string, optparse
+from commoncode import readDataset
+
def main(argv=None):
    """Command-line entry point for makeRDSFromEland2.

    argv: full argument vector with the program name at index 0; when it is
          None or empty, sys.argv is used instead.

    Expects three positional arguments (label, infilename, outrdsfile) and
    exits with status 1 after printing the usage text when fewer are given.
    Exits with status -1 when --paired is given a value other than 1 or 2.
    """
    if not argv:
        argv = sys.argv

    verstring = "%prog: version 3.4"

    usage = ("usage: %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
             "\ninput reads must be sorted to properly record multireads")

    parser = optparse.OptionParser(usage=usage)
    # expand %prog so the banner names the script instead of the placeholder
    print(verstring.replace("%prog", parser.get_prog_name()))

    parser.add_option("--append", action="store_false", dest="init",
                      help="append to existing rds file [default: create new]")
    parser.add_option("--RNA", dest="geneDataFileName",
                      help="set data type to RNA [default: DNA]")
    parser.add_option("--index", action="store_true", dest="doIndex",
                      help="index the output rds file")
    parser.add_option("--cache", type="int", dest="cachePages",
                      help="number of cache pages to use [default: 100000]")
    parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
                      help="use : as the delimiter")
    parser.add_option("--paired", dest="pairID",
                      help="pairID value")
    parser.add_option("--extended", action="store_true", dest="extended",
                      help="use eland_extended input")
    parser.add_option("--verbose", action="store_true", dest="verbose")
    parser.add_option("--maxlines", type="int", dest="maxLines",
                      help="[default: 1000000000]")
    parser.set_defaults(init=True, doIndex=False, cachePages=100000,
                        geneDataFileName=None, useOldDelimiter=False,
                        pairID=None, maxLines=1000000000, extended=False,
                        verbose=False)
    (options, args) = parser.parse_args(argv[1:])

    if len(args) < 3:
        # print_usage() expands the %prog placeholder
        parser.print_usage()
        sys.exit(1)

    label = args[0]
    filename = args[1]
    outdbname = args[2]

    delimiter = '|'
    if options.useOldDelimiter:
        delimiter = ':'

    paired = False
    pairID = '1'
    if options.pairID is not None:
        paired = True
        if options.pairID not in ['1', '2']:
            print('pairID value must be 1 or 2')
            sys.exit(-1)

        # previously the default '1' was reported even when --paired 2 was given
        pairID = options.pairID
        print('Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID))

    # supplying a gene model file is what switches the run to RNA mode
    dataType = 'DNA'
    if options.geneDataFileName is not None:
        dataType = 'RNA'

    makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter,
                      paired, options.init, pairID, dataType,
                      options.geneDataFileName, options.cachePages,
                      options.maxLines, options.extended, options.verbose)
+
+
+def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False, init=True, pairID="1", dataType="DNA", geneDataFileName=None, cachePages=100000, maxLines=1000000000, extended=False, verbose=False):  # Makes up to three passes over `filename` (unique reads, splices when dataType is 'RNA', multireads) and stores the results via rds.insertUniqs / insertSplices / insertMulti; at most maxLines lines are read per pass
+ maxBorder = 0
+ index = 0
+ insertSize = 100000  # pending inserts are flushed whenever the running counter is a multiple of this
+
+ geneDict = {}  # model name -> (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+ mapDict = {}  # filled with empty lists below; not read anywhere else in this function
+ seenSpliceList = []  # distinct splice names encountered, used only for the summary count
+ if dataType == 'RNA':
+ genedatafile = open(geneDataFileName)
+ for line in genedatafile:
+ fields = line.strip().split('\t')
+ blockCount = int(fields[7])
+ if blockCount < 2:  # models with fewer than two blocks cannot contribute splices
+ continue
+
+ uname = fields[0]
+ chrom = fields[1]
+ sense = fields[2]
+ chromstarts = fields[8][:-1].split(',')  # [:-1] drops the final character (presumably a trailing comma) before splitting
+ chromstops = fields[9][:-1].split(',')
+ exonLengths = []
+ totalLength = 0
+ for index in range(blockCount):  # NOTE(review): reuses `index`, which is not reset before the unique-read pass that numbers and counts reads with it - confirm
+ chromstarts[index] = int(chromstarts[index])
+ chromstops[index] = int(chromstops[index])
+ exonLengths.append(chromstops[index] - chromstarts[index])
+ totalLength += exonLengths[index]
+
+ geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+ mapDict[uname] = []
+ genedatafile.close()
+
+ rds = readDataset(outdbname, init, dataType, verbose=True)  # verbose is hard-coded True here, independent of the `verbose` argument
+
+ if cachePages > rds.getDefaultCacheSize():
+ if init:
+ rds.setDBcache(cachePages, default=True)
+ else:
+ rds.setDBcache(cachePages)
+
+ if not init and doIndex:  # appending and re-indexing: try to drop any existing index first
+ try:
+ if rds.hasIndex():
+ rds.dropIndex()
+ except:  # bare except: every failure here is reduced to the optional message below
+ if verbose:
+ print "couldn't drop Index"
+
+ propertyList = []
+ for arg in sys.argv:  # metadata pairs are taken from the process command line rather than from a parameter
+ if '::' in arg:
+ (pname, pvalue) = arg.strip().split('::')
+ if pname == 'flowcell' and paired:
+ pvalue = pvalue + '/' + pairID
+
+ propertyList.append((pname, pvalue))
+
+ if len(propertyList) > 0:
+ rds.insertMetadata(propertyList)
+
+ infile = open(filename,'r')  # peek at the first line only, to measure the read length
+ line = infile.readline()
+ fields = line.split()
+ readsize = len(fields[1])
+ readsizeString = str(readsize)
+ if dataType == 'RNA' and readsize > 32:
+ splicesizeString = '32'  # splice matches are only required over 32 bp when reads are longer
+ else:
+ splicesizeString = readsizeString
+
+ print 'read size: %d bp' % readsize
+ if init:
+ rds.insertMetadata([('readsize', readsize)])
+ rds.insertMetadata([('eland_mapped', 'True')])
+ if extended:
+ rds.insertMetadata([('eland_extended', 'True')])
+
+ if paired:
+ rds.insertMetadata([('paired', 'True')])
+
+ trim = -4
+ if dataType == 'RNA':
+ maxBorder = readsize + trim
+
+ insertList = []
+ infile = open(filename,'r')  # NOTE(review): rebinds `infile` without an explicit close of the handle opened for the first-line peek
+ print 'mapping unique reads...'
+ lineIndex = 0
+ for line in infile:  # ---- pass 1: unique reads ----
+ lineIndex += 1
+ if lineIndex > maxLines:
+ break
+
+ fields = line.split()
+ if fields[2] in ['QC','NM']:
+ continue
+
+ (matchType, bestMatch) = getUniqueMatch(fields[2])
+ if matchType == -1:  # no mismatch level with exactly one hit: not a unique read
+ continue
+
+ bestpos = []
+ try:
+ pos = fields[3].split(',')
+ except:
+ if verbose:
+ print 'problem with line: %s' % line.strip()
+ continue
+
+ matchDict = {0:[], 1:[], 2:[], 3:[]}  # candidate positions bucketed by mismatch count
+ if len(pos) == 1:
+ if 'splice' in pos:  # NOTE(review): `pos` is a list, so this tests for an element equal to 'splice', not a substring - confirm
+ continue
+
+ bestpos = pos
+ else:
+ currentChr = ''
+ for apos in pos:
+ if 'splice' in apos:
+ continue
+
+ if ':' in apos:
+ (front, back) = apos.split(':')
+ currentChr = front
+ else:
+ back = apos
+ apos = currentChr + ':' + apos  # entries without a reference name inherit the most recent one
+
+ if extended:
+ matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')  # mismatch count = number of nucleotide letters in the match descriptor
+ if matchType > 2:
+ matchType = 3
+ else:
+ matchType = int(apos[-1])  # last character is read as the mismatch count
+
+ matchDict[matchType].append(apos)
+ if bestMatch[matchType]:
+ bestpos.append(apos)
+
+ # for padded reads, mapped read might have more mismatches!
+ if len(bestpos) == 0:
+ # let's not worry about these yet.
+ if 'splice' in line:
+ continue
+
+ for matchType in [1, 2, 3]:
+ if len(matchDict[matchType]) > 0:
+ if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
+ bestpos = matchDict[matchType]
+ break
+
+ if len(bestpos) == 0 and verbose:
+ print "couldn't pick best read from line: %s" % line
+
+ for apos in bestpos:
+ try:
+ (chrom, back) = apos.split(':')
+ except:
+ continue
+
+ if 'splice' in chrom:
+ continue
+
+ if '/' in chrom:
+ chromfields = chrom.split('/')
+ chrom = chromfields[-1]  # keep only the last path component
+
+ if '.' in chrom:
+ try:
+ (chrom, fileExt) = chrom.split('.')
+ except:
+ if verbose:
+ print 'problem with chromosome on line %s' % line.strip()
+
+ continue
+
+ if extended:
+ if 'F' in back:
+ sense = '+'
+ (start, matchPart) = back.split('F')
+ else:
+ sense = '-'
+ (start, matchPart) = back.split('R')
+
+ start = int(start)
+ if matchPart == readsizeString:  # descriptor equal to the read length: no mismatches
+ matchType = ''
+ else:
+ matchType = decodeMismatches(fields[1], matchPart)
+ else:
+ start = int(back[:-2])
+ if back[-2] == 'F':
+ sense = '+'
+ else:
+ sense = '-'
+
+ stop = int(start) + readsize - 1
+ if paired:
+ readID = label + '-' + str(lineIndex) + '/' + pairID
+ else:
+ readID = label + '-' + str(index)
+
+ if len(chrom) > 0:
+ insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))  # in non-extended mode matchType is still the integer mismatch count here
+
+ if index % insertSize == 0:
+ rds.insertUniqs(insertList)
+ insertList = []
+ print '.',
+ sys.stdout.flush()
+
+ index += 1
+
+ if len(insertList) > 0:
+ rds.insertUniqs(insertList)
+ insertList = []
+
+ print
+ print '%d unique reads' % index
+ infile.close()
+
+ if dataType == 'RNA':  # ---- pass 2: spliced reads (RNA only) ----
+ print 'mapping splices...'
+ index = 0
+ lineIndex = 0
+ mapfile = open(filename,'r')
+ for line in mapfile:
+ lineIndex += 1
+ if lineIndex > maxLines:
+ break
+
+ if 'splice' not in line:
+ continue
+
+ fields = line.strip().split()
+ (matchType, bestMatch) = getUniqueMatch(fields[2])
+ if matchType == -1:
+ continue
+
+ bestpos = []
+ pos = fields[3].split(',')
+ matchDict = {0:[], 1:[], 2:[], 3:[]}  # NOTE(review): never appended to in this pass, so the fallback further down cannot select anything - confirm
+ if len(pos) == 1:
+ if 'chr' in pos:  # NOTE(review): `pos` is a list, so this tests for an element equal to 'chr', not a substring - confirm
+ continue
+
+ bestpos = pos
+ else:
+ currentSplice = ''
+ for apos in pos:
+ if 'splice' not in apos:
+ continue
+
+ if ':' in apos:
+ if delimiter == ':':  # old-style names use ':' inside the splice name as well, so the field count varies
+ try:
+ (extmodel, spliceID, regionStart, thepos) = apos.split(':')
+ except:
+ try:
+ (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
+ extmodel = extmodel1 + ':' + extmodel2
+ except:
+ print 'warning: could not process splice %s' % apos
+ continue
+
+ currentSplice = extmodel + ':' + spliceID + ':' + regionStart
+ else:
+ try:
+ (currentSplice, thepos) = apos.split(':')
+ except:
+ try:
+ (extmodel1, restSplice, thepos) = apos.split(':')
+ currentSplice = extmodel1 + ':' + restSplice
+ (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
+ except:
+ print 'warning: could not process splice %s' % apos
+ continue
+ else:
+ thepos = apos
+ apos = currentSplice + ':' + apos  # entries without a splice name inherit the most recent one
+
+ if extended:
+ matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
+ if matchType > 2:
+ matchType = 3
+
+ # if readsize > 32, we risk loosing pefect matches that go beyond our expanded genome splices, so only ask for 32bp match
+ if thepos[:2] == splicesizeString:  # NOTE(review): tests the first two characters of the whole position string, whereas the later check uses matchPart[:2] - confirm
+ matchType = 0
+ else:
+ matchType = int(apos[-1])
+
+ if bestMatch[matchType]:
+ bestpos.append(apos)
+
+ # for padded reads, mapped read might have more mismatches!
+ if len(bestpos) == 0:
+ for matchType in [1, 2, 3]:
+ if len(matchDict[matchType]) > 0:
+ if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
+ bestpos = matchDict[matchType]
+
+ break
+ if len(bestpos) == 0 and verbose:
+ print "couldn't pick best read from line: %s" % line
+
+ for apos in bestpos:
+ if delimiter == ':':
+ try:
+ (extmodel, spliceID, regionStart, thepos) = apos.split(':')
+ except:
+ try:
+ (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
+ extmodel = extmodel1 + ':' + extmodel2
+ except:
+ print 'warning: could not process splice %s' % apos
+ continue
+ else:
+ try:
+ (currentSplice, thepos) = apos.split(':')
+ except:
+ try:
+ (extmodel1, restSplice, thepos) = apos.split(':')
+ currentSplice = extmodel1 + ':' + restSplice
+ except:
+ print 'warning: could not process splice %s' % apos
+ continue
+
+ (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
+
+ modelfields = extmodel.split('/')
+ if len(modelfields) > 2:
+ model = string.join(modelfields[1:],'/')
+ else:
+ model = modelfields[1]  # the model name is whatever follows the first '/'
+
+ if model not in geneDict:
+ print fields
+ continue
+
+ (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+ if extended:
+ if 'F' in thepos:
+ rsense = '+'
+ (start, matchPart) = thepos.split('F')
+ else:
+ rsense = '-'
+ (start, matchPart) = thepos.split('R')
+
+ rstart = int(start) - 2
+ if matchPart == readsizeString:
+ matchType = ''
+ elif matchPart[:2] == splicesizeString:
+ matchType = ''
+ else:
+ matchType = decodeMismatches(fields[1], matchPart)
+ else:
+ rstart = int(thepos[:-2]) - 2
+ if thepos[-2] == 'F':
+ rsense = '+'
+ else:
+ rsense = '-'
+
+ if trim <= rstart <= maxBorder:
+ pass
+ else:
+ print rstart  # NOTE(review): looks like leftover debug output
+ continue
+
+ currentSplice = model + delimiter + spliceID + delimiter + regionStart
+ spliceID = int(spliceID)
+ lefthalf = maxBorder - rstart  # number of read bases assigned to the left side of the splice
+ if lefthalf < 1 or lefthalf > maxBorder:
+ continue
+
+ righthalf = readsize - lefthalf
+ startL = int(regionStart) + rstart
+ stopL = startL + lefthalf
+ startR = chromstarts[spliceID + 1]
+ stopR = chromstarts[spliceID + 1] + righthalf
+ if paired:
+ readName = label + '-' + str(lineIndex) + '/' + pairID
+ else:
+ readName = model + '-' + str(thepos)
+
+ insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
+ index += 1
+ if index % insertSize == 0:
+ rds.insertSplices(insertList)
+ print '.',
+ sys.stdout.flush()
+ insertList = []
+
+ if currentSplice not in seenSpliceList:  # list membership test: cost grows with the number of distinct splices
+ seenSpliceList.append(currentSplice)
+
+ mapfile.close()
+ if len(insertList) > 0:
+ rds.insertSplices(insertList)
+ insertList = []
+
+ print
+ print 'saw %d spliced reads accross %d distinct splices' % (index, len(seenSpliceList))
+
+ infile = open(filename,'r')  # ---- pass 3: multireads ---- NOTE(review): no close() for this handle is visible
+ print 'mapping multireads...'
+ lineIndex = 0
+ origReadid = rds.getMultiCount()  # numbering continues after the multireads already stored in the dataset
+ try:
+ readid = int(origReadid) + 1
+ except:
+ readid = 0
+ origReadid = 0
+
+ print 'starting at %d' % (readid + 1)
+
+ for line in infile:
+ lineIndex += 1
+ if lineIndex > maxLines:
+ break
+
+ fields = line.split()
+ if len(fields) < 4:
+ continue
+
+ if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
+ continue
+
+ (zero, one, two) = fields[2].split(':')
+ zero = int(zero)
+ one = int(one)
+ two = int(two)
+
+ bestMatch = [False] * readsize  # sized by read length; indexed below by mismatch count
+ if zero > 1:
+ bestMatch[0] = True
+ elif zero == 0 and one > 1:
+ bestMatch[1] = True
+ elif zero == 0 and one == 0 and two > 1:
+ bestMatch[2] = True
+ else:
+ continue
+
+ readcount = 0
+ bestpos = []
+ pos = fields[3].split(',')
+ matchDict = {0:[], 1:[], 2:[], 3:[]}
+ currentChr = ''
+ for apos in pos:
+ if ':' in apos:
+ try:
+ (front, back) = apos.split(':')
+ except:
+ if verbose:
+ print "problem splitting %s" % str(apos)
+ continue
+
+ currentChr = front
+ else:
+ back = apos
+ apos = currentChr + ':' + apos  # entries without a reference name inherit the most recent one
+
+ if extended:
+ matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
+ else:
+ matchType = int(apos[-1])
+
+ try:
+ matchDict[matchType].append(apos)
+ except:
+ matchDict[matchType] = [apos]  # mismatch counts above 3 get their own bucket on first use
+
+ if bestMatch[matchType]:
+ bestpos.append(apos)
+
+ # for padded reads, mapped read might have more mismatches!
+ if len(bestpos) == 0:
+ for matchType in [1, 2, 3]:
+ if len(matchDict[matchType]) > 0:
+ if len(matchDict[matchType]) > 1:
+ noSplice = True
+ for arg in matchDict[matchType]:
+ if 'splice' in arg:
+ noSplice = False
+
+ if noSplice:
+ bestpos = matchDict[matchType]
+ break
+
+ if len(bestpos) == 0 and verbose:
+ print "couldn't pick best read from line: %s" % line
+ continue
+
+ hasSplice = False
+ for apos in bestpos:
+ if 'splice' in apos:
+ hasSplice = True
+
+ # do not allow multireads that can also map accross splices for now
+ if hasSplice:
+ if verbose:
+ print "throwing out multiread because of splice conflict"
+ continue
+
+ if len(bestpos) > 0:
+ readid += 1
+
+ for apos in bestpos:
+ readcount += 1
+ (front, back) = apos.split(':')
+ chrom = front[:-3]  # drops the last three characters of the reference name (presumably a file extension such as ".fa" - TODO confirm)
+ if extended:
+ if 'F' in back:
+ sense = '+'
+ (start, matchPart) = back.split('F')
+ else:
+ sense = '-'
+ (start, matchPart) = back.split('R')
+
+ start = int(start)
+ if matchPart == readsizeString:
+ matchType = ''
+ else:
+ matchType = decodeMismatches(fields[1], matchPart)
+ else:
+ start = int(back[:-2])
+ if back[-2] == 'F':
+ sense = '+'
+ else:
+ sense = '-'
+
+ stop = int(start) + readsize  # NOTE(review): the unique-read pass uses start + readsize - 1 - confirm which convention is intended
+ readName = '%dx%d' % (readid, len(bestpos))
+ if paired:
+ readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
+
+ insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))  # weight is split evenly across the candidate positions
+ if index % insertSize == 0:  # `index` carries over from the previous pass; it is not reset for multireads
+ rds.insertMulti(insertList)
+ insertList = []
+ print '.',
+ sys.stdout.flush()
+
+ index += 1
+
+ if len(insertList) > 0:
+ rds.insertMulti(insertList)
+ insertList = []
+
+ print
+ print '%d multireads' % (readid - origReadid)
+
+ if doIndex:
+ print 'building index....'
+ rds.buildIndex(cachePages)
+
+
def getUniqueMatch(elandCode):
    """Classify an eland "zero:one:two" hit-count code for unique mapping.

    elandCode: three colon-separated integers giving the number of hits with
               zero, one and two mismatches.

    Returns (matchType, bestMatch). matchType is the lowest mismatch level
    that has exactly one hit while every lower level has none, or -1 when no
    such level exists. bestMatch is a four-element list of booleans with
    only the entry for matchType set (all False when matchType is -1).

    Raises ValueError when the code does not consist of three integers.
    """
    (zeroHits, oneHits, twoHits) = [int(part) for part in elandCode.split(':')]

    bestMatch = [False] * 4
    matchType = -1
    for level, hits in enumerate((zeroHits, oneHits, twoHits)):
        if hits == 1:
            matchType = level
            bestMatch[level] = True
            break

        # any other non-zero count at this level rules out a unique match
        if hits != 0:
            break

    return (matchType, bestMatch)
+
+
+def decodeMismatches(origSeq, code):
+ output = []
+ number = '0'
+ index = 0
+ for pos in code:
+ if pos.isdigit():
+ number += pos
+ else:
+ index += int(number) + 1
+ origNT = origSeq[index - 1]
+ output.append('%s%d%s' % (origNT, index, pos))
+ number = '0'
+
+ return string.join(output, ',')
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# makesitetrack.py
+# ENRAGE
+#
+
+import sys, string, optparse
+
+print "%prog: version 2.1"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog sitefile outbedfile [--noheader] [--stype fieldID] [--color xx,yy,zz] [--append] [--exploded]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--noheader", action="store_true", dest="noHeader")
+ parser.add_option("--stype", type="int", dest="stypeID")
+ parser.add_option("--color", dest="color")
+ parser.add_option("--append", action="store_true", dest="append")
+ parser.add_option("--exploded", action="store_false", dest="compact")
+ parser.set_defaults(stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ sys.exit(1)
+
+ infile = args[0]
+ outfileName = args[1]
+
+ makesitetrack(infile, outfileName, options.stypeID, options.color, options.append, options.compact, options.noHeader)
+
+
+def makesitetrack(infileName, outFileName, stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False):
+ if stypeID is not None:
+ doStype = True
+ else:
+ doStype = False
+ stypeID = 4
+
+ infile = open(infileName)
+
+ if append:
+ outfile = open(outFileName, "a")
+ else:
+ outfile = open(outFileName, "w")
+
+ try:
+ (name, extension) = outFileName.split(".")
+ except ValueError:
+ name = outFileName.split(".")[:-1]
+ name = string.join(name, "_")
+
+ if not noHeader:
+ outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % name)
+
+ count = 1
+ for line in infile:
+ if line[0] == "#":
+ continue
+
+ fields = line.split()
+ if compact:
+ (chrom, loc) = fields[0].split(":")
+ (start, stop) = loc.split("-")
+ score = fields[1]
+ else:
+ chrom = fields[1]
+ start = fields[2]
+ stop = fields[3]
+ score = 1.
+
+ stype = "%s-%s" % (name, str(count))
+ if doStype:
+ try:
+ stype = fields[stypeID]
+ if stype == "11":
+ stype = "can"
+ elif stype == "0":
+ stype = "half"
+ else:
+ stype = "NC" + stype
+ except IndexError:
+ pass
+
+ sense = fields[-2].strip()
+ if sense not in ["+", "-"]:
+ sense = "+"
+
+ outfile.write("%s\t%s\t%d\t%s\t%s\t%s\t-\t-\t%s\n" % (chrom, start, int(stop) + 1, stype, score, sense, color))
+ count += 1
+
+ outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# makewiggle.py
+# ENRAGE
+#
+import sys, optparse
+from commoncode import readDataset
+
+print "%prog: version 6.7"
+
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s name rdsfile outfilename [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--raw", action="store_false", dest="doNormalize")
+ parser.add_option("--color", dest="color")
+ parser.add_option("--altcolor", dest="altColor")
+ parser.add_option("--chrom", dest="limitChrom")
+ parser.add_option("--shift", type="int", dest="shift")
+ parser.add_option("--split", action="store_true", dest="doSplit")
+ parser.add_option("--listfile", dest="listfilename")
+ parser.add_option("--listprefix", dest="listPrefix")
+ parser.add_option("--group", dest="group")
+ parser.add_option("--startPriority", type="float", dest="startPriority")
+ parser.add_option("--skiprandom", action="store_true", dest="skipRandom")
+ parser.add_option("--nomulti", action="store_false", dest="withMulti")
+ parser.add_option("--splices", action="store_true", dest="withSplices")
+ parser.add_option("--singlebase", action="store_true", dest="doSingle")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+ parser.add_option("--stranded", dest="strand")
+ parser.add_option("--maxchunk", type="int", dest="chunk")
+ parser.set_defaults(doNormalize=True, color=None, altColor="", limitChrom=None,
+ shift=0, doSplit=False, listfilename=None, listPrefix="",
+ group="", startPriority=0.01, skipRandom=False, withMulti=True,
+ withSplices=False, doSingle=False, cachePages=-1, enforceChr=False,
+ strand=None, chunk=20)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ name = args[0]
+ hitfilename = args[1]
+ outfilename = args[2]
+
+ makewiggle(name, hitfilename, outfilename, options.doNormalize, options.color, options.altColor,
+ options.limitChrom, options.shift, options.doSplit, options.listfilename, options.listPrefix,
+ options.group, options.startPriority, options.skipRandom, options.withMulti,
+ options.withSplices, options.doSingle, options.cachePages, options.enforceChr, options.strand,
+ options.chunk)
+
+
+def makewiggle(name, hitfilename, outfilename, doNormalize=True, color=None, altColor="",
+ limitChrom=None, shift=0, doSplit=False, listfilename=None, listPrefix="",
+ group="", startPriority=0.01, skipRandom=False, withMulti=True, withSplices=False,
+ doSingle=False, cachePages=-1, enforceChr=False, strand=None, chunk=20):
+
+ priorityIncrement = 0.01
+ wigType = "bedGraph"
+
+ if color is not None:
+ colorString = " color=%s" % color
+ else:
+ colorString = ""
+
+ if altColor:
+ colorString += " altcolor=%s" % altColor
+
+ doList = False
+ if listfilename is not None:
+ doList = True
+
+ chromLimit = False
+ if limitChrom is not None:
+ chromLimit = True
+
+ if group:
+ groupName = "group=%s" % group
+
+ doCache = False
+ if cachePages > 0:
+ doCache = True
+
+ maxSpan = chunk * 1000000
+
+ isStranded = False
+ strandedDirection = "both"
+ if strand is not None:
+ isStranded = True
+ if strand == "plus":
+ strandedDirection = "plusOnly"
+ elif strand == "minus":
+ strandedDirection = "minusOnly"
+
+ print "will keep track of %s strand(s)" % strandedDirection
+
+ if shift:
+ print "Will shift reads by +/- %d bp according to their sense" % shift
+ name += "shift=%d" % shift
+
+ hitRDS = readDataset(hitfilename, verbose=True, cache=doCache)
+
+ if cachePages > hitRDS.getDefaultCacheSize():
+ hitRDS.setDBcache(cachePages)
+
+ readlen = hitRDS.getReadSize()
+
+ if doNormalize:
+ normalizeBy = len(hitRDS) / 1000000.
+ else:
+ normalizeBy = 1.
+
+ if doList:
+ listfile = open(listfilename, "w")
+
+ priority = startPriority
+ if not doSplit:
+ outfile = open(outfilename, "w")
+ if doList:
+ listfile.write("%s%s\n" % (listPrefix, outfilename))
+
+ outfile.write('track type=%s name="%s" %s priority=%.3f visibility=full%s\n' % (wigType, name, groupName, priority, colorString))
+
+ chromList = hitRDS.getChromosomes()
+ chromList.sort()
+ for achrom in chromList:
+ if enforceChr and ("chr" not in achrom):
+ continue
+
+ if chromLimit and achrom != limitChrom:
+ continue
+
+ if skipRandom and "random" in achrom:
+ continue
+
+ if doSplit:
+ outfile = open("%s.%s" % (outfilename, achrom), "w")
+ if doList:
+ listfile.write("%s%s.%s\n" % (listPrefix, outfilename, achrom))
+
+ outfile.write('track type=%s name="%s %s" %s priority=%.3f visibility=full%s\n' % (wigType, name, achrom, groupName, priority, colorString))
+ priority += priorityIncrement
+
+ lastNT = hitRDS.getMaxCoordinate(achrom, doMulti=withMulti, doSplices=withSplices) + readlen
+ spanStart = 0
+
+ previousVal = 0
+ previousStart = 1
+ lineIndex = 0
+ for spanStop in xrange(maxSpan, lastNT+maxSpan, maxSpan):
+ if spanStop > lastNT:
+ spanStop = lastNT
+
+ print achrom, spanStart, spanStop
+ chromModel = hitRDS.getChromProfile(achrom, spanStart, spanStop, withMulti, withSplices, normalizeBy, isStranded, strandedDirection, shiftValue=shift)
+
+ for index in xrange(len(chromModel)):
+ currentVal = chromModel[index]
+ if doSingle:
+ outline = "%s %d %.4f\n" % (achrom, spanStart + index, currentVal)
+ outfile.write(outline)
+ continue
+
+ if currentVal == previousVal:
+ continue
+
+ if currentVal != previousVal:
+ if previousVal != 0:
+ lastpos = index + spanStart
+ outline = "%s %d %d %.4f\n" % (achrom, previousStart, lastpos, previousVal)
+ outfile.write(outline)
+ lineIndex += 1
+
+ previousVal = currentVal
+ previousStart = index + spanStart
+
+ currentVal = 0
+ del chromModel
+ spanStart = spanStop + 1
+
+ if doSplit:
+ outfile.close()
+
+ if doSingle:
+ print index + 1
+ else:
+ print lineIndex
+
+ if not doSplit:
+ outfile.close()
+
+ if doList:
+ listfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset
+from cistematic.genomes import Genome
+from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+
+print "%prog: version 5.6"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--gidField", type="int", dest="fieldID")
+ parser.add_option("--maxLength", type="float", dest="maxLength")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+ parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
+ replaceModels=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(sys.argv) < 6:
+ print usage
+ print "\twhere splicecountfile can be set to 'none' to not count splices\n"
+ sys.exit(1)
+
+ genome = args[0]
+ hitfile = args[1]
+ uniquecountfile = args[2]
+ splicecountfile = args[3]
+ outfile = args[4]
+
+ candidateLines = []
+ acceptedfilename = ""
+ if len(args) > 5:
+ try:
+ candidatefile = open(args[5])
+ candidateLines = candidatefile.readlines()
+ candidatefile.close()
+ acceptedfilename = args[6]
+ except IndexError:
+ pass
+
+ normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
+ candidateLines, acceptedfilename, options.fieldID,
+ options.maxLength, options.doCache, options.extendGenome,
+ options.replaceModels)
+
+
+def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
+ outfilename, candidateLines=[], acceptedfilename="",
+ fieldID=0, maxLength=1000000000., doCache=False,
+ extendGenome="", replaceModels=False):
+ # Compute a per-gene RPKM from unique (and optionally splice) read counts
+ # and write "<gid> <gene> <length_kb> <rpkm>" lines to outfilename.
+ #
+ # genome               -- genome name handed to cistematic's Genome()
+ # hitfile              -- RDS file; only its unique-read count is used here
+ # uniquecountfilename  -- per-gene counts: gid in column fieldID, gene
+ #                         symbol in column 1, count in the last column
+ # splicecountfilename  -- same layout, or the string "none" to skip it
+ # candidateLines       -- lines describing candidate regions; columns
+ #                         0=gene, 1=gid, 3=chrom, 4=start, 5=stop, 6=count
+ # acceptedfilename     -- where accepted candidate regions are written
+ # maxLength            -- upper clamp (kb) applied to the gene length
+ # extendGenome/replaceModels -- passed to Genome.extendFeatures()
+ #
+ # NOTE(review): candidateLines=[] is a mutable default; it is only read in
+ # this function, never mutated.
+
+ uniquecountfile = open(uniquecountfilename)
+
+ # NOTE(review): acceptedfile is only bound when acceptedfilename is
+ # non-empty, but it is written below whenever candidate regions exist;
+ # candidateLines without acceptedfilename looks like a NameError - confirm.
+ if acceptedfilename:
+ acceptedfile = open(acceptedfilename, "w")
+
+ dosplicecount = False
+ if splicecountfilename != "none":
+ dosplicecount = True
+ splicecountfile = open(splicecountfilename)
+
+ if extendGenome:
+ if replaceModels:
+ print "will replace gene models with %s" % extendGenome
+ else:
+ print "will extend gene models with %s" % extendGenome
+
+ if doCache:
+ cacheGeneDB(genome)
+ hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+ print "%s cached" % genome
+ else:
+ hg = Genome(genome, inRAM=True)
+
+ if extendGenome != "":
+ hg.extendFeatures(extendGenome, replace=replaceModels)
+
+ RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)
+ uniqcount = RDS.getUniqsCount()
+ print "%d unique reads" % uniqcount
+
+ splicecount = 0
+ countDict = {}
+ gidList = []
+ # gids that appear only in candidateLines, not in the unique count file
+ farList = []
+ candidateDict = {}
+
+ gidToGeneDict = {}
+
+ featuresDict = hg.getallGeneFeatures()
+ print "got featuresDict"
+
+ outfile = open(outfilename, "w")
+
+ for line in uniquecountfile:
+ fields = line.strip().split()
+ gid = fields[fieldID]
+ gene = fields[1]
+ countDict[gid] = float(fields[-1])
+ gidList.append(gid)
+ gidToGeneDict[gid] = gene
+
+ uniquecountfile.close()
+
+ if dosplicecount:
+ for line in splicecountfile:
+ fields = line.strip().split()
+ gid = fields[fieldID]
+ # bare except: any failure (e.g. a gid missing from countDict) prints
+ # the fields and skips the line, so it does not add to splicecount
+ try:
+ countDict[gid] += float(fields[-1])
+ except:
+ print fields
+ continue
+
+ splicecount += float(fields[-1])
+
+ splicecountfile.close()
+
+ for line in candidateLines:
+ # any line containing '#' anywhere is treated as a comment
+ if "#" in line:
+ continue
+
+ fields = line.strip().split()
+ gid = fields[1]
+ gene = fields[0]
+ if gid not in gidList:
+ if gid not in farList:
+ farList.append(gid)
+ gidToGeneDict[gid] = gene
+
+ if gid not in countDict:
+ countDict[gid] = 0
+
+ countDict[gid] += float(fields[6])
+
+ if gid not in candidateDict:
+ candidateDict[gid] = []
+
+ # tuple layout: (count, length_bp, chrom, start, stop)
+ candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+
+ # both scales are in millions of reads
+ totalCount = (uniqcount + splicecount) / 1000000.
+ uniqScale = uniqcount / 1000000.
+ for gid in gidList:
+ gene = gidToGeneDict[gid]
+ featureList = []
+ # features are looked up by gid first, then by gene symbol; when both
+ # fail the pair is printed and the gene gets an empty feature list
+ try:
+ featureList = featuresDict[gid]
+ except:
+ try:
+ featureList = featuresDict[gene]
+ except:
+ print gene, gid
+
+ # gene length in kb, summed over distinct (start, stop) features
+ newfeatureList = []
+ geneLength = 0.
+ for (ftype, chrom, start, stop, sense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+ geneLength += (abs(start - stop) + 1.) / 1000.
+
+ # clamp the length to the range [0.1, maxLength] kb
+ if geneLength < 0.1:
+ geneLength = 0.1
+ elif geneLength > maxLength:
+ geneLength = maxLength
+
+ rpm = countDict[gid] / totalCount
+ rpkm = rpm / geneLength
+ if gid in candidateDict:
+ for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+ # NOTE(review): cLength == 0 (start == stop) would divide by zero here
+ cratio = cCount / (cLength / 1000.)
+ cratio = (uniqScale * cratio) / totalCount
+ # candidates with less than a tenth of the gene's rpkm are rejected
+ if 10. * cratio < rpkm:
+ continue
+
+ # NOTE(review): cCount was already added to countDict[gid] while
+ # parsing candidateLines, so it appears to be counted twice - confirm
+ countDict[gid] += cCount
+ geneLength += cLength / 1000.
+ acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+
+ rpm = countDict[gid] / totalCount
+ rpkm = rpm / geneLength
+ outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
+
+ # candidate-only genes: the length is the sum of their candidate regions
+ for gid in farList:
+ gene = gidToGeneDict[gid]
+ geneLength = 0
+ for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+ geneLength += cLength / 1000.
+
+ if geneLength < 0.1:
+ continue
+
+ for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+ cratio = cCount / (cLength / 1000.)
+ cratio = cratio / totalCount
+ acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+
+ rpm = countDict[gid] / totalCount
+ rpkm = rpm / geneLength
+ outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
+
+ outfile.close()
+ # acceptedfile may never have been opened; the bare except hides that
+ try:
+ acceptedfile.close()
+ except:
+ pass
+
+ if doCache:
+ uncacheGeneDB(genome)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset
+
+print "%prog: version 3.5" % sys.argv[0]
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--multifraction", action="store_true", dest="reportfraction")
+ parser.add_option("--multifold", action="store_true", dest="reportFold")
+ parser.add_option("--minrpkm", type="float", dest="minThreshold")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--withGID", action="store_true", dest="writeGID")
+ parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
+ doCache=False, writeGID=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 4:
+ print usage
+ sys.exit(1)
+
+ rdsfilename = argv[1]
+ expandedRPKMfile = args[3]
+ multicountfile = args[2]
+ outfilename = args[3]
+
+ normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
+ options.reportFraction, options.reportFold, options.minThreshold,
+ options.doCache, options.writeGID)
+
+
+def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
+ reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
+ writeGID=False):
+ # Combine the expanded RPKM values with multiread counts and write the
+ # final per-gene RPKM table to outfilename, sorted by decreasing RPKM.
+ #
+ # rdsfilename          -- RDS file; supplies unique, splice and multi counts
+ # expandedRPKMfilename -- lines of: gid, symbol, ..., length, rpkm
+ #                         (length and rpkm are the last two columns)
+ # multicountfilename   -- lines with the gid first and the count last
+ # reportFraction       -- add a multi/all column (takes precedence over
+ #                         reportFold)
+ # reportFold           -- add an all/uniq column
+ # minThreshold         -- genes with an rpkm below this are not reported
+ # writeGID             -- prefix every output line with the gid
+
+ expandedRPKMfile = open(expandedRPKMfilename)
+ multicountfile = open(multicountfilename)
+
+ if reportFraction:
+ print "reporting fractional contribution of multireads"
+ reportFold = False
+ elif reportFold:
+ print "reporting fold contribution of multireads"
+
+ RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
+ uniqcount = RDS.getUniqsCount()
+ splicecount = RDS.getSplicesCount()
+ multicount = RDS.getMultiCount()
+ countDict = {}
+ multicountDict = {}
+ lengthDict = {}
+ gidList = []
+
+ # both totals are expressed in millions of reads
+ uniqspliceCount = (uniqcount + splicecount) / 1000000.
+ totalCount = (uniqcount + splicecount + multicount) / 1000000.
+
+ symbolDict = {}
+
+ for line in expandedRPKMfile:
+ fields = line.strip().split()
+ lineGID = fields[0]
+ symbolDict[lineGID] = fields[1]
+ # turn rpkm back into a count: rpkm * length * (millions of reads)
+ countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
+ lengthDict[lineGID] = float(fields[-2])
+ multicountDict[lineGID] = 0
+ if lineGID not in gidList:
+ gidList.append(lineGID)
+
+ expandedRPKMfile.close()
+
+ for line in multicountfile:
+ fields = line.strip().split()
+ gid = fields[0]
+ if gid in countDict:
+ countDict[gid] += float(fields[-1])
+ multicountDict[gid] = float(fields[-1])
+ else:
+ print "could not find gid %s in dictionaries" % gid
+
+ multicountfile.close()
+
+ outfile = open(outfilename, "w")
+ outheader = "#"
+ if writeGID:
+ outheader += "GID\t"
+
+ outheader += "gene\tlen_kb\tRPKM"
+ if reportFraction:
+ outheader += "\tmulti/all"
+ elif reportFold:
+ outheader += "\tall/uniq"
+
+ outheader += "\n"
+ outfile.write(outheader)
+
+ outlineList = []
+ index = 0
+ for gid in gidList:
+ outline = ""
+ gene = symbolDict[gid]
+ rpm = countDict[gid] / totalCount
+ # NOTE(review): a zero length in the input would divide by zero here
+ rpkm = rpm / lengthDict[gid]
+ if rpkm < minThreshold:
+ continue
+
+ if writeGID:
+ outline = "%s\t" % gid
+
+ # NOTE(review): index is incremented before the try below, so a gene that
+ # is skipped there is still counted in the final "returned" message
+ index += 1
+ try:
+ multirpm = multicountDict[gid] / totalCount
+ multirpkm = multirpm / lengthDict[gid]
+ except:
+ print "problem with %s - skipping " % gid
+ continue
+
+ if reportFraction or reportFold:
+ # bare except: any failure (e.g. division by zero) reports 0
+ try:
+ if reportFraction:
+ multivalue = multirpkm / rpkm
+ else:
+ if rpm > multirpm:
+ uniqrpkm = (rpm - multirpm) / lengthDict[gid]
+ multivalue = rpkm / uniqrpkm
+ elif rpkm > 0.01:
+ multivalue = 100.
+ else:
+ multivalue = 1.0
+ except:
+ multivalue = 0
+
+ outline += "%s\t%.3f\t%.2f\t%.2f\n" % (gene, lengthDict[gid], rpkm, multivalue)
+ outlineList.append((rpkm, outline))
+ else:
+ outline += "%s\t%.3f\t%.2f\n" % (gene, lengthDict[gid], rpkm)
+ outlineList.append((rpkm, outline))
+
+ # sort ascending on (rpkm, line), then reverse for highest rpkm first
+ outlineList.sort()
+ outlineList.reverse()
+
+ for (rpkm, line) in outlineList:
+ outfile.write(line)
+
+ outfile.close()
+
+ print "returned %d genes" % index
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# partition.py
+# ENRAGE
+#
+""" usage: python partition.py mergeID regionfile1[,regionfile2,...] combpartitionfile [--minFeature bp] [--padregion bp] [--mergeregion bp] [--nomerge] [--log altlogfile] [--locID] [--norandom] [--chromField fieldNum]
+ where the regionfiles must be comma-separated with no white space
+ --minFeature controls the size of the smallest partition
+"""
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, string, optparse
+from commoncode import getMergedRegions, writeLog
+
+versionString = '%s: version 2.0' % sys.argv[0]
+print versionString
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--minFeature", type="int", dest="minFeature",
+ help="size of smallest partition")
+ parser.add_option("--chromField", type="int", dest="cField",
+ help="num chromosome fields")
+ parser.add_option("--padregion", type="int", dest="padregion",
+ help="padding on each side of region")
+ parser.add_option("--mergeregion", type="int", dest="mergeregion",
+ help="bp threshold to merge regions")
+ parser.add_option("--nomerge", action="store_false", dest="merging",
+ help="do not merge regions")
+ parser.add_option("--log", dest="logfilename",
+ help="log file")
+ parser.add_option("--locID", action="store_true", dest="locID",
+ help="use location as region ID")
+ parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
+ help="ignore 'random' chromosomes")
+ parser.set_defaults(minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log")
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ mergeID = args[0]
+ regionfiles = args[1]
+ outfilename = args[2]
+
+ if options.padregion:
+ print "padding %d bp on each side of a region" % options.padregion
+
+ if options.mergeregion:
+ print "merging regions closer than %d bp" % options.mergeregion
+
+ if options.locID:
+ print "using locations as region ID"
+
+ if options.ignoreRandom:
+ print "ignoring 'random' chromosomes"
+
+ partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField, options.padregion, options.locID, options.ignoreRandom, options.mergeregion, options.merging, options.logfilename)
+
+
+def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log"):
+
+ writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+
+ allregionsDict = {}
+ regionFileList = regionfiles.split(',')
+ numRegions = len(regionFileList)
+ chromList = []
+ for regionID in range(numRegions):
+ allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion, minHits=-1, fullChrom = True, verbose = True, chromField = cField, doMerge=merging, pad=padregion)
+ for achrom in allregionsDict[regionID]:
+ if achrom not in chromList:
+ chromList.append(achrom)
+
+ outregionDict = {}
+
+ chromList = sorted(chromList)
+
+ for chrom in chromList:
+ if ignoreRandom and 'random' in chrom:
+ continue
+
+ outregionDict[chrom] = []
+ pointList = []
+ for regionID in range(numRegions):
+ if chrom in allregionsDict[regionID]:
+ for (rstart, rstop, rlength) in allregionsDict[regionID][chrom]:
+ pointList.append(rstart)
+ pointList.append(rstop)
+
+ pointList.sort()
+ start = 0
+ for point in pointList:
+ if (point - start) > minFeature:
+ outregionDict[chrom].append((start, point - 1, point - 1 - start))
+ start = point
+
+ outfile = open(outfilename, 'w')
+ if locID:
+ outfile.write('#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n')
+ else:
+ outfile.write('#labelID\tchrom\tstart\tstop\tlength_kb\n')
+
+ index = 0
+ for chrom in outregionDict:
+ for (start, stop, length) in outregionDict[chrom]:
+ index += 1
+ if locID:
+ outfile.write("%s:%d-%d\t%s\t%d\t%d\t%.3f\n" % (chrom, start, stop, chrom, start, stop, length/1000.))
+ else:
+ outfile.write("%s%d\t%s\t%d\t%d\t%.3f\n" % (mergeID, index, chrom, start, stop, length/1000.))
+
+ message = "%s was partitioned into %d regions" % (mergeID, index)
+ print message
+ writeLog(logfilename, versionString, message)
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# peakstoregion.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+
+print "%s: version 1.0" % sys.argv[0]
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ if len(argv) < 3:
+ print "usage: python %s peakfile outfile [radius] [chromField] [posField] [labelField] [datafield]" % sys.argv[0]
+ sys.exit(1)
+
+ peakfile = argv[1]
+ outfile = argv[2]
+
+ radius = 500
+ chromField = 2
+ posField = 3
+ labelField = 1
+ dataField = -1
+
+ if len(argv) > 3:
+ radius = int(argv[3])
+
+ if len(argv) > 4:
+ chromField = int(argv[4])
+
+ if len(argv) > 5:
+ posField = int(argv[5])
+
+ if len(argv) > 6:
+ labelField = int(argv[6])
+
+ if len(argv) > 7:
+ dataField = int(argv[7])
+
+ peakstoregion(peakfile, outfile, radius, chromField, posField, labelField, dataField)
+
+
+def peakstoregion(peakfilename, outfilename, radius=500, chromField=2, posField=3, labelField=1, dataField=-1):
+ peakfile = open(peakfilename)
+ outfile = open(outfilename, "w")
+
+ for line in peakfile:
+ fields = line.strip().split()
+ label = "REGION"
+ try:
+ label = fields[labelField]
+ except IndexError:
+ pass
+
+ start = int(fields[posField]) - radius
+ stop = int(fields[posField]) + radius
+ outfile.write("%s\t%s\t%d\t%d\t%s\n" % (label, fields[chromField], start, stop, fields[dataField]))
+
+ outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# plotbardist.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 12/13/07.
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import optparse
+import matplotlib
+from pylab import *
+from math import *
+
+
+print "%prog: version 3.2"
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog infile1 [infile2] [infile3] [options] outfile.png"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--bins", type="int", dest="bins")
+ parser.add_option("--field", type="int", dest="binnedField")
+ parser.add_option("--binSize", type="float", dest="binLength")
+ parser.add_option("--doLog", type="int", dest="logBase")
+ parser.add_option("--ymax", type="int", dest="maxY")
+ parser.add_option("--xlabel", dest="xLabel")
+ parser.add_option("--ylabel", dest="yLabel")
+ parser.add_option("--binLabels", dest="binLabels", help="comma separated list")
+ parser.add_option("--title", dest="figTitle")
+ parser.add_option("--legend", dest="barsLegend", help="comma separated list")
+ parser.add_option("--xoffset", type="float", dest="pointOffset")
+ parser.add_option("--figsize", dest="figSizes", help="x,y pair")
+ parser.set_defaults(bins=10, binnedField=-1, binLength=-1, logBase=None, maxY=0,
+ xLabel="bins", yLabel="count", binLabels=None, figTitle="",
+ barsLegend=None, pointOffset=0., figSizes=None)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+
+ if len(args) < 2 or len(args) > 4:
+ print usage
+ print "where labelList and legendList are comma delimited strings of the form 'labelA,labelB,...,labelN'"
+ sys.exit(1)
+
+ fileList = args[:-1]
+ pngfilename = args[-1]
+
+ plotbardist(fileList, pngfilename, options.bins, options.binnedField, options.binLength,
+ options.logBase, options.maxY, options.xLabel, options.yLabel, options.binLabels,
+ options.figTitle, options.barsLegend, options.pointOffset, options.figSizes)
+
+
+def plotbardist(fileList, pngfilename, bins=10, binnedField=-1, binLength=-1, logBase=None,
+ maxY=0, xLabel="bins", yLabel="count", binLabels=None, figTitle="",
+ barsLegend=None, pointOffset=0., figSizes=None):
+ # Plot side-by-side bar histograms of one column from up to three files
+ # and save the figure to pngfilename.
+ #
+ # fileList    -- 1 to 3 whitespace-delimited input files
+ # bins        -- number of histogram bins
+ # binnedField -- column index of the value to bin
+ # binLength   -- bin width; when negative it is derived from the data range
+ # logBase     -- when not None, values below 1 are raised to 1 and the log
+ #                in this base is binned instead of the value
+ # maxY        -- upper y limit, applied only when > 0
+ # binLabels / barsLegend -- comma separated strings
+ # pointOffset -- added to every value before binning
+ # figSizes    -- "x,y" figure size string
+
+ # NOTE(review): pylab is already imported at module level, so selecting the
+ # backend here may come too late to take effect - confirm.
+ matplotlib.use("Agg")
+ # bar width and per-file x offsets, keyed by the number of input files;
+ # any other file count raises KeyError below
+ plotParameters = {1: {"width": 0.5,
+ "offset": [-0.25]},
+ 2: {"width": 0.3,
+ "offset": [-0.3, 0]},
+ 3: {"width": 0.2,
+ "offset": [-0.2, 0., 0.2]}
+ }
+
+ colorList = ["b", "r", "c"]
+ width = plotParameters[len(fileList)]["width"]
+ offset = plotParameters[len(fileList)]["offset"]
+
+ doLog = False
+ if logBase is not None:
+ doLog = True
+ print "taking log%d of x datapoints" % logBase
+ xLabel = "log%d(%s)" % (logBase, xLabel)
+ else:
+ logBase = 10
+
+ if figSizes is not None:
+ sizes = figSizes.strip().split(",")
+ figure(figsize=(float(sizes[0]),float(sizes[1])))
+
+ doLabels = False
+ if binLabels is not None:
+ binLabels = binLabels.strip().split(",")
+ doLabels = True
+ else:
+ binLabels = []
+
+ if barsLegend is not None:
+ barsLegend = barsLegend.strip().split(",")
+ else:
+ barsLegend = []
+
+ ind2 = arange(bins)
+
+ bars = []
+ barsColors = []
+ index = 0
+ for fileName in fileList:
+ # NOTE(review): aFile is never closed in this function
+ aFile = open(fileName)
+ distbin = bins * [0]
+
+ dataList = []
+ for line in aFile:
+ fields = line.strip().split()
+ # bare except: lines whose field is missing or not numeric are skipped
+ try:
+ point = float(fields[binnedField]) + pointOffset
+ if doLog:
+ if point < 1:
+ point = 1
+
+ point = log(point, logBase)
+
+ dataList.append(point)
+ except:
+ continue
+
+ print "%d data points" % len(dataList)
+
+ dataList.sort()
+ # NOTE(review): an input without usable data points raises IndexError here
+ print "low = %f high = %f" % (dataList[0], dataList[-1])
+
+ # binLength is reassigned here, so a width derived from one file is
+ # reused for the files that follow it
+ if binLength < 0:
+ binLength = abs(dataList[-1] - dataList[0]) / bins
+
+ for point in dataList:
+ # bare except: anything that cannot be binned (e.g. an index past the
+ # last bin) is counted in the last bin
+ # NOTE(review): a negative index does not raise; it counts from the end
+ try:
+ distbin[int(round(point/binLength))] += 1
+ except:
+ distbin[-1] += 1
+
+ print binLength, int(round(point/binLength))
+
+ bars.append(bar(ind2 + offset[index], distbin, width, color=colorList[index]))
+ barsColors.append(bars[-1][0])
+
+ print distbin
+ # find the first bin count whose cumulative sum reaches half of the data
+ halfCount = sum(distbin) / 2
+ median = 0
+ foundMedian = False
+ while not foundMedian:
+ if sum(distbin[:median]) < halfCount:
+ median += 1
+ else:
+ foundMedian = True
+
+ print median
+ index += 1
+
+ xlim(-1 * width - 0.2, bins + 0.2)
+
+ if len(barsLegend) > 0:
+ legend(barsColors, barsLegend)
+
+ ylabel(yLabel)
+ xlabel(xLabel)
+
+ if doLabels:
+ setp(gca(), "xticklabels", binLabels)
+
+ if maxY > 0:
+ ylim(0, maxY)
+
+ if len(figTitle) > 0:
+ title(figTitle)
+
+ gca().get_xaxis().tick_bottom()
+ gca().get_yaxis().tick_left()
+
+ savefig(pngfilename)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# plotnomogram.py
+# ENRAGE
+#
+
+import sys
+
+import matplotlib
+from pylab import *
+import matplotlib.axes
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+print "%s: version 1.1" % sys.argv[0]
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ if len(argv) < 5:
+ print "usage: python %s maxdev xreads infile outpng" % argv[0]
+ sys.exit(1)
+
+ maxdev = float(argv[1])
+ xreads = float(argv[2])
+ infilename = argv[3]
+ outfilename = argv[4]
+
+ plotnomogram(maxdev, xreads, infilename, outfilename)
+
+
+def plotnomogram(maxdev, xreads, infilename, outfilename):
+ # Plot, per expression class, the fraction of genes whose value at each
+ # sampling percentage is within maxdev (relative) of their final value,
+ # and save the figure to outfilename.
+ #
+ # maxdev     -- maximum allowed relative deviation from the final value
+ # xreads     -- x-axis value of the full data set; each sampling point is
+ #               placed at xreads * percent / 100
+ # infilename -- table whose first line lists the percentages (first token
+ #               dropped) and whose other lines hold a gene name followed
+ #               by one value per percentage
+
+ # NOTE(review): pylab is already imported at module level, so selecting the
+ # backend here may come too late to take effect - confirm.
+ matplotlib.use("Agg")
+ # NOTE(review): infile is never closed in this function
+ infile = open(infilename)
+ line = infile.readline().strip()
+
+ percentages = line.split()
+ del percentages[0]
+
+ listWidth = len(percentages)
+
+ geneValues = {}
+
+ for line in infile:
+ fields = line.strip().split()
+ geneValues[fields[0]] = []
+ for pos in range(listWidth):
+ geneValues[fields[0]].append(float(fields[1 + pos]))
+
+ # categories here are: 3000+, 2999-300, 299-30, 29-3
+ genes3000p = []
+ genes300p = []
+ genes30p = []
+ genes3p = []
+
+ # the first value column is used as each gene's final level; genes below 3
+ # are not put in any category
+ for gene in geneValues:
+ finalLevel = geneValues[gene][0]
+ if finalLevel >= 3000:
+ genes3000p.append(gene)
+ elif finalLevel >= 300:
+ genes300p.append(gene)
+ elif finalLevel >= 30:
+ genes30p.append(gene)
+ elif finalLevel >= 3:
+ genes3p.append(gene)
+
+ organizedList = [genes3000p, genes300p, genes30p, genes3p]
+ listNames = ["3000+ RPKM ", "300-2999 RPKM", "30-299 RPKM ", "3-29 RPKM "]
+ listColors = ["k", "c", "m", "r"]
+ geneCounts = {}
+ # x values (oldscores) and per-category y values (newscores) both start at 0
+ oldscores = [0.]
+ newscores = {}
+ for name in listNames:
+ newscores[name] = [0.]
+
+ index = 0
+ # the first percentage column is the final level, so it is skipped here
+ for percent in percentages[1:]:
+ oldscores.append(xreads * float(percent) / 100.)
+ index += 1
+ listindex = 0
+ for geneList in organizedList:
+ geneCount = len(geneList)
+ numOver = 0.
+ for gene in geneList:
+ finalVal = geneValues[gene][0]
+ currentVal = geneValues[gene][index]
+ if abs((currentVal - finalVal) / finalVal) > maxdev:
+ numOver += 1.
+
+ # NOTE(review): an empty category (geneCount == 0) raises
+ # ZeroDivisionError on the next line
+ fraction = 1. - numOver / geneCount
+ print "%s %s %d %.2f" % (percent, listNames[listindex], geneCount, fraction)
+ newscores[listNames[listindex]].append(fraction)
+ geneCounts[listNames[listindex]] = geneCount
+ listindex += 1
+
+ # NOTE(review): this assigns to a private matplotlib attribute; confirm it
+ # exists in the matplotlib version in use
+ matplotlib.axes._process_plot_var_args.defaultColors = ["k", "y", "m", "c", "b", "g", "r"]
+
+ # close every curve at (xreads, 1.0)
+ oldscores.append(xreads)
+ index = 0
+ plots = []
+ plotsColors = []
+ plotsLegend = []
+ for name in listNames:
+ newscores[name].append(1.0)
+ plots.append(plot(oldscores, newscores[name], listColors[index], linewidth=2))
+ # triangle markers on the interior points only
+ plot(oldscores[1:-1], newscores[name][1:-1], listColors[index] + "^")
+ plotsColors.append(plots[-1][0])
+ plotsLegend.append("%s n = %d" % (name, geneCounts[name]))
+ index += 1
+
+ legend(plotsColors, plotsLegend, loc=0)
+ xticks(oldscores)
+ locs, labels = xticks()
+ setp(labels, rotation="vertical")
+ ylim(0, 1.03)
+ xlim(-0.1, xreads + .1)
+ savefig(outfilename)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# plotprofile.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import optparse
+from pylab import *
+from math import *
+import matplotlib
+
+
+# "%prog" is only expanded by optparse; report the real script name instead
+print "%s: version 2.2" % sys.argv[0]
+
+def main(argv=None):
+    """Command-line entry point: parse the options and draw the profile plot.
+
+    argv is a full argument vector (program name first); sys.argv is used
+    when it is empty or None.  Exits with status 1 after printing the usage
+    when fewer than two positional arguments (infile, outfile.png) are given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # optparse only expands the %prog placeholder; a bare %s would be shown
+    # literally in the help text
+    usage = "usage: python %prog infile outfile.png [--scale] [--max weightMax] [--ymin bottom] [--ymax top] [--subtractEvens]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--scale", action="store_true", dest="doScale")
+    parser.add_option("--max", type="float", dest="weightMax")
+    parser.add_option("--ymin", type="float", dest="ymin")
+    parser.add_option("--ymax", type="float", dest="ymax")
+    parser.add_option("--subtractEvens", action="store_true", dest="subtractEvens")
+    parser.set_defaults(doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        # let optparse print the usage so that %prog is expanded
+        parser.print_usage()
+        sys.exit(1)
+
+    infile = args[0]
+    pngfilename = args[1]
+
+    plotprofile(infile, pngfilename, options.doScale, options.weightMax, options.ymin, options.ymax, options.subtractEvens)
+
+
+def plotprofile(inFileName, pngfilename, doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False):
+ """Plot the data rows of inFileName as lines and save the figure to pngfilename.
+
+ The first line supplies the x coordinates (its first and last tokens are
+ skipped).  Every later line is read as a label, the data points and a
+ trailing weight.  With subtractEvens, even-numbered data lines are
+ subtracted point-wise from the preceding series instead of being added
+ as a series of their own.  With doScale, each series is multiplied by its
+ weight divided by weightMax, where weightMax is raised to the largest
+ weight encountered.  When ymin or ymax is given the y range is fixed,
+ the missing bound defaulting to 0.
+ """
+ # NOTE(review): infile is never closed explicitly.
+ infile = open(inFileName)
+ limitYscale = False
+ if ymax is not None:
+ limitYscale = True
+ else:
+ ymax = 0.
+
+ if ymin is not None:
+ limitYscale = True
+ else:
+ ymin = 0.
+
+ matplotlib.use("Agg")
+
+ labelList = []
+ dataList = []
+ plotList = []
+ # sentinels that any real x coordinate will replace
+ xmin = 10**20
+ xmax = -10**20
+
+ xcoordList = []
+ datapointList = []
+ weightList = []
+ # header line: x coordinates between the leading label and the trailing column
+ line = infile.readline()
+ fields = line.strip().split()
+ for data in fields[1:-1]:
+ datapoint = float(data)
+ if datapoint < xmin:
+ xmin = datapoint
+
+ if datapoint > xmax:
+ xmax = datapoint
+
+ xcoordList.append(datapoint)
+
+ # index is the 1-based number of the data line, used to detect even lines
+ index = 1
+ for line in infile:
+ fields = line.strip().split()
+ datapointList = []
+ for data in fields[1:-1]:
+ datapointList.append(float(data))
+
+ if subtractEvens and index % 2 == 0:
+ for dataIndex in range(len(datapointList)):
+ dataList[-1][dataIndex] -= datapointList[dataIndex]
+ else:
+ dataList.append(datapointList)
+
+ # the last column of a data line is its weight
+ weight = float(fields[-1])
+ if subtractEvens and index % 2 == 0:
+ pass
+ else:
+ labelList.append(fields[0])
+ if weight > weightMax:
+ weightMax = weight
+
+ weightList.append(weight)
+
+ index += 1
+
+ for index in range(len(dataList)):
+ newList = []
+ if doScale:
+ # NOTE(review): raises ZeroDivisionError if weightMax ends up as 0.
+ scale = weightList[index] / weightMax
+ print weightList[index], weightMax, scale
+ for val in dataList[index]:
+ newList.append(val * scale)
+ else:
+ newList = dataList[index]
+
+ plotList.append(plot(xcoordList, newList, linewidth=3.0))
+
+ xticks(xcoordList, rotation="vertical")
+ xlim(xmin - 0.1, xmax + 0.1)
+ if limitYscale:
+ ylim(ymin, ymax)
+
+ legend(plotList, labelList)
+ savefig(pngfilename)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+import sys
+from cistematic.genomes import Genome
+
+
+def main(argv=None):
+    """Command-line entry point for the splice count prediction.
+
+    Expects argv to hold the program name followed by: genome, maxBorder,
+    uniquecountfile, splicecountfile and outfile.  Falls back to sys.argv
+    when argv is empty or None and exits with status 1 when arguments are
+    missing.
+    """
+    argv = argv or sys.argv
+    scriptName = argv[0]
+
+    print '%s: version 1.1' % scriptName
+
+    if len(argv) < 6:
+        print 'usage: python %s genome maxBorder uniquecountfile splicecountfile outfile' % scriptName
+        sys.exit(1)
+
+    (genome, maxBorder, uniquefilecount, splicefilecount, outfilename) = argv[1:6]
+
+    # number of nucleotides at the end of each exon that is affected by splicing
+    splicelead = int(maxBorder)
+
+    predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename)
+
+
+def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
+    """Write, per gene, the expected and the observed number of splice reads.
+
+    uniquefilecount and splicefilecount are whitespace-separated files whose
+    first column is a gene id and third column an integer read count; the
+    second column of the unique count file is the gene symbol.  For every
+    gene of the unique count file (sorted by id) the distinct (start, stop)
+    features returned by Genome.getGeneFeatures are summed up, and one line
+    "gid symbol pvalue expectedSpliceCount observedSpliceCount" is printed
+    and written tab-separated to outfilename.
+
+    splicelead is the number of nucleotides at the end of each exon that is
+    affected by splicing.
+    """
+    hg = Genome(genome)
+
+    gidDict = {}
+    gidList = []
+    uniqueCountDict = {}
+    spliceCountDict = {}
+
+    uniquefile = open(uniquefilecount)
+    try:
+        for line in uniquefile:
+            fields = line.strip().split()
+            gidDict[fields[0]] = fields[1]
+            gidList.append(fields[0])
+            uniqueCountDict[fields[0]] = int(fields[2])
+    finally:
+        uniquefile.close()
+
+    splicefile = open(splicefilecount)
+    try:
+        for line in splicefile:
+            fields = line.strip().split()
+            spliceCountDict[fields[0]] = int(fields[2])
+    finally:
+        splicefile.close()
+
+    outfile = open(outfilename, 'w')
+    try:
+        gidList.sort()
+        for gid in gidList:
+            symbol = gidDict[gid]
+            uniqueCount = uniqueCountDict[gid]
+            # a gene missing from the splice count file is reported with 0
+            # observed splice reads instead of aborting the whole run
+            observedSpliceCount = spliceCountDict.get(gid, 0)
+
+            featureList = hg.getGeneFeatures((genome, gid))
+            # count every distinct (start, stop) feature only once
+            seenFeatures = set()
+            featuresizesum = 0
+            for (ftype, chrom, start, stop, sense) in featureList:
+                if (start, stop) not in seenFeatures:
+                    seenFeatures.add((start, stop))
+                    featuresizesum += stop - start + 1
+
+            if featuresizesum < 1:
+                featuresizesum = 1
+
+            splicearea = (len(seenFeatures) - 1) * splicelead
+            if splicearea < splicelead:
+                splicearea = 0
+
+            fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+            expectedSpliceCount = int(round(uniqueCount / fractionCoverage)) - uniqueCount
+
+            # this p-value is based on the observed unique count, not the expected total count
+            # nor the multi-read adjusted count
+            pvalue = 1 - pow(1 - float(splicelead) / featuresizesum, uniqueCount)
+            print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, observedSpliceCount)
+            outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, observedSpliceCount))
+    finally:
+        outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# profilebins.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+# "%prog" is only expanded by optparse; report the real script name instead
+print "%s: version 2.2" % sys.argv[0]
+
+
+def main(argv=None):
+    """Command-line entry point: parse the options and run profilebins.
+
+    argv is a full argument vector (program name first); sys.argv is used
+    when it is empty or None.  Exits with status 1 after printing the usage
+    when fewer than three positional arguments (label, infile, outfile) are
+    given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog label infile1 [--upstream infile2] [--downstream infile3] [--uplength kb] [--downlength kb] [--gene geneName] [--genes genefile] [--append] outfile"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--upstream", dest="upfilename")
+    parser.add_option("--downstream", dest="downfilename")
+    parser.add_option("--uplength", type="float", dest="uplength")
+    # stored under "downlength" (an empty dest made the option a no-op) and
+    # parsed as a float like --uplength, matching the 0.0 default
+    parser.add_option("--downlength", type="float", dest="downlength")
+    parser.add_option("--gene", dest="gene")
+    parser.add_option("--genes", dest="genefile")
+    parser.add_option("--append", action="store_true", dest="doAppend")
+    parser.set_defaults(upfilename=None, downfilename=None, uplength=0.0, downlength=0.0,
+                        gene=None, genefile=None, doAppend=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    label = args[0]
+    infilename = args[1]
+    outfilename = args[2]
+
+    profilebins(label, infilename, outfilename, options.upfilename, options.downfilename,
+                options.uplength, options.downlength, options.gene, options.genefile,
+                options.doAppend)
+
+
+def _readGeneNames(genefile):
+    """Return the gene names (first column) listed in genefile.
+
+    genefile is a file name, or any iterable of lines.
+    """
+    if isinstance(genefile, str):
+        handle = open(genefile)
+        try:
+            lines = handle.readlines()
+        finally:
+            handle.close()
+    else:
+        lines = genefile
+
+    names = []
+    for line in lines:
+        fields = line.strip().split()
+        if len(fields) > 1:
+            names.append(fields[0])
+        else:
+            names.append(line.strip())
+
+    return names
+
+
+def profilebins(label, infilename, outfilename, upfilename=None, downfilename=None,
+                uplength=0.0, downlength=0.0, gene=None, genefile=None, doAppend=False):
+    """Sum weighted per-gene bins into one profile line and write it to outfilename.
+
+    Every input file holds whitespace-separated records: an id, the gene
+    name, the gene weight, one more column, then the bin values.  For each
+    file the bins of all selected genes are multiplied by the gene weight
+    and added up; the sums are written divided by the total weight of all
+    selected records, followed by that total weight.
+
+    upfilename / downfilename add an upstream / downstream part around
+    infilename, and uplength / downlength give the x-axis extent of those
+    parts (the main part spans 0 to 10).  gene and genefile restrict the
+    genes that are taken into account.  Unless doAppend is set the file is
+    rewritten and starts with an "x-axis" header line.
+
+    The sum of all bins and the sum of the written fractions are printed.
+    """
+    geneList = []
+    restrictGenes = False
+    if gene is not None:
+        geneList.append(gene)
+        restrictGenes = True
+
+    if genefile is not None:
+        # genefile is a file name: read the names from the file rather than
+        # iterating over the characters of the name
+        geneList.extend(_readGeneNames(genefile))
+        restrictGenes = True
+
+    geneSet = set(geneList)
+
+    fileList = [infilename]
+    if upfilename is not None:
+        fileList = [upfilename, infilename]
+
+    if downfilename is not None:
+        fileList.append(downfilename)
+
+    partLength = [10.]
+    partOffset = [0.]
+
+    if uplength:
+        partLength = [uplength, 10.]
+        partOffset = [-1. * uplength, 0.]
+
+    if downlength:
+        partLength.append(downlength)
+        partOffset.append(10.)
+
+    totalWeight = 0.
+    totalBins = []
+    for afile in fileList:
+        partBins = None
+        infile = open(afile)
+        try:
+            for line in infile:
+                fields = line.strip().split()
+                if not fields:
+                    continue
+
+                if partBins is None:
+                    # the first record fixes the number of bins of this part
+                    partBins = [0.] * len(fields[4:])
+
+                geneName = fields[1]
+                if restrictGenes and geneName not in geneSet:
+                    continue
+
+                # every selected record, the first one included, contributes
+                # both to the bins and to the total weight
+                weight = float(fields[2])
+                for (index, myBin) in enumerate(fields[4:]):
+                    partBins[index] += weight * float(myBin)
+
+                totalWeight += weight
+        finally:
+            infile.close()
+
+        if partBins is None:
+            partBins = []
+
+        totalBins.append(partBins)
+
+    sumWeight = 0.
+    totalPercent = 0.
+    if doAppend:
+        outfile = open(outfilename, "a")
+    else:
+        outfile = open(outfilename, "w")
+
+    try:
+        if not doAppend:
+            # a fresh file starts with the x coordinate of every bin
+            outfile.write("x-axis")
+            for (partIndex, partBins) in enumerate(totalBins):
+                partLen = partLength[partIndex]
+                numBins = len(partBins)
+                for binIndex in range(numBins):
+                    outfile.write("\t%.2f" % (partOffset[partIndex] + (binIndex * partLen/numBins)))
+
+            outfile.write("\tweight\n")
+
+        outfile.write(label)
+        for partBins in totalBins:
+            for aBin in partBins:
+                if totalWeight:
+                    percent = aBin / totalWeight
+                else:
+                    # no record was selected: report 0 instead of dividing by zero
+                    percent = 0.
+
+                outfile.write("\t%.1f" % percent)
+                sumWeight += aBin
+                totalPercent += percent
+
+        outfile.write("\t%.1f\n" % totalWeight)
+    finally:
+        outfile.close()
+
+    print(sumWeight)
+    print(totalPercent)
+ print totalPercent
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+import string
+import optparse
+import math
+
+# "%prog" is only expanded by optparse; report the real script name instead
+print "%s: version 2.3" % sys.argv[0]
+
+
+def main(argv=None):
+    """Command-line entry point: parse the options and run ratio.
+
+    argv is a full argument vector (program name first); sys.argv is used
+    when it is empty or None.  The positional arguments are the denominator
+    field and the input file, where "-" selects standard input.  Exits with
+    status 1 after printing the usage when arguments are missing.
+    """
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog denominatorField infile [--only fieldID] [--out outfile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--only", type="int", dest="onlyField")
+    parser.add_option("--out", dest="outFileName")
+    parser.set_defaults(outFileName=None, onlyField=-1)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    field = int(args[0])
+    if args[1] == "-":
+        # ratio() reads standard input when it is given no file name
+        inFileName = None
+    else:
+        inFileName = args[1]
+
+    ratio(field, inFileName, options.outFileName, options.onlyField)
+
+def _ratioLine(fields, field, count, doOnly, onlyField):
+    """Build one output line from fields, or return None if a column is missing."""
+    parts = [str(fields[0])]
+    for index in range(1, count):
+        if field == index:
+            # the denominator compared with itself
+            parts.append("0")
+        elif doOnly and index != onlyField:
+            if index >= len(fields):
+                return None
+
+            parts.append(str(fields[index]))
+        else:
+            try:
+                parts.append("%2.2f" % math.log((float(fields[index]) + 1)/(float(fields[field]) + 1), 2))
+            except (ValueError, ZeroDivisionError, OverflowError, IndexError):
+                if index >= len(fields):
+                    return None
+
+                # value that cannot be turned into a ratio: pass it on marked with "e"
+                parts.append("e%s" % fields[index])
+
+    return " ".join(parts)
+
+
+def ratio(field, inFileName, outFileName=None, onlyField=-1):
+    """Replace the columns of a table by their log2 ratio to a denominator column.
+
+    The first line of the input is a header: it fixes the number of columns
+    and is copied to outFileName (it is not printed when no output file is
+    given).  On every other line the first column is kept, column `field`
+    becomes "0", and the other columns become
+    log2((value + 1) / (denominator + 1)) with two decimals.  When onlyField
+    is not -1, only that column is converted and the others are copied
+    unchanged.  A value that cannot be converted is written as "e" followed
+    by the original text; lines with missing columns and blank lines are
+    skipped.
+
+    inFileName is a file name, an open file object, or None for standard
+    input.  Lines are written to outFileName when given and printed
+    otherwise.
+    """
+    closeInput = False
+    if inFileName is None:
+        infile = sys.stdin
+    elif hasattr(inFileName, "readline"):
+        # already an open file object, e.g. sys.stdin
+        infile = inFileName
+    else:
+        infile = open(inFileName)
+        closeInput = True
+
+    outfile = None
+    try:
+        if outFileName is not None:
+            outfile = open(outFileName, "w")
+
+        doOnly = onlyField != -1
+
+        line = infile.readline()
+        count = len(line.strip().split())
+        if outfile is not None:
+            outfile.write(line)
+
+        for line in infile:
+            fields = line.strip().split()
+            if not fields:
+                continue
+
+            outline = _ratioLine(fields, field, count, doOnly, onlyField)
+            if outline is None:
+                continue
+
+            if outfile is not None:
+                outfile.write(outline + "\n")
+            else:
+                print(outline)
+    finally:
+        if outfile is not None:
+            outfile.close()
+
+        if closeInput:
+            infile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# rdsmetadata.py
+# ENRAGE
+#
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import optparse
+from commoncode import readDataset
+
+# "%prog" is only expanded by optparse; report the real script name instead
+print "%s: version 2.7" % sys.argv[0]
+
+
+def main(argv=None):
+    """Command-line entry point: parse the options and run rdsmetadata.
+
+    argv is a full argument vector (program name first); sys.argv is used
+    when it is empty or None.  The first positional argument is the RDS
+    file; every positional argument containing "::" is read as a
+    name::value metadata pair.  Exits with status 1 after printing the usage
+    when no positional argument is given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog rdsfile [propertyName1::propertyValue1] ... [propertyNameN::propertyValueN] [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--defaultcache", type="int", dest="cacheVal")
+    parser.add_option("--index", action="store_true", dest="buildIndex")
+    parser.add_option("--dropindex", action="store_true", dest="dropIndex")
+    parser.add_option("--nocount", action="store_false", dest="doCount")
+    parser.add_option("--complexity", action="store_true", dest="doComplexity")
+    parser.add_option("--reset", action="store_true", dest="resetFlags")
+    parser.add_option("--initrna", action="store_true", dest="rnaDataType")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(cacheVal=0, buildIndex=False, dropIndex=False, doCount=True,
+                        doComplexity=False, resetFlags=False, rnaDataType=False,
+                        cachePages=-1)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 1:
+        print usage
+        print "where the optional metadata name::value pairs are added to the existing dataset"
+        sys.exit(1)
+
+    datafile = args[0]
+
+    propertyList = []
+    for arg in args:
+        if "::" in arg:
+            # split on the first separator only, so that a value may itself
+            # contain "::" without breaking the unpacking
+            (pname, pvalue) = arg.strip().split("::", 1)
+            print "adding %s : %s" % (pname, pvalue)
+            propertyList.append((pname, pvalue))
+
+    rdsmetadata(datafile, propertyList, options.cacheVal, options.buildIndex,
+                options.dropIndex, options.doCount, options.doComplexity,
+                options.resetFlags, options.rnaDataType, options.cachePages)
+
+
+def rdsmetadata(datafile, propertyList=[], cacheVal=0, buildIndex=False,
+                dropIndex=False, doCount=True, doComplexity=False, resetFlags=False,
+                rnaDataType=False, cachePages=-1):
+    """Open the RDS dataset datafile and apply the requested maintenance steps.
+
+    In order: raise the cache size to cachePages, store cacheVal as default
+    cache size, reset the read flags, drop the index, build the index,
+    report the ratio of distinct to unique reads, and insert the
+    (name, value) pairs of propertyList as metadata.  With rnaDataType the
+    dataset is initialized as an "RNA" dataset when it is opened.
+    """
+    useCache = (cachePages != -1)
+
+    if not rnaDataType:
+        rds = readDataset(datafile, verbose=True, reportCount=doCount, cache=useCache)
+    else:
+        rds = readDataset(datafile, initialize=True, datasetType="RNA", verbose=True, cache=useCache)
+
+    if cachePages > rds.getDefaultCacheSize():
+        rds.setDBcache(cachePages)
+
+    hasDefaultCache = cacheVal > 0
+    if hasDefaultCache:
+        rds.setDBcache(cacheVal, default=True)
+        print "set default cache size to %d pages" % cacheVal
+
+    if resetFlags:
+        print "clearing read flags"
+        rds.resetFlags()
+
+    if dropIndex:
+        # best effort: a failure to drop the index is only reported
+        try:
+            rds.dropIndex()
+        except:
+            print "could not drop index"
+
+    if buildIndex:
+        print "building index...."
+        indexArgs = ()
+        if hasDefaultCache:
+            indexArgs = (cacheVal,)
+
+        rds.buildIndex(*indexArgs)
+
+    if doComplexity:
+        print "calculating uniq read complexity..."
+        uniqs = rds.getUniqsCount(distinct=False)
+        distincts = rds.getUniqsCount(distinct=True)
+        complexity = float(distincts) / uniqs
+        print "%d distincts / %d uniqs = %.2f" % (distincts, uniqs, complexity)
+
+    if len(propertyList) != 0:
+        rds.insertMetadata(propertyList)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# recordLog.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 12/14/08.
+#
+
+import sys
+from commoncode import writeLog
+
+
+def main(argv=None):
+    """Command-line entry point: append one message to a log file.
+
+    Expects argv to hold the program name followed by logFile, messenger and
+    message; sys.argv is used when argv is empty or None.  The version is
+    printed when the verbose flag is present or arguments are missing; in
+    the latter case the usage is printed and the script exits with status 1.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # the usage documents --verbose; -verbose is still accepted
+    verbose = "--verbose" in argv or "-verbose" in argv
+    if verbose or len(argv) < 4:
+        print "%s: version 1.0" % argv[0]
+
+    if len(argv) < 4:
+        print "usage: python %s logFile messenger message [--verbose]" % argv[0]
+        sys.exit(1)
+
+    logFile = argv[1]
+    messenger = argv[2]
+    message = argv[3]
+
+    writeLog(logFile, messenger, message)
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# regionBins.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+import sys
+print '%s: version 2.0' % sys.argv[0]
+
+if len(sys.argv) < 4:
+    print 'usage: python %s regionfile rdsfile outfilename [-bins numbins] [-field fieldNum] [-raw] [-padregion bp] [-mergeregion bp] [-cache]' % sys.argv[0]
+    sys.exit(1)
+
+from commoncode import *
+
+regionfilename = sys.argv[1]
+hitfile = sys.argv[2]
+outfilename = sys.argv[3]
+
+# -raw switches off both the per-million scaling of the read counts and the
+# conversion of the bins to percentages of the region total
+if '-raw' in sys.argv:
+    normalize = False
+    normalizeBins = False
+else:
+    normalize = True
+    normalizeBins = True
+
+doCache = False
+if '-cache' in sys.argv:
+    doCache = True
+
+# every valued option takes the argument that follows the flag
+cField = 1
+if '-field' in sys.argv:
+    fieldIndex = sys.argv.index('-field') + 1
+    cField = int(sys.argv[fieldIndex])
+
+padregion = 0
+if '-padregion' in sys.argv:
+    padField = sys.argv.index('-padregion') + 1
+    padregion = int(sys.argv[padField])
+    print 'padding %d bp on each side of a region' % padregion
+
+mergeregion = 0
+if '-mergeregion' in sys.argv:
+    mergeField = sys.argv.index('-mergeregion') + 1
+    mergeregion = int(sys.argv[mergeField])
+    print 'merging regions closer than %d bp' % mergeregion
+
+bins = 10
+if '-bins' in sys.argv:
+    binfield = sys.argv.index('-bins') + 1
+    bins = int(sys.argv[binfield])
+
+hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+readlen = hitRDS.getReadSize()
+normalizationFactor = 1.0
+if normalize:
+    totalCount = len(hitRDS)
+    normalizationFactor = totalCount / 1000000.
+
+chromList = hitRDS.getChromosomes(fullChrom=False)
+chromList.sort()
+
+regionDict = getMergedRegions(regionfilename, maxDist = mergeregion, keepLabel = True, verbose = True, chromField = cField, pad=padregion)
+
+hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+# the merged regions are the ones to be binned
+(regionsBins, regionsLen) = computeRegionBins(regionDict, hitDict, bins, readlen, normalizationFactor)
+
+outfile = open(outfilename, 'w')
+for regionID in regionsBins:
+    tagCount = 0.
+    for binAmount in regionsBins[regionID]:
+        tagCount += binAmount
+
+    outfile.write('%s\t%s\t%.1f\t%d' % (regionID, regionID, tagCount, regionsLen[regionID]))
+    for binAmount in regionsBins[regionID]:
+        if normalizeBins:
+            # an empty region is written as all-zero percentages
+            if tagCount == 0:
+                tagCount = 1
+
+            outfile.write('\t%.1f' % (100. * binAmount / tagCount))
+        else:
+            outfile.write('\t%.1f' % binAmount)
+
+    outfile.write('\n')
+
+outfile.close()
\ No newline at end of file
--- /dev/null
+#
+# regionCounts.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ print 'psyco not running'
+
+import sys, string, optparse
+from commoncode import readDataset, getMergedRegions, findPeak, writeLog
+
+# "%prog" is only expanded by optparse; the real script name is used so that
+# both the banner and the log entries name this program
+versionString = "%s: version 3.9" % sys.argv[0]
+print versionString
+
+def main(argv=None):
+    """Command-line entry point: parse the options and run regionCounts.
+
+    argv is a full argument vector (program name first); sys.argv is used
+    when it is empty or None.  Exits with status 1 after printing the usage
+    when fewer than three positional arguments (regionfile, rdsfile,
+    outfilename) are given.
+    """
+    argv = argv or sys.argv
+
+    usage = "usage: python %prog regionfile rdsfile outfilename [options]"
+
+    # one entry per option, in the order in which they are registered
+    optionTable = [
+        ("--markRDS", {"action": "store_true", "dest": "flagRDS"}),
+        ("--chromField", {"type": "int", "dest": "cField"}),
+        ("--fullchrom", {"action": "store_true", "dest": "useFullchrom"}),
+        ("--raw", {"action": "store_false", "dest": "normalize"}),
+        ("--padregion", {"type": "int", "dest": "padregion"}),
+        ("--mergeregion", {"type": "int", "dest": "mergeregion"}),
+        ("--nomerge", {"action": "store_false", "dest": "merging"}),
+        ("--noUniqs", {"action": "store_false", "dest": "doUniqs"}),
+        ("--noMulti", {"action": "store_false", "dest": "doMulti"}),
+        ("--splices", {"action": "store_true", "dest": "doSplices"}),
+        ("--peak", {"action": "store_true", "dest": "usePeak"}),
+        ("--cache", {"type": "int", "dest": "cachePages"}),
+        ("--log", {"dest": "logfilename"}),
+        ("--rpkm", {"action": "store_true", "dest": "doRPKM"}),
+        ("--length", {"action": "store_true", "dest": "doLength"}),
+        ("--force", {"action": "store_true", "dest": "forceRegion"}),
+    ]
+
+    parser = optparse.OptionParser(usage=usage)
+    for (flag, settings) in optionTable:
+        parser.add_option(flag, **settings)
+
+    parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True,
+                        padregion=0, mergeregion=0, merging=True, doUniqs=True,
+                        doMulti=True, doSplices=False, usePeak=False, cachePages=-1,
+                        logfilename="regionCounts.log", doRPKM=False, doLength=False,
+                        forceRegion=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    (regionfilename, hitfile, outfilename) = args[:3]
+
+    regionCounts(regionfilename, hitfile, outfilename,
+                 flagRDS=options.flagRDS, cField=options.cField,
+                 useFullchrom=options.useFullchrom, normalize=options.normalize,
+                 padregion=options.padregion, mergeregion=options.mergeregion,
+                 merging=options.merging, doUniqs=options.doUniqs,
+                 doMulti=options.doMulti, doSplices=options.doSplices,
+                 usePeak=options.usePeak, cachePages=options.cachePages,
+                 logfilename=options.logfilename, doRPKM=options.doRPKM,
+                 doLength=options.doLength, forceRegion=options.forceRegion)
+
+
+def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
+                 useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
+                 merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
+                 cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
+                 forceRegion=False):
+    """Count the reads of the RDS dataset hitfile that fall into each region.
+
+    The regions are read with getMergedRegions (padded by padregion, merged
+    when closer than mergeregion unless merging is off).  For every region
+    either the read count (restricted by doUniqs / doMulti / doSplices) or,
+    with usePeak, the height of the highest smoothed peak is collected.
+
+    With normalize the values are divided by the dataset size in millions
+    of reads; doRPKM implies normalize and additionally divides by the
+    region length in kb.  One line "label chrom start stop value" is written
+    per region to outfilename, followed by the length used when doLength is
+    set on normalized output.  With flagRDS the counted reads are flagged in
+    the dataset; with forceRegion regions on chromosomes without reads are
+    reported as well.  The run is recorded in logfilename.
+    """
+    print "padding %d bp on each side of a region" % padregion
+    print "merging regions closer than %d bp" % mergeregion
+    if usePeak:
+        print "will use peak values"
+
+    doCache = cachePages != -1
+
+    # the caller's normalize / doRPKM settings are honoured; RPKM values are
+    # always normalized
+    if doRPKM:
+        normalize = True
+
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+
+    regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, minHits=-1, keepLabel=True,
+                                  fullChrom=useFullchrom, verbose=True, chromField=cField,
+                                  doMerge=merging, pad=padregion)
+
+    labelList = []
+    labeltoRegionDict = {}
+    regionCount = {}
+
+    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    readlen = hitRDS.getReadSize()
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    totalCount = len(hitRDS)
+    if normalize:
+        normalizationFactor = totalCount / 1000000.
+
+    chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
+    if len(chromList) == 0 and doSplices:
+        chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
+
+    chromList.sort()
+
+    if flagRDS:
+        hitRDS.setSynchronousPragma("OFF")
+
+    # regions on chromosomes without any read are only reported when forced
+    for rchrom in regionDict:
+        if forceRegion and rchrom not in chromList:
+            print rchrom
+            for (label, start, stop, length) in regionDict[rchrom]:
+                regionCount[label] = 0
+                labelList.append(label)
+                labeltoRegionDict[label] = (rchrom, start, stop)
+
+    for rchrom in chromList:
+        regionList = []
+        if rchrom not in regionDict:
+            continue
+
+        print rchrom
+        if useFullchrom:
+            fullchrom = rchrom
+        else:
+            fullchrom = "chr%s" % rchrom
+
+        if usePeak:
+            readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
+            rindex = 0
+            dictLen = len(readDict[fullchrom])
+
+        for (label, start, stop, length) in regionDict[rchrom]:
+            regionCount[label] = 0
+            labelList.append(label)
+            labeltoRegionDict[label] = (rchrom, start, stop)
+
+        for (label, rstart, rstop, length) in regionDict[rchrom]:
+            regionList.append((label, fullchrom, rstart, rstop))
+            if usePeak:
+                readList = []
+                # rindex only moves forward: reads left of this region are
+                # skipped for all following regions as well
+                for localIndex in xrange(rindex, dictLen):
+                    read = readDict[fullchrom][localIndex]
+                    if read[0] < rstart:
+                        rindex += 1
+                    elif rstart <= read[0] <= rstop:
+                        readList.append(read)
+                    else:
+                        break
+
+                if len(readList) < 1:
+                    continue
+
+                readList.sort()
+                (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True)
+                try:
+                    topValue = smoothArray[topPos[0]]
+                except Exception:
+                    print "problem with %s %s" % (str(topPos), str(smoothArray))
+                    continue
+
+                regionCount[label] += topValue
+            else:
+                regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+        if flagRDS:
+            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+    if flagRDS:
+        hitRDS.setSynchronousPragma("ON")
+
+    if normalize:
+        for label in regionCount:
+            regionCount[label] = float(regionCount[label]) / normalizationFactor
+
+    outfile = open(outfilename, "w")
+
+    if forceRegion:
+        labelList.sort()
+
+    for label in labelList:
+        (chrom, start, stop) = labeltoRegionDict[label]
+        if useFullchrom:
+            fullchrom = chrom
+        else:
+            fullchrom = "chr%s" % chrom
+
+        if normalize:
+            if doRPKM:
+                length = abs(stop - start) / 1000.
+            else:
+                length = 1.
+
+            # guard against dividing by a zero-length region
+            if length < 0.001:
+                length = 0.001
+
+            outfile.write("%s\t%s\t%d\t%d\t%.2f" % (label, fullchrom, start, stop, regionCount[label]/length))
+            if doLength:
+                outfile.write("\t%.1f" % length)
+        else:
+            outfile.write('%s\t%s\t%d\t%d\t%d' % (label, fullchrom, start, stop, regionCount[label]))
+
+        outfile.write("\n")
+
+    outfile.close()
+    if doCache and flagRDS:
+        hitRDS.saveCacheDB(hitfile)
+
+    writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# regionintersects.py
+# ENRAGE
+#
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, findPeak
+
+# "%prog" is only expanded by optparse; report the real script name instead
+print "%s: version 3.0" % sys.argv[0]
+
+def main(argv=None):
+    """Command-line entry point: parse the options and run regionintersects.
+
+    argv is a full argument vector (program name first); sys.argv is used
+    when it is empty or None.  Exits with status 1 after printing the usage
+    when fewer than five positional arguments are given.
+    """
+    argv = argv or sys.argv
+
+    usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
+
+    # one entry per option, in the order in which they are registered
+    optionTable = [
+        ("--reject1", {"dest": "rejectOneName"}),
+        ("--reject2", {"dest": "rejectTwoName"}),
+        ("--union", {"action": "store_true", "dest": "trackReject"}),
+        ("--cache", {"action": "store_true", "dest": "doCache"}),
+        ("--raw", {"action": "store_false", "dest": "normalize"}),
+        ("--verbose", {"action": "store_true", "dest": "doVerbose"}),
+    ]
+
+    parser = optparse.OptionParser(usage=usage)
+    for (flag, settings) in optionTable:
+        parser.add_option(flag, **settings)
+
+    parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False,
+                        doCache=False, normalize=True, doVerbose=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 5:
+        print usage
+        sys.exit(1)
+
+    (readOneName, regionOneName, readTwoName, regionTwoName, outfilename) = args[:5]
+
+    regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
+                     outfilename, rejectOneName=options.rejectOneName,
+                     rejectTwoName=options.rejectTwoName,
+                     trackReject=options.trackReject, doCache=options.doCache,
+                     normalize=options.normalize, doVerbose=options.doVerbose)
+
+
+def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
+ outfilename, rejectOneName=None, rejectTwoName=None,
+ trackReject=False, doCache=False, normalize=True, doVerbose=False):
+ """Report the regions of two region files whose read peaks lie close together.
+
+ For every chromosome present in both region files, the reads of each
+ dataset that start inside a region are passed to findPeak.  A region of
+ the second file is "common" with a region of the first one when the
+ distance between their peaks is smaller than the sum of their half
+ lengths; each such pair is written to outfilename as a "common" line.
+ Hit counts and peak scores are divided by the dataset size in millions
+ of reads when normalize is set.
+
+ Regions without a partner are only tracked with trackReject (implied by
+ rejectOneName / rejectTwoName).  They are written as "rejectOne" /
+ "rejectTwo" lines, to the reject files when one is given and to
+ outfilename otherwise.
+ """
+
+ mergedist=0
+
+ outfile = open(outfilename, "w")
+
+ doReject = False
+ if rejectOneName is not None:
+ trackReject = True
+ doReject = True
+ rejectOne = open(rejectOneName, "w")
+
+ if rejectTwoName is not None:
+ trackReject = True
+ doReject = True
+ rejectTwo = open(rejectTwoName, "w")
+
+ # NOTE(review): doReject is set when either reject file is given, but
+ # rejectOne / rejectTwo exist only for the file that was named; naming
+ # just one of them looks like it can raise a NameError below - confirm.
+ oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
+ twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
+
+ oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache)
+ twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache)
+
+ # normalization factors: dataset sizes in millions of reads
+ if normalize:
+ normalize1 = len(oneRDS) / 1000000.
+ normalize2 = len(twoRDS) / 1000000.
+ else:
+ normalize1 = 1.
+ normalize2 = 1.
+
+ commonRegions = 0
+ oneRejectIndex = 0
+ twoRejectIndex = 0
+
+ # per chromosome: peaks found in dataset one, and those matched by dataset two
+ onePeaksDict = {}
+ oneFoundDict = {}
+
+ numRegionsOne = 0
+ numRegionsTwo = 0
+ for rchrom in oneDict:
+ numRegionsOne += len(oneDict[rchrom])
+
+ for rchrom in twoDict:
+ numRegionsTwo += len(twoDict[rchrom])
+
+ outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
+
+ for rchrom in oneDict:
+ if rchrom not in twoDict:
+ continue
+
+ print rchrom
+ # rindex / rindex2 only move forward: reads left of a region are
+ # skipped for all following regions as well
+ rindex = 0
+ rindex2 = 0
+ fullchrom = "chr" + rchrom
+ oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
+ dictLen1 = len(oneReads[fullchrom])
+ twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
+ dictLen2 = len(twoReads[fullchrom])
+ chrom = rchrom
+ onePeaksDict[chrom] = []
+ oneFoundDict[chrom] = []
+ # first pass: locate the peak of every region of dataset one
+ for (start, stop, length) in oneDict[chrom]:
+ readList = []
+ for localIndex in xrange(rindex, dictLen1):
+ read = oneReads[fullchrom][localIndex]
+ if read[0] < start:
+ rindex += 1
+ elif start <= read[0] <= stop:
+ readList.append(read)
+ else:
+ break
+
+ if len(readList) < 1:
+ continue
+
+ readList.sort()
+
+ (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True)
+ onePeakScore = smoothArray[topPos[0]]
+ onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1))
+
+ # second pass: locate the peak of every region of dataset two and
+ # compare it with the peaks of dataset one
+ for (start, stop, length) in twoDict[chrom]:
+ readList2 = []
+ for localIndex in xrange(rindex2, dictLen2):
+ read = twoReads[fullchrom][localIndex]
+ if read[0] < start:
+ rindex2 += 1
+ elif start <= read[0] <= stop:
+ readList2.append(read)
+ else:
+ break
+
+ if len(readList2) < 1:
+ continue
+
+ readList2.sort()
+ (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True)
+ numHits /= normalize2
+ twoIsCommon = False
+ twoPeak = topPos[0] + start
+ twoRadius = length/2
+ twoPeakScore = smoothArray[topPos[0]] / normalize2
+ for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
+ if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
+ # NOTE(review): this tests membership in the dict keyed by
+ # chromosome rather than in oneFoundDict[chrom], so it looks
+ # always true and the tuple may be appended repeatedly - confirm.
+ if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict:
+ oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits))
+
+ twoIsCommon = True
+ commonRegions += 1
+ outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore)
+ if doVerbose:
+ print outline
+
+ outfile.write(outline + "\n")
+
+ if trackReject and not twoIsCommon:
+ twoRejectIndex += 1
+ outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore)
+ if doReject:
+ rejectTwo.write(outline + "\n")
+ else:
+ outfile.write(outline + "\n")
+
+ if doVerbose:
+ print outline
+
+ # regions of dataset one that no region of dataset two matched
+ if trackReject:
+ for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
+ if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
+ oneRejectIndex += 1
+ outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore)
+ if doReject:
+ rejectOne.write(outline + "\n")
+ else:
+ outfile.write(outline + "\n")
+
+ if doVerbose:
+ print outline
+
+ if trackReject:
+ print "common: %d one-only: %d two-only: %d" % (commonRegions, oneRejectIndex, twoRejectIndex)
+ outfile.write("#common: %d\tone-only: %d\ttwo-only: %d\n" % (commonRegions, oneRejectIndex, twoRejectIndex))
+ else:
+ print "common: %d" % commonRegions
+ outfile.write("#common: %d\n" % commonRegions)
+
+ # NOTE(review): the reject files are never closed explicitly.
+ outfile.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+"""
+ usage: python regiontobed label regionfile outbedfile [--color r,g,b] [--score field] [--narrowPeak] [--broadPeak] [--itemRgb] [--nolabel]
+ where color is in comma-delimited RGB without space
+ and field is a column with a score (first column is 0, second is 1,...)
+ t-narrowPeak assumes that findall.py was run with -listPeak
+ t-broadPeak assumes that findall.py was *NOT* run with -listPeak
+"""
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, math, optparse
+
+# "%prog" is only expanded by optparse; report the real script name instead
+print "%s: version 3.1" % sys.argv[0]
+
+
+def usage():
+    """Print the module docstring, which serves as the usage text."""
+    helpText = __doc__
+    print helpText
+
+
+def main(argv=None):
+    """Command-line entry point: parse the options and run regiontobed.
+
+    argv is a full argument vector (program name first); sys.argv is used
+    when it is empty or None.  Exits with status 2 after printing the usage
+    when fewer than three positional arguments (label, regionfile,
+    outbedfile) are given.
+    """
+    if not argv:
+        argv = sys.argv
+
+    # kept under its own name: a local called "usage" would shadow the
+    # module-level usage() function that is called below
+    usageText = __doc__
+
+    parser = optparse.OptionParser(usage=usageText)
+    parser.add_option("--color", dest="color")
+    parser.add_option("--score", type="int", dest="scoreField")
+    parser.add_option("--narrowPeak", action="store_true", dest="doNarrow")
+    parser.add_option("--broadPeak", action="store_true", dest="doBroad")
+    parser.add_option("--itemRgb", action="store_true", dest="itemRGB")
+    parser.add_option("--nolabel", action="store_true", dest="noLabel")
+    parser.set_defaults(color="0,0,0", scoreField=None, doNarrow=False,
+                        doBroad=False, itemRGB=False, noLabel=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        usage()
+        sys.exit(2)
+
+    factorlabel = args[0]
+    regionfile = args[1]
+    outfile = args[2]
+
+    regiontobed(factorlabel, regionfile, outfile, options.color,
+                options.scoreField, options.doNarrow, options.doBroad,
+                options.itemRGB, options.noLabel)
+
+
+def _minusLog10(pval):
+    """Return -log10(pval), capped at 350 for a p-value of exactly 0."""
+    if pval == 0.:
+        return 350
+
+    return -1. * math.log(pval, 10)
+
+
+def regiontobed(factorlabel, regionFileName, outFileName, color="0,0,0",
+                scoreField=None, doNarrow=False, doBroad=False, itemRGB=False,
+                noLabel=False):
+    """Convert a region file into a BED-style track file.
+
+    Every line of regionFileName that does not start with "#" is read as
+    whitespace-separated fields: label, chromosome, start, stop, ...  The
+    output line holds chromosome, start, stop and label, followed by:
+
+    - doNarrow: score 0, ".", the signal (field 4), -log10 of the p-value
+      (last field), -1 and the peak offset (field 9 minus the start);
+    - doBroad: the same without the peak offset;
+    - scoreField: the integer value of that field, capped at 1000;
+    - otherwise nothing more.
+
+    With itemRGB the score and plain formats also get strand, thick-start
+    and thick-end placeholders and the color.  Unless noLabel is set the
+    file starts with a "track" line naming factorlabel.
+    """
+    if itemRGB:
+        print("assigning each item its color")
+
+    regionfile = open(regionFileName)
+    try:
+        outfile = open(outFileName, "w")
+        try:
+            # the track line is what --nolabel is meant to suppress
+            if not noLabel:
+                if itemRGB:
+                    outfile.write('track name=%s visibility=4 itemRgb="on"\n' % factorlabel)
+                else:
+                    outfile.write("track name=%s visibility=4 color=%s\n" % (factorlabel, color))
+
+            for line in regionfile:
+                if line[0] == "#":
+                    continue
+
+                fields = line.strip().split()
+                if not fields:
+                    # skip blank lines
+                    continue
+
+                if doNarrow:
+                    signalVal = float(fields[4])
+                    pValue = _minusLog10(float(fields[-1]))
+                    peakPos = int(fields[9]) - int(fields[2])
+                    outfile.write("%s\t%s\t%s\t%s\t%d\t.\t%.4f\t%.4f\t-1\t%d" % (fields[1], fields[2], fields[3], fields[0], 0, signalVal, pValue, peakPos))
+                elif doBroad:
+                    signalVal = float(fields[4])
+                    pValue = _minusLog10(float(fields[-1]))
+                    outfile.write("%s\t%s\t%s\t%s\t%d\t.\t%.4f\t%.4f\t-1" % (fields[1], fields[2], fields[3], fields[0], 0, signalVal, pValue))
+                elif scoreField is not None:
+                    score = int(float(fields[scoreField]))
+                    if score > 1000:
+                        score = 1000
+
+                    outfile.write("%s\t%s\t%s\t%s\t%s" % (fields[1], fields[2], fields[3], fields[0], score))
+                    if itemRGB:
+                        outfile.write("\t+\t-\t-\t%s" % color)
+                else:
+                    outfile.write("%s\t%s\t%s\t%s" % (fields[1], fields[2], fields[3], fields[0]))
+                    if itemRGB:
+                        outfile.write("\t1000\t+\t-\t-\t%s" % color)
+
+                outfile.write("\n")
+        finally:
+            outfile.close()
+    finally:
+        regionfile.close()
+
+
+if __name__ == "__main__":  # true only when the file is executed as a script, not when imported
+ main(sys.argv)  # hand the full argument vector (program name first) to main()
\ No newline at end of file
--- /dev/null
+import sys
+
+
def main(argv=None):
    """Filter the SNP records in argv[1] and write the survivors to argv[2].

    argv is the full argument vector (program name first); sys.argv is used
    when it is empty or None.  Prints a usage line and exits with status 1
    when fewer than two file names are given.
    """
    if not argv:
        argv = sys.argv

    usage = "usage: python %s infile outfile" % sys.argv[0]

    if len(argv) < 3:
        print(usage)
        sys.exit(1)

    # Read everything first and always close the input (it used to be left
    # open); the output is only created once the input has been read.
    infile = open(argv[1])
    try:
        lines = infile.readlines()
    finally:
        infile.close()

    outputLines = rnaAToIFilter(lines)

    outfile = open(argv[2], "w")
    try:
        outfile.writelines(outputLines)
    finally:
        outfile.close()
+
+
def rnaAToIFilter(snpPropertiesList):
    """Return the records whose base change matches their strand flag.

    Each record is a whitespace-separated string.  A record is kept when
    field 13 is "F" and field 7 is "A-G", or field 13 is "R" and field 7 is
    "T-C".  Records with fewer than 14 fields (blank or truncated lines) are
    dropped instead of raising IndexError.  Kept records are returned
    unchanged and in input order.
    """
    # strand flag -> base change accepted for that strand
    # NOTE(review): field 13 is presumably the gene sense; confirm with the producer
    wantedChange = {"F": "A-G", "R": "T-C"}

    outputLines = []
    for line in snpPropertiesList:
        fields = line.split()
        if len(fields) < 14:
            continue

        if wantedChange.get(fields[13]) == fields[7]:
            outputLines.append(line)

    return outputLines
+
+
if __name__ == '__main__':
    # The guard body used to be "pass", so running the script did nothing.
    main(sys.argv)
\ No newline at end of file
--- /dev/null
+"""
+Based on shell script provided by Ali.
+"""
+
+import sys
+import optparse
+from Erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
+from Erange.commoncode import countDuplicatesInList
+
+
def main(argv=None):
    """Run the SNP-based RNA editing pipeline from the command line.

    Positional arguments are dbfile, snpsfile, genome and rpkmfile.  The
    steps are: call SNPs, check them against the SNP database(s), attach
    neighbouring gene information, keep the A-to-I candidates, pick the
    genes with at least five such sites, then write whichever optional
    outputs (--goprefix, --novelsnp, --bedfile) were requested.
    Exits with status 1 when fewer than four positional arguments are given.
    """
    if not argv:
        argv = sys.argv

    usage = "usage: python %prog dbfile snpsfile genome rpkmfile [options]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--goprefix", dest="prefix")
    parser.add_option("--novelsnp", dest="novelsnpoutfilename")
    parser.add_option("--bedfile", dest="bedoutfilename")
    parser.add_option("--cache", type="int", dest="cachePages")
    parser.add_option("--snpDB", action="append", dest="snpDBList",
                      help="additional snp db files to check will be searched in order given")
    parser.set_defaults(prefix=None, novelsnpoutfilename=None, bedoutfilename=None, cachePages=None, snpDBList=[])
    options, args = parser.parse_args(argv[1:])

    if len(args) < 4:
        print(usage)
        sys.exit(1)

    dbfile, hitfile, genome, rpkmfilename = args[:4]

    # caching is switched on simply by supplying --cache
    doCache = options.cachePages is not None

    # call the SNPs
    snpList = getSNPs.getSNPs(hitfile, 3, 0.25, doCache, options.cachePages, forceChr=True)

    # look them up in the primary database first, then in any --snpDB files
    dbList = [dbfile] + list(options.snpDBList)
    snpPropertiesList = chksnp.chkSNP(dbList, snpList, options.cachePages)

    # attach the neighbouring genes
    geneInfoList = getSNPGeneInfo.getSNPGeneInfo(genome, snpPropertiesList, rpkmfilename, doCache, flankBP=10000)

    # keep only the A-to-I events in the same direction as the genes
    filteredSNPs = rnaAToIFilter.rnaAToIFilter(geneInfoList)

    # genes with a high number of called sites (here 5)
    geneList = getGenesWithMultipleSNPs(filteredSNPs, minCount=5)

    prefix = options.prefix
    if prefix is not None:
        analyzego.analyzeGO(genome, geneList, prefix, translateGene=True, fieldID=1)

    novelName = options.novelsnpoutfilename
    if novelName is not None:
        getNovelSNPs.writeNovelSNPFile(genome, filteredSNPs, novelName)

    bedName = options.bedoutfilename
    if bedName is not None:
        makeSNPtrack.writeSNPsBedfile(filteredSNPs, "rnaEdit_sample", bedName)
+
+
def getGenesWithMultipleSNPs(snpList, minCount=1):
    """Return the genes that occur at least minCount times in snpList.

    The gene is taken from index 11 of every entry; the occurrences are
    tallied by countDuplicatesInList, whose (gene, count) pairs determine
    the order of the result.
    """
    genePerSNP = [snpEntry[11] for snpEntry in snpList]
    tallies = countDuplicatesInList(genePerSNP)

    return [gene for (gene, count) in tallies if count >= minCount]
+
+
+if __name__ == "__main__":  # true only when the file is executed as a script, not when imported
+ main(sys.argv)  # hand the full argument vector (program name first) to main()
\ No newline at end of file
--- /dev/null
+#
+# RNAFARpairs.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 11/2/08.
+#
+""" usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
+ looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
+"""
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys, time, optparse
+from commoncode import readDataset
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+
def main(argv=None):
    """Command-line entry point: parse the options and run rnaFarPairs().

    argv is the full argument vector (program name first); sys.argv is used
    when it is empty or None.  Positional arguments are genome, goodfile,
    rdsfile and outfile.  Exits with status 1 when fewer than four are given.
    """
    if not argv:
        argv = sys.argv

    # "%prog" is only substituted by optparse inside usage/version strings;
    # printed directly it appeared literally, so name the tool explicitly.
    print("RNAFARpairs: version 3.6")
    usage = "usage: python %prog genome goodfile rdsfile outfile [options]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--verbose", action="store_true", dest="doVerbose",
                      help="verbose output")
    parser.add_option("--cache", action="store_true", dest="doCache",
                      help="use cache")
    parser.add_option("--maxDist", type="int", dest="maxDist",
                      help="maximum distance")
    parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
    (options, args) = parser.parse_args(argv[1:])

    if len(args) < 4:
        print(usage)
        sys.exit(1)

    genome = args[0]
    goodfilename = args[1]
    rdsfile = args[2]
    outfilename = args[3]

    rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+
+
+def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):  # label the regions listed in goodfilename with a gene or FAR name using read pairs from rdsfile; writes outfilename
+ goodDict = {}  # first whitespace-separated field of each goodfile line -> the whole line
+ goodfile = open(goodfilename)  # NOTE(review): this handle is never closed in the function
+ for line in goodfile:
+ fields = line.split()
+ goodDict[fields[0]] = line
+
+ RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+ rdsChromList = RDS.getChromosomes()
+
+ if doVerbose:
+ print time.ctime()
+
+ distinct = 0  # number of region lines written with a gene/FAR assignment in the first two passes
+ total = 0  # NOTE(review): incremented both per two-entry read list and again when either flag is in goodDict - confirm that is intended
+ outfile = open(outfilename,"w")
+
+ idb = geneinfoDB()
+ if genome == "dmelanogaster":
+ geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
+ else:
+ geneinfoDict = idb.getallGeneInfo(genome)
+
+ hg = Genome(genome)
+ geneannotDict = hg.allAnnotInfo()
+
+ assigned = {}  # flag -> (symbol, geneID) for every region already written
+ farConnected = {}  # flag -> list of other goodDict flags seen paired with it
+ for achrom in rdsChromList:
+ if achrom == "chrM":  # chrM is skipped entirely
+ continue
+
+ print achrom
+ uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+ if doVerbose:
+ print len(uniqDict), time.ctime()
+
+ for readID in uniqDict:
+ readList = uniqDict[readID]
+ if len(readList) == 2:  # only read IDs with exactly two entries are considered
+ total += 1
+ (start1, flag1, pair1) = readList[0]
+ (start2, flag2, pair2) = readList[1]
+
+ if flag1 != flag2:  # the two ends must carry different flags
+ dist = abs(start1 - start2)
+ if flag1 != "NM" and flag2 != "NM" and dist < maxDist:  # both ends flagged with something other than "NM" and closer than maxDist
+ geneID = ""
+ saw1 = False
+ saw2 = False
+ if flag1 in goodDict:  # end 1 lies in a listed region; the other flag is taken as the gene
+ geneID = flag2
+ farFlag = flag1
+ saw1 = True
+
+ if flag2 in goodDict:  # same test for end 2; overrides the assignment above when both match
+ geneID = flag1
+ farFlag = flag2
+ saw2 = True
+
+ if saw1 or saw2:
+ total += 1
+
+ if saw1 and saw2:  # both ends are listed regions: record a region-to-region link
+ if flag1 < flag2:  # key the link by the smaller flag
+ geneID = flag1
+ farFlag = flag2
+ else:
+ geneID = flag2
+ farFlag = flag1
+
+ if geneID in farConnected:
+ farConnected[geneID].append(farFlag)
+ else:
+ farConnected[geneID] = [farFlag]
+ elif geneID != "":  # exactly one end is a listed region: look up a symbol for the other flag
+ try:
+ if genome == "dmelanogaster":
+ symbol = geneinfoDict["Dmel_" + geneID][0][0]
+ else:
+ symbol = geneinfoDict[geneID][0][0]
+ except:
+ try:  # fall back to the genome annotation table
+ symbol = geneannotDict[(genome, geneID)][0]
+ except:
+ symbol = "LOC" + geneID  # last resort: synthesize a name from the ID
+
+ symbol = symbol.strip()
+ symbol = symbol.replace(" ","|")  # spaces and tabs become "|" in the symbol
+ symbol = symbol.replace("\t","|")
+ if farFlag not in assigned:
+ assigned[farFlag] = (symbol, geneID)
+ print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
+ outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
+ distinct += 1
+
+ farIndex = 0  # counter used to number synthetic FAR<n> names
+ for farFlag in farConnected:  # second pass: propagate assignments across linked regions
+ geneID = ""
+ symbol = ""
+ idList = [farFlag] + farConnected[farFlag]
+ for oneID in idList:
+ if oneID in assigned:  # the last assigned member of the group wins
+ (symbol, geneID) = assigned[oneID]
+
+ if geneID == "":  # nobody in the group has a gene: invent FAR<n> with a negative ID
+ farIndex += 1
+ symbol = "FAR%d" % farIndex
+ geneID = -1 * farIndex
+
+ for oneID in idList:
+ if oneID not in assigned:
+ print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
+ outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
+ distinct += 1
+ assigned[oneID] = (symbol, geneID)
+
+ for farFlag in goodDict:  # third pass: every still-unassigned region gets its own FAR<n> name
+ if farFlag not in assigned:
+ farIndex += 1
+ line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag])
+ print line.strip()
+ outfile.write(line)
+
+ outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))  # summary footer
+ outfile.close()
+ print "distinct: %d\ttotal: %d" % (distinct, total)
+ print time.ctime()
+
+if __name__ == "__main__":  # true only when the file is executed as a script, not when imported
+ main(sys.argv)  # hand the full argument vector (program name first) to main()
\ No newline at end of file
--- /dev/null
+10
+
+dir
+23
+file:///Users/sau/svn/repos/erange/source/Erange/rnapath
+file:///Users/sau/svn/repos
+
+
+
+2010-10-01T18:32:26.347691Z
+22
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+d8abe7b5-3c2c-4fba-ae09-e6a8aa828af9
+\f
+RNAPATH.py
+file
+
+
+
+
+
+dbb616164849ddb57ad0880cf59ff36a
+2010-10-01T18:32:26.347691Z
+22
+sau
+\f
+__init__.py
+file
+
+
+
+
+2010-09-10T18:56:21.000000Z
+d41d8cd98f00b204e9800998ecf8427e
+2010-09-10T18:57:45.549780Z
+20
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+0
+\f
+processvelvet.py
+file
+
+
+
+
+
+c232f2e5338d3f018f259576a65ff49e
+2010-10-01T18:32:26.347691Z
+22
+sau
+\f
--- /dev/null
+import sys
+import optparse
+import string
+from numpy import zeros, int16
+
+versionString = "%s: version 0.95" % sys.argv[0]  # banner built from the invoking script name
+print versionString  # module-level side effect: printed on import as well as on direct execution
+
+
def compNT(nt):
    """Return the complement of a single nucleotide code.

    Covers the upper-case IUPAC codes, lower-case a/c/g/t/n and "z"; any
    other value maps to "N".
    """
    codes = "ATGCSWRYMKHDBVNatgcnz"
    partners = "TACGSWYRKMDHVBNtacgnz"

    return dict(zip(codes, partners)).get(nt, "N")


def complement(sequence, length=-1):
    """Return the reverse complement of sequence.

    With the default (or any negative) length the whole sequence is used.
    Otherwise only the last `length` bases are reverse-complemented.  When
    `length` exceeds the sequence length the result is padded with "N" up to
    `length` characters; the previous loop instead wrapped around through
    negative indices and repeated bases from the end of the sequence.
    """
    seqLength = len(sequence)
    if length < 0:
        length = seqLength

    tail = sequence[max(seqLength - length, 0):]
    padding = "N" * max(length - seqLength, 0)

    return "".join([compNT(base) for base in reversed(tail)]) + padding
+
+
def main(argv=None):
    """Command-line entry point: parse the options and run rnaPath().

    argv is the full argument vector (program name first); sys.argv is used
    when it is empty or None.  Prints the usage line and exits with status 0
    when fewer than four positional arguments are given.
    """
    if not argv:
        argv = sys.argv

    usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--prefix", dest="pathPrefix")
    parser.add_option("--overlap", type="int", dest="overlap")
    parser.set_defaults(pathPrefix="RNAPATH", overlap=30)
    options, args = parser.parse_args(argv[1:])

    if len(args) < 4:
        print(usage)
        sys.exit(0)

    incontigfilename, distalPairsfile, outpathfilename, outcontigfilename = args[:4]

    rnaPath(incontigfilename, distalPairsfile, outpathfilename,
            outcontigfilename, options.pathPrefix, options.overlap)
+
+
def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
            outcontigfilename, pathPrefix="RNAPATH", overlap=30):
    """Join contigs along the paths found in the distal-pair graph.

    Reads the contigs, builds the paths with getPath(), stitches the contigs
    of every path into one sequence and writes:
      * outpathfilename   -- a log of the paths and every stitching decision
      * outcontigfilename -- FASTA with one "chr<pathPrefix><n>" record per
                             path followed by every contig not on any path
    overlap is the number of leading bases of the next contig searched for
    the 20-base end of the sequence built so far.  Both output files are now
    closed even when an error interrupts the run.
    """
    outpathfile = open(outpathfilename, "w")
    try:
        outheader = "#settings: %s" % " ".join(sys.argv)
        print(outheader)
        outpathfile.write(outheader + "\n")

        contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
        halfSize = calculateN50(origSize)
        print("building the adjacency graph")
        pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)

        print("found %d paths" % len(pathList))

        newSizeList = []
        outcontigfile = open(outcontigfilename, "w")
        try:
            pathID = 0
            for path in pathList:
                pathID += 1
                outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path)))
                pathDescription = ",".join([nameList[vertex] for vertex in path])
                outpathfile.write(pathDescription + "\n")

                sequence, assemblyList = _stitchPath(path, contigDict, edgeSenseDict, overlap, outpathfile)

                outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
                newSizeList.append(len(sequence))

            # contigs that are not part of any path are passed through unchanged
            for vertex in contigDict:
                if vertex in visitedDict:
                    continue

                newSizeList.append(len(contigDict[vertex]))
                outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
        finally:
            outcontigfile.close()
    finally:
        outpathfile.close()

    # report the N50 of the new assembly against the original half-size
    calculateN50(newSizeList, referenceMean=halfSize)


def _stitchPath(path, contigDict, edgeSenseDict, overlap, outpathfile):
    """Merge the contigs of one path; return (sequence, assemblyList)."""
    currentVertex = path[0]
    currentSense = "+"
    assemblyList = currentVertex
    sequence = contigDict[currentVertex]
    for nextVertex in path[1:]:
        if (currentVertex, nextVertex) in edgeSenseDict:
            senseList = edgeSenseDict[currentVertex, nextVertex]
            FR = senseList.count(("+", "-"))
            RF = senseList.count(("-", "+"))
        else:
            # the edge is stored in the other direction, so swap the ends
            senseList = edgeSenseDict[nextVertex, currentVertex]
            FR = senseList.count(("-", "+"))
            RF = senseList.count(("+", "-"))

        FF = senseList.count(("+", "+"))
        RR = senseList.count(("-", "-"))
        if currentSense == "-":
            # we had flipped the upstream piece! Must flip again
            FR, FF, RR, RF = RR, RF, FR, FF

        # pick the best-supported orientation; ties resolve in the order FR, FF, RR, RF
        if FR >= FF and FR >= RR and FR >= RF:
            sense1, sense2, flipLeft, flipRight = "+", "-", False, False
        elif FF >= RR and FF >= RF:
            sense1, sense2, flipLeft, flipRight = "+", "+", False, True
        elif RR >= RF:
            sense1, sense2, flipLeft, flipRight = "-", "-", True, False
        else:
            sense1, sense2, flipLeft, flipRight = "-", "+", True, True

        if flipLeft:
            leftSeq = complement(sequence)
            leftMark = "-"
        else:
            leftSeq = sequence
            leftMark = "+"

        if flipRight:
            rightSeq = complement(contigDict[nextVertex])
            rightMark = "-"
        else:
            rightSeq = contigDict[nextVertex]
            rightMark = "+"

        assemblyList = ((assemblyList, leftMark), (nextVertex, rightMark))

        if flipLeft and not flipRight:
            # NOTE(review): only the RR case probes with the FIRST 20 bases of
            # the flipped left piece; every other case uses the last 20.
            # Kept as found - confirm whether [-20:] was intended here too.
            seqleft = leftSeq[:20]
        else:
            seqleft = leftSeq[-20:]

        seqright = rightSeq[:overlap]
        if seqleft in seqright:
            offset = seqright.index(seqleft) + 20
            # only the FR case reports the pair as (current, next)
            if flipLeft or flipRight:
                reported = (nextVertex, currentVertex, offset)
            else:
                reported = (currentVertex, nextVertex, offset)

            outstring = "stitching %d and %d using %d overlap" % reported
            print(outstring)
            outpathfile.write(outstring + "\n")
            sequence = leftSeq + rightSeq[offset:]
        else:
            # no overlap found: join with an "NN" spacer
            sequence = leftSeq + "NN" + rightSeq

        if flipRight:
            currentSense = "-"
        else:
            currentSense = "+"

        outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
        print(outstring)
        outpathfile.write(outstring + "\n")
        currentVertex = nextVertex

    return sequence, assemblyList
+
+
def calculateN50(sizeList, referenceMean=None):
    """Print the contig count and N50 of sizeList; return the threshold used.

    sizeList is sorted in place, largest first.  The threshold is
    referenceMean when given, otherwise half of the summed sizes.  The N50
    printed is the first size at which the running total would exceed the
    threshold.  The first 50 sorted sizes are printed as well.
    """
    if referenceMean is None:
        referenceMean = sum(sizeList) / 2

    sizeList.sort(reverse=True)
    runningTotal = 0
    for size in sizeList:
        if runningTotal + size > referenceMean:
            print("#contigs %d" % len(sizeList))
            print("N50 %s" % size)
            break

        runningTotal += size

    print(sizeList[:50])

    return referenceMean
+
+
def getContigsFromFile(contigFileName):
    """Read a FASTA file of contigs.

    Returns (contigNum, nameList, contigDict, origSize) where contigNum is
    the number of contigs stored, nameList[i] is the name of contig i (the
    first word of its header without the leading ">"), contigDict[i] is its
    sequence with line breaks removed and origSize[i] is its length.
    A record whose header yields an empty name is not stored.
    If the file cannot be opened an error is printed and empty results are
    returned.

    The final record of the file is now stored as well; it used to be
    dropped because records were only saved when the next header was seen.
    """
    nameList = []
    origSize = []
    contigDict = {}

    try:
        incontigfile = open(contigFileName)
    except IOError:
        print("Error opening contig file: %s" % contigFileName)
        return 0, nameList, contigDict, origSize

    def storeContig(name, seqParts):
        # index contigs by their position in nameList
        seq = "".join(seqParts)
        contigDict[len(nameList)] = seq
        nameList.append(name)
        origSize.append(len(seq))

    currentChrom = ""
    seqParts = []
    try:
        for line in incontigfile:
            if ">" in line:
                if currentChrom != "":
                    storeContig(currentChrom, seqParts)

                currentChrom = line.strip().split()[0][1:]
                seqParts = []
            else:
                # collect the pieces and join once instead of repeated +=
                seqParts.append(line.strip())
    finally:
        incontigfile.close()

    if currentChrom != "":
        storeContig(currentChrom, seqParts)

    return len(nameList), nameList, contigDict, origSize
+
+
def getPath(contigNum, distalPairsfile, nameList):
    """Build the contig graph from the distal pairs and extract its paths.

    Steps: load the pair counts into an EdgeMatrix, zero the edge of every
    vertex that never reached a pair count above 1, keep only the two
    heaviest edges of each remaining vertex, then walk the graph from the
    leaves (vertices left with exactly one candidate edge of weight > 1).

    Returns (pathList, edgeSenseDict, visitedDict).
    """
    edgeMatrix = EdgeMatrix(contigNum)

    # diagnostic output of the matrix dimensions
    print(len(edgeMatrix.edgeArray))
    try:
        print(len(edgeMatrix.edgeArray[50]))
    except IndexError:
        pass

    print("processing distal pairs")
    verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)

    willVisitList = sorted(verticesWithEdges)
    print("visiting %d vertices" % len(willVisitList))

    print("cleaning up graph of edges with weight 1")
    verticesToDelete = [rindex for rindex in willVisitList if rindex not in notSoloDict]
    for rindex in verticesToDelete:
        cindex = vertexEdges[rindex][0]
        edgeMatrix.edgeArray[rindex][cindex] = 0
        edgeMatrix.edgeArray[cindex][rindex] = 0

    # one filtering pass instead of a list.remove() per deleted vertex
    willVisitList = [rindex for rindex in willVisitList if rindex in notSoloDict]

    print("%d 1-edges zeroed out" % len(verticesToDelete))

    zeroedEdge = 0
    print("visiting %d vertices" % len(willVisitList))

    # set membership keeps the neighbour test below constant-time
    willVisit = set(willVisitList)

    leafList = []
    print("picking top 2 edges per vertex - zero out others")
    for rindex in willVisitList:
        rEdges = []
        for avertex in vertexEdges[rindex]:
            if avertex in willVisit:
                rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))

        if len(rEdges) > 2:
            # heaviest first; everything after the top two is dropped
            rEdges.sort(reverse=True)
            zeroedEdge += len(rEdges[2:])
            for (weight, cindex) in rEdges[2:]:
                edgeMatrix.edgeArray[rindex][cindex] = 0
                edgeMatrix.edgeArray[cindex][rindex] = 0
        elif len(rEdges) == 1:
            if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
                leafList.append(rindex)

    print("zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge)
    pathList, visitedDict = traverseGraph(leafList, edgeMatrix)

    return pathList, edgeSenseDict, visitedDict
+
+
def traverseGraph(leafList, edgeMatrix):
    """Walk the graph from every unvisited leaf and collect the paths found.

    leafList is sorted in place and processed in ascending order.  Each leaf
    not already on an earlier path is expanded with edgeMatrix.visitLink();
    results with more than one vertex are printed and kept.

    Returns (pathList, visitedDict); visitedDict has one key per vertex that
    lies on a kept path.
    """
    pathList = []
    visitedDict = {}
    leafList.sort()
    print("traveling through the graph")
    for rindex in leafList:
        if rindex in visitedDict:
            continue

        path = edgeMatrix.visitLink(rindex)
        if len(path) <= 1:
            continue

        visitedDict.update(dict.fromkeys(path, ""))
        print(path)
        pathList.append(path)

    return pathList, visitedDict
+
+
def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
    """Load the distal read pairs into the edge matrix.

    Lines starting with "#" and blank lines are skipped.  On every other
    line, fields 1 and 4 name the two contigs ("chr" is prepended before the
    lookup in nameList) and fields 3 and 6 are their senses.  Pairs naming
    an unknown contig are reported and skipped.

    Returns (verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict):
      verticesWithEdges -- keys are the vertices that appear in some pair
      vertexEdges       -- vertex -> list of distinct neighbours
      notSoloDict       -- keys are the vertices on an edge counted more than once
      edgeSenseDict     -- (vertexA, vertexB) -> list of (senseA, senseB),
                           stored under whichever orientation was seen first
    """
    contigToRowLookup = {}
    verticesWithEdges = {}
    vertexEdges = {}
    notSoloDict = {}
    edgeSenseDict = {}

    def rowFor(contigName):
        # cache the list search so each name is resolved only once
        if contigName not in contigToRowLookup:
            try:
                contigToRowLookup[contigName] = nameList.index(contigName)
            except ValueError:
                return None

        return contigToRowLookup[contigName]

    def addNeighbor(vertex, neighbor):
        neighbors = vertexEdges.setdefault(vertex, [])
        if neighbor not in neighbors:
            neighbors.append(neighbor)

    distalPairs = open(distalPairsfilename)
    try:
        for line in distalPairs:
            if line[0] == "#":
                continue

            fields = line.strip().split()
            if not fields:
                # blank line: nothing to record
                continue

            contig1 = rowFor("chr%s" % fields[1])
            if contig1 is None:
                print("problem with end1:  %s" % line)
                continue

            sense1 = fields[3]

            contig2 = rowFor("chr%s" % fields[4])
            if contig2 is None:
                print("problem with end2:  %s" % line)
                continue

            sense2 = fields[6]

            # the matrix is kept symmetric
            edgeMatrix.edgeArray[contig1][contig2] += 1
            edgeMatrix.edgeArray[contig2][contig1] += 1
            verticesWithEdges[contig1] = ""
            verticesWithEdges[contig2] = ""
            if (contig1, contig2) in edgeSenseDict:
                edgeSenseDict[contig1, contig2].append((sense1, sense2))
            elif (contig2, contig1) in edgeSenseDict:
                # already stored in the other orientation: swap the senses to match
                edgeSenseDict[contig2, contig1].append((sense2, sense1))
            else:
                edgeSenseDict[contig1, contig2] = [(sense1, sense2)]

            addNeighbor(contig1, contig2)
            addNeighbor(contig2, contig1)

            if edgeMatrix.edgeArray[contig1][contig2] > 1:
                notSoloDict[contig1] = ""
                notSoloDict[contig2] = ""
    finally:
        # close the input even when a malformed line raises
        distalPairs.close()

    return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
+
+
class EdgeMatrix:
    """Square matrix of edge weights between contig vertices.

    edgeArray is a dense numpy int16 array of shape (dimension, dimension);
    edgeArray[i][j] is the weight recorded for the edge from i to j.
    """

    def __init__(self, dimension):
        self.dimension = dimension
        self.edgeArray = zeros((self.dimension, self.dimension), int16)


    def visitLink(self, fromVertex, ignoreList=None):
        """Follow edges of weight > 1 from fromVertex and return the path.

        Vertices listed in ignoreList are not considered as next steps.
        Only the first candidate neighbour is followed.  If that neighbour
        has no other edge weight left, the edge is cleared in both
        directions and the walk ends there; otherwise the forward edge is
        cleared and the walk continues from the neighbour, ignoring only the
        vertex just left.  Returns [] when there is no candidate.
        The matrix is modified as edges are consumed.
        """
        # None instead of a shared mutable default list
        if ignoreList is None:
            ignoreList = []

        returnPath = [fromVertex]
        toVertex = []
        for toindex in range(self.dimension):
            if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
                toVertex.append(toindex)

        for vertex in toVertex:
            if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
                # this edge is the neighbour's only remaining weight: end of path
                self.edgeArray[fromVertex][vertex] = 0
                self.edgeArray[vertex][fromVertex] = 0
                return returnPath + [vertex]
            else:
                self.edgeArray[fromVertex][vertex] = 0
                try:
                    return returnPath + self.visitLink(vertex, returnPath)
                except IOError:
                    # NOTE(review): nothing in this method performs I/O, so it
                    # is unclear when this fallback can trigger - confirm intent
                    return returnPath + [vertex]
        return []
+
+
+if __name__ == "__main__":  # true only when the file is executed as a script, not when imported
+ main(sys.argv)  # hand the full argument vector (program name first) to main()
\ No newline at end of file
--- /dev/null
+import sys
+import optparse
+
# "%prog" is only expanded by optparse inside usage/version strings; printed
# directly it came out as the literal text "%prog", so name the tool explicitly.
print("processvelvet: version 1.1")
+
def main(argv=None):
    """Command-line entry point: parse the options and run processvelvet().

    argv is the full argument vector (program name first); sys.argv is used
    when it is empty or None.  Prints the usage line and exits with status 2
    when fewer than two positional arguments (infile, outfile) are given.
    """
    if not argv:
        argv = sys.argv

    usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--prefix", dest="contigPrefix")
    parser.add_option("--filter", dest="filterFileName")
    parser.add_option("--min", type="int", dest="minSize")
    parser.add_option("--keepcov", action="store_true", dest="keepCoverage")
    parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False)
    options, args = parser.parse_args(argv[1:])

    if len(args) < 2:
        print(usage)
        sys.exit(2)

    infile, outfile = args[:2]

    processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage)
+
+
def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False):
    """Rewrite a FASTA file of ">NODE..." records under new contig names.

    Records are collected line by line and handed to writeNode(), which
    decides from the filter list and minSize whether each one is written.
    A summary of accepted and filtered base counts is printed at the end.
    Both files are now closed even when processing fails part-way.
    """
    infile = open(inFileName)
    try:
        outfile = open(outFileName, "w")
        try:
            filterList = getFilterList(filterFileName)

            node = {"contigPrefix": contigPrefix,
                    "completeID": "",
                    "currentSeq": ""
            }

            counts = {"acceptedSize": 0,
                      "nSize": 0,
                      "contigsAccepted": 0,
                      "filteredSize": 0
            }

            for line in infile:
                if ">NODE" in line:
                    # a new header ends the record collected so far
                    writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
                    node["completeID"] = line.strip()[1:]
                    node["currentSeq"] = ""
                else:
                    node["currentSeq"] += line

            # flush the final record, which has no header after it
            writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
        finally:
            outfile.close()
    finally:
        infile.close()

    print("%d contigs accepted" % counts["contigsAccepted"])
    print("%d bp original" % (counts["acceptedSize"] + counts["filteredSize"]))
    print("%d bp accepted" % counts["acceptedSize"])
    print("%d bp accepted N" % counts["nSize"])
    print("%d bp filtered\n" % counts["filteredSize"])
+
+
def getFilterList(filterFileName=""):
    """Return the distinct node IDs listed in a filter file.

    Only lines containing "NODE" are used; the ID is the tenth
    whitespace-separated field (index 9), and lines with fewer fields are
    ignored.  IDs are returned in order of first appearance.  An empty file
    name or a file that cannot be opened yields an empty list.
    """
    filterList = []
    if not filterFileName:
        return filterList

    try:
        filterFile = open(filterFileName)
    except IOError:
        return filterList

    for line in filterFile:
        if "NODE" not in line:
            continue

        fields = line.strip().split()
        if len(fields) <= 9:
            continue

        exclude = fields[9]
        if exclude not in filterList:
            filterList.append(exclude)

    filterFile.close()

    return filterList
+
+
+def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False):  # write the record held in node to outfile if it passes the checks, and update the running totals in counts
+ completeID = node["completeID"]  # header text of the record, without the leading ">"
+ currentSeq = node["currentSeq"]  # sequence lines exactly as read, newlines included
+ sequenceLength = len(currentSeq) - currentSeq.count("\n")  # length in bases: newlines are not counted
+ if len(completeID) > 5 and completeID not in filterList:  # needs an ID longer than 5 characters that is not in the filter list
+ fields = completeID.split("_")
+ newID = fields[1]  # second "_"-separated piece of the header becomes the new contig ID
+ if keepCoverage:
+ newID = fields[1] + "_" + fields[-1].strip()  # optionally append the last "_"-separated piece of the header
+
+ if sequenceLength >= minSize:
+ outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq))  # sequence keeps its original line breaks
+ counts["acceptedSize"] += sequenceLength
+ counts["nSize"] += currentSeq.count("N")  # upper-case N bases among the accepted sequence
+ counts["contigsAccepted"] += 1
+ else:  # NOTE(review): indentation is not recoverable here - confirm whether this else pairs with the ID/filter test or with the minSize test
+ counts["filteredSize"] += sequenceLength
+
+if __name__ == "__main__":  # true only when the file is executed as a script, not when imported
+ main(sys.argv)  # hand the full argument vector (program name first) to main()
\ No newline at end of file
--- /dev/null
+import sys
+import optparse
+import string
+from numpy import zeros, int16
+
+versionString = "%s: version 0.95" % sys.argv[0]  # banner built from the invoking script name
+print versionString  # module-level side effect: printed on import as well as on direct execution
+
+
def compNT(nt):
    """Return the complement of a single nucleotide code.

    Covers the upper-case IUPAC codes, lower-case a/c/g/t/n and "z"; any
    other value maps to "N".
    """
    codes = "ATGCSWRYMKHDBVNatgcnz"
    partners = "TACGSWYRKMDHVBNtacgnz"

    return dict(zip(codes, partners)).get(nt, "N")


def complement(sequence, length=-1):
    """Return the reverse complement of sequence.

    With the default (or any negative) length the whole sequence is used.
    Otherwise only the last `length` bases are reverse-complemented.  When
    `length` exceeds the sequence length the result is padded with "N" up to
    `length` characters; the previous loop instead wrapped around through
    negative indices and repeated bases from the end of the sequence.
    """
    seqLength = len(sequence)
    if length < 0:
        length = seqLength

    tail = sequence[max(seqLength - length, 0):]
    padding = "N" * max(length - seqLength, 0)

    return "".join([compNT(base) for base in reversed(tail)]) + padding
+
+
def main(argv=None):
    """Command-line entry point: parse the options and run rnaPath().

    argv is the full argument vector (program name first); sys.argv is used
    when it is empty or None.  Prints the usage line and exits with status 0
    when fewer than four positional arguments are given.
    """
    if not argv:
        argv = sys.argv

    usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--prefix", dest="pathPrefix")
    parser.add_option("--overlap", type="int", dest="overlap")
    parser.set_defaults(pathPrefix="RNAPATH", overlap=30)
    options, args = parser.parse_args(argv[1:])

    if len(args) < 4:
        print(usage)
        sys.exit(0)

    incontigfilename, distalPairsfile, outpathfilename, outcontigfilename = args[:4]

    rnaPath(incontigfilename, distalPairsfile, outpathfilename,
            outcontigfilename, options.pathPrefix, options.overlap)
+
+
def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
            outcontigfilename, pathPrefix="RNAPATH", overlap=30):
    """Join contigs along the paths found in the distal-pair graph.

    Reads the contigs, builds the paths with getPath(), stitches the contigs
    of every path into one sequence and writes:
      * outpathfilename   -- a log of the paths and every stitching decision
      * outcontigfilename -- FASTA with one "chr<pathPrefix><n>" record per
                             path followed by every contig not on any path
    overlap is the number of leading bases of the next contig searched for
    the 20-base end of the sequence built so far.  Both output files are now
    closed even when an error interrupts the run.
    """
    outpathfile = open(outpathfilename, "w")
    try:
        outheader = "#settings: %s" % " ".join(sys.argv)
        print(outheader)
        outpathfile.write(outheader + "\n")

        contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
        halfSize = calculateN50(origSize)
        print("building the adjacency graph")
        pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)

        print("found %d paths" % len(pathList))

        newSizeList = []
        outcontigfile = open(outcontigfilename, "w")
        try:
            pathID = 0
            for path in pathList:
                pathID += 1
                outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path)))
                pathDescription = ",".join([nameList[vertex] for vertex in path])
                outpathfile.write(pathDescription + "\n")

                sequence, assemblyList = _stitchPath(path, contigDict, edgeSenseDict, overlap, outpathfile)

                outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
                newSizeList.append(len(sequence))

            # contigs that are not part of any path are passed through unchanged
            for vertex in contigDict:
                if vertex in visitedDict:
                    continue

                newSizeList.append(len(contigDict[vertex]))
                outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
        finally:
            outcontigfile.close()
    finally:
        outpathfile.close()

    # report the N50 of the new assembly against the original half-size
    calculateN50(newSizeList, referenceMean=halfSize)


def _stitchPath(path, contigDict, edgeSenseDict, overlap, outpathfile):
    """Merge the contigs of one path; return (sequence, assemblyList)."""
    currentVertex = path[0]
    currentSense = "+"
    assemblyList = currentVertex
    sequence = contigDict[currentVertex]
    for nextVertex in path[1:]:
        if (currentVertex, nextVertex) in edgeSenseDict:
            senseList = edgeSenseDict[currentVertex, nextVertex]
            FR = senseList.count(("+", "-"))
            RF = senseList.count(("-", "+"))
        else:
            # the edge is stored in the other direction, so swap the ends
            senseList = edgeSenseDict[nextVertex, currentVertex]
            FR = senseList.count(("-", "+"))
            RF = senseList.count(("+", "-"))

        FF = senseList.count(("+", "+"))
        RR = senseList.count(("-", "-"))
        if currentSense == "-":
            # we had flipped the upstream piece! Must flip again
            FR, FF, RR, RF = RR, RF, FR, FF

        # pick the best-supported orientation; ties resolve in the order FR, FF, RR, RF
        if FR >= FF and FR >= RR and FR >= RF:
            sense1, sense2, flipLeft, flipRight = "+", "-", False, False
        elif FF >= RR and FF >= RF:
            sense1, sense2, flipLeft, flipRight = "+", "+", False, True
        elif RR >= RF:
            sense1, sense2, flipLeft, flipRight = "-", "-", True, False
        else:
            sense1, sense2, flipLeft, flipRight = "-", "+", True, True

        if flipLeft:
            leftSeq = complement(sequence)
            leftMark = "-"
        else:
            leftSeq = sequence
            leftMark = "+"

        if flipRight:
            rightSeq = complement(contigDict[nextVertex])
            rightMark = "-"
        else:
            rightSeq = contigDict[nextVertex]
            rightMark = "+"

        assemblyList = ((assemblyList, leftMark), (nextVertex, rightMark))

        if flipLeft and not flipRight:
            # NOTE(review): only the RR case probes with the FIRST 20 bases of
            # the flipped left piece; every other case uses the last 20.
            # Kept as found - confirm whether [-20:] was intended here too.
            seqleft = leftSeq[:20]
        else:
            seqleft = leftSeq[-20:]

        seqright = rightSeq[:overlap]
        if seqleft in seqright:
            offset = seqright.index(seqleft) + 20
            # only the FR case reports the pair as (current, next)
            if flipLeft or flipRight:
                reported = (nextVertex, currentVertex, offset)
            else:
                reported = (currentVertex, nextVertex, offset)

            outstring = "stitching %d and %d using %d overlap" % reported
            print(outstring)
            outpathfile.write(outstring + "\n")
            sequence = leftSeq + rightSeq[offset:]
        else:
            # no overlap found: join with an "NN" spacer
            sequence = leftSeq + "NN" + rightSeq

        if flipRight:
            currentSense = "-"
        else:
            currentSense = "+"

        outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
        print(outstring)
        outpathfile.write(outstring + "\n")
        currentVertex = nextVertex

    return sequence, assemblyList
+
+
+def calculateN50(sizeList, referenceMean=None):
+ """ Print contig-count and N50 statistics for sizeList and return the threshold used.
+
+ sizeList is sorted in place into descending order, so the caller's list is
+ modified. When referenceMean is None it defaults to half of sum(sizeList).
+ The first size that pushes the running total past referenceMean is
+ reported as the N50. Returns referenceMean so a later call can reuse it.
+ """
+ if referenceMean is None:
+ totalSize = sum(sizeList)
+ # with the Python 2 print statements used here, "/" on ints is integer division
+ referenceMean = totalSize / 2
+
+ sizeList.sort()
+ sizeList.reverse()
+ currentTotalLength = 0
+ for size in sizeList:
+ if currentTotalLength + size > referenceMean:
+ print "#contigs", len(sizeList)
+ print "N50", size
+ break
+
+ currentTotalLength += size
+
+ # NOTE(review): indentation was lost in this patch; presumably this dump of the 50 largest sizes runs once after the loop - confirm
+ print sizeList[:50]
+
+ return referenceMean
+
+
+def getContigsFromFile(contigFileName):
+ nameList = []
+ origSize = []
+ contigNum = 0
+ currentChrom = ""
+ seq = ""
+ contigDict = {}
+
+ try:
+ incontigfile = open(contigFileName)
+ except IOError:
+ print "Error opening contig file: %s" % contigFileName
+ return contigNum, nameList, contigDict, origSize
+
+ for line in incontigfile:
+ if ">" in line:
+ if currentChrom !="":
+ nameList.append(currentChrom)
+ contigDict[contigNum] = seq
+ origSize.append(len(seq))
+ contigNum += 1
+
+ currentChrom = line.strip().split()[0][1:]
+ seq = ""
+ else:
+ seq += line.strip()
+
+ incontigfile.close()
+
+ return contigNum, nameList, contigDict, origSize
+
+
+def getPath(contigNum, distalPairsfile, nameList):
+ """ Build the contig adjacency matrix from the distal pairs file, prune it and extract paths.
+
+ contigNum sizes the square EdgeMatrix; distalPairsfile and nameList are
+ passed through to processDistalPairsFile(). Returns
+ (pathList, edgeSenseDict, visitedDict), where pathList and visitedDict
+ come from traverseGraph().
+ """
+ edgeMatrix = EdgeMatrix(contigNum)
+
+ # diagnostic output of the matrix dimensions; row 50 only exists when there are more than 50 rows
+ print len(edgeMatrix.edgeArray)
+ try:
+ print len(edgeMatrix.edgeArray[50])
+ except IndexError:
+ pass
+
+ print "processing distal pairs"
+ verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)
+
+ willVisitList = verticesWithEdges.keys()
+ willVisitList.sort()
+ print "visiting %d vertices" % len(willVisitList)
+
+ print "cleaning up graph of edges with weight 1"
+ verticesToDelete = []
+ for rindex in willVisitList:
+ # vertices missing from notSoloDict never had an edge counted more than once
+ if rindex not in notSoloDict:
+ # only the edge to the first recorded neighbor is cleared here
+ cindex = vertexEdges[rindex][0]
+ edgeMatrix.edgeArray[rindex][cindex] = 0
+ edgeMatrix.edgeArray[cindex][rindex] = 0
+ verticesToDelete.append(rindex)
+
+ # removal is deferred so willVisitList is not modified while it is being iterated
+ for vertex in verticesToDelete:
+ willVisitList.remove(vertex)
+
+ print "%d 1-edges zeroed out" % len(verticesToDelete)
+
+ zeroedEdge = 0
+ print "visiting %d vertices" % len(willVisitList)
+
+ leafList = []
+ print "picking top 2 edges per vertex - zero out others"
+ for rindex in willVisitList:
+ vertices = vertexEdges[rindex]
+ rEdges = []
+ # collect (weight, neighbor) for neighbors that survived the cleanup above
+ for avertex in vertices:
+ if avertex in willVisitList:
+ rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))
+
+ if len(rEdges) > 2:
+ # descending sort by weight; everything after the two heaviest edges is cleared
+ rEdges.sort()
+ rEdges.reverse()
+ zeroedEdge += len(rEdges[2:])
+ for (weight, cindex) in rEdges[2:]:
+ edgeMatrix.edgeArray[rindex][cindex] = 0
+ edgeMatrix.edgeArray[cindex][rindex] = 0
+ elif len(rEdges) == 1:
+ # a vertex with a single remaining edge of weight > 1 is used as a traversal start
+ if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
+ leafList.append(rindex)
+
+ print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge
+ pathList, visitedDict = traverseGraph(leafList, edgeMatrix)
+
+ return pathList, edgeSenseDict, visitedDict
+
+
+def traverseGraph(leafList, edgeMatrix):
+ """ Walk the graph from each leaf vertex and collect the resulting paths.
+
+ leafList is sorted in place. Each leaf not already recorded in visitedDict
+ is handed to edgeMatrix.visitLink(), which zeroes the edges it follows.
+ Returns (pathList, visitedDict); visitedDict is keyed by the vertices of
+ paths longer than one vertex (its values are unused empty strings).
+ """
+ pathList = []
+ visitedDict = {}
+ leafList.sort()
+ print "traveling through the graph"
+ for rindex in leafList:
+ # leaves already covered by an earlier path are skipped
+ if visitedDict.has_key(rindex):
+ pass
+ else:
+ path = edgeMatrix.visitLink(rindex)
+ if len(path) > 1:
+ for vertex in path:
+ visitedDict[vertex] = ""
+
+ # NOTE(review): indentation was lost in this patch; presumably only paths with more than one vertex are printed and kept - confirm
+ print path
+ pathList.append(path)
+
+ return pathList, visitedDict
+
+
+def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
+ """ Read the distal pairs file and accumulate edge counts and orientations between contigs.
+
+ Lines starting with "#" are skipped. Each remaining line is split on
+ whitespace: fields 1 and 4 (prefixed with "chr") name the two contigs and
+ fields 3 and 6 give their senses. Contig names are translated to row
+ indices through nameList; lines naming an unknown contig are reported
+ and skipped. edgeMatrix.edgeArray is incremented symmetrically.
+
+ Returns (verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict):
+ verticesWithEdges - dict keyed by every contig index that has an edge
+ vertexEdges - contig index -> list of distinct neighbor indices
+ notSoloDict - dict keyed by contig indices on an edge counted more than once
+ edgeSenseDict - (contigA, contigB) -> list of (senseA, senseB) tuples,
+ stored under whichever ordering of the pair was seen first
+ """
+ # cache of contig name -> row index so nameList.index() runs once per name
+ contigToRowLookup = {}
+ verticesWithEdges = {}
+ vertexEdges = {}
+ notSoloDict = {}
+ edgeSenseDict = {}
+
+ distalPairs = open(distalPairsfilename)
+ for line in distalPairs:
+ if line[0] == "#":
+ continue
+
+ fields = line.strip().split()
+ contA = "chr%s" % fields[1]
+ try:
+ contig1 = contigToRowLookup[contA]
+ except KeyError:
+ try:
+ contig1 = nameList.index(contA)
+ contigToRowLookup[contA] = contig1
+ except ValueError:
+ print "problem with end1: ", line
+ continue
+
+ sense1 = fields[3]
+
+ contB = "chr%s" % fields[4]
+ try:
+ contig2 = contigToRowLookup[contB]
+ except KeyError:
+ try:
+ contig2 = nameList.index(contB)
+ contigToRowLookup[contB] = contig2
+ except ValueError:
+ print "problem with end2: ", line
+ continue
+
+ sense2 = fields[6]
+
+ edgeMatrix.edgeArray[contig1][contig2] += 1
+ edgeMatrix.edgeArray[contig2][contig1] += 1
+ verticesWithEdges[contig1] = ""
+ verticesWithEdges[contig2] = ""
+ # keep one list per unordered pair; senses are swapped when the pair is stored in the reverse order
+ if (contig1, contig2) in edgeSenseDict:
+ edgeSenseDict[contig1, contig2].append((sense1, sense2))
+ elif (contig2, contig1) in edgeSenseDict:
+ edgeSenseDict[contig2, contig1].append((sense2, sense1))
+ else:
+ edgeSenseDict[contig1, contig2] = [(sense1, sense2)]
+
+ if contig1 in vertexEdges:
+ if contig2 not in vertexEdges[contig1]:
+ vertexEdges[contig1].append(contig2)
+ else:
+ vertexEdges[contig1] = [contig2]
+
+ if contig2 in vertexEdges:
+ if contig1 not in vertexEdges[contig2]:
+ vertexEdges[contig2].append(contig1)
+ else:
+ vertexEdges[contig2] = [contig1]
+
+ # an edge seen at least twice marks both of its ends as "not solo"
+ if edgeMatrix.edgeArray[contig1][contig2] > 1:
+ notSoloDict[contig1] = ""
+ notSoloDict[contig2] = ""
+
+ distalPairs.close()
+
+ return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
+
+
+class EdgeMatrix:
+ """ Square matrix of edge weights between vertices.
+
+ NOTE(review): this was described as a sparse matrix, but __init__
+ allocates a full dimension x dimension array.
+ """
+
+ def __init__(self, dimension):
+ """ Store dimension and allocate a zero-filled (dimension, dimension) int16 array as edgeArray.
+ """
+ self.dimension = dimension
+ self.edgeArray = zeros((self.dimension, self.dimension), int16)
+
+
+ def visitLink(self, fromVertex, ignoreList=[]):
+ """ Follow edges of weight > 1 starting at fromVertex and return the path as a list of vertices.
+
+ Vertices in ignoreList are not considered as the next step. Edges are
+ zeroed as they are followed, so the traversal consumes the matrix.
+ Returns an empty list when fromVertex has no eligible neighbor.
+ """
+ # ignoreList is only read here, so the shared default list is never mutated
+ returnPath = [fromVertex]
+ toVertex = []
+ for toindex in xrange(self.dimension):
+ if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
+ toVertex.append(toindex)
+
+ # both branches below return, so only the first eligible neighbor is ever followed
+ for vertex in toVertex:
+ # vertex has no edge other than the one back to fromVertex: the path ends here
+ if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
+ self.edgeArray[fromVertex][vertex] = 0
+ self.edgeArray[vertex][fromVertex] = 0
+ return returnPath + [vertex]
+ else:
+ # only the fromVertex -> vertex entry is cleared; the recursive call ignores fromVertex via ignoreList
+ self.edgeArray[fromVertex][vertex] = 0
+ try:
+ # NOTE(review): if the recursive call returns [], vertex itself is missing from the result - confirm this is intended
+ return returnPath + self.visitLink(vertex, returnPath)
+ except IOError:
+ return returnPath + [vertex]
+ return []
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+import optparse
+
+print "%prog: version 1.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--prefix", dest="contigPrefix")
+ parser.add_option("--filter", dest="filterFileName")
+ parser.add_option("--min", type="int", dest="minSize")
+ parser.add_option("--keepcov", action="store_true", dest="keepCoverage")
+ parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ sys.exit(2)
+
+ infile = args[0]
+ outfile = args[1]
+
+ processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage)
+
+
+def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False):
+ """ Rewrite the ">NODE" records of inFileName into outFileName and print summary counts.
+
+ Each record is handed to writeNode(), which applies the filter list
+ (from getFilterList(filterFileName)), the minSize cutoff and the
+ contigPrefix / keepCoverage renaming. The totals accumulated in counts
+ are printed at the end.
+ """
+ infile = open(inFileName)
+ outfile = open(outFileName, "w")
+ filterList = getFilterList(filterFileName)
+
+ # the record currently being accumulated; completeID stays empty until the first header is read
+ node = {"contigPrefix": contigPrefix,
+ "completeID": "",
+ "currentSeq": ""
+ }
+
+ # running totals, updated in place by writeNode()
+ counts = {"acceptedSize": 0,
+ "nSize": 0,
+ "contigsAccepted": 0,
+ "filteredSize": 0
+ }
+
+ for line in infile:
+ if ">NODE" in line:
+ # a new header closes the previous record, so flush it before starting the next
+ writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
+ node["completeID"] = line.strip()[1:]
+ node["currentSeq"] = ""
+ else:
+ # sequence lines are kept with their newlines; writeNode() discounts them when measuring length
+ node["currentSeq"] += line
+
+ # flush the final record, which has no following header
+ writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
+
+ infile.close()
+ outfile.close()
+
+ print "%d contigs accepted" % counts["contigsAccepted"]
+ print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"])
+ print "%d bp accepted" % counts["acceptedSize"]
+ print "%d bp accepted N" % counts["nSize"]
+ print "%d bp filtered\n" % counts["filteredSize"]
+
+
+def getFilterList(filterFileName=""):
+ filterList = []
+
+ if filterFileName:
+ try:
+ filterFile = open(filterFileName)
+ except IOError:
+ return filterList
+
+ for line in filterFile:
+ if "NODE" in line:
+ fields = line.strip().split()
+ try:
+ exclude = fields[9]
+ except IndexError:
+ continue
+
+ if exclude not in filterList:
+ filterList.append(exclude)
+
+ filterFile.close()
+
+ return filterList
+
+
+def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False):
+ """ Write one accumulated record to outfile if it passes the filters, updating counts in place.
+
+ node supplies "completeID", "currentSeq" and "contigPrefix". A record is
+ considered only when its ID is longer than 5 characters and is not in
+ filterList. The output name is contigPrefix plus the second
+ "_"-separated field of the ID, with the last field appended when
+ keepCoverage is set.
+ """
+ completeID = node["completeID"]
+ currentSeq = node["currentSeq"]
+ # currentSeq still contains line breaks, which do not count as bases
+ sequenceLength = len(currentSeq) - currentSeq.count("\n")
+ if len(completeID) > 5 and completeID not in filterList:
+ fields = completeID.split("_")
+ newID = fields[1]
+ if keepCoverage:
+ newID = fields[1] + "_" + fields[-1].strip()
+
+ if sequenceLength >= minSize:
+ outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq))
+ counts["acceptedSize"] += sequenceLength
+ counts["nSize"] += currentSeq.count("N")
+ counts["contigsAccepted"] += 1
+ # NOTE(review): indentation was lost in this patch; presumably this else pairs with the minSize test, so only too-short records count as filtered - confirm
+ else:
+ counts["filteredSize"] += sequenceLength
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+import optparse
+import string
+from numpy import zeros, int16
+
+versionString = "%s: version 0.95" % sys.argv[0]
+print versionString
+
+
+def compNT(nt):
+ """ returns the complementary basepair to base nt
+ """
+ compDict = { "A": "T",
+ "T": "A",
+ "G": "C",
+ "C": "G",
+ "S": "S",
+ "W": "W",
+ "R": "Y",
+ "Y": "R",
+ "M": "K",
+ "K": "M",
+ "H": "D",
+ "D": "H",
+ "B": "V",
+ "V": "B",
+ "N": "N",
+ "a": "t",
+ "t": "a",
+ "g": "c",
+ "c": "g",
+ "n": "n",
+ "z": "z"
+ }
+
+ return compDict.get(nt, "N")
+
+
+def complement(sequence, length=-1):
+ """ returns the complement of the sequence.
+ """
+ newSeq = ""
+
+ seqLength = len(sequence)
+
+ if length == seqLength or length < 0:
+ seqList = list(sequence)
+ seqList.reverse()
+ return "".join(map(compNT, seqList))
+
+ #TODO: this seems to want to deal with case where length is more than
+ # sequence length except that a negative index on a sequence is fine
+ # index will only be overrun if length is negative but that case is
+ # handled above
+ for index in range(seqLength - 1,seqLength - length - 1, -1):
+ try:
+ newSeq += compNT(sequence[index])
+ except:
+ newSeq += "N"
+
+ return newSeq
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--prefix", dest="pathPrefix")
+ parser.add_option("--overlap", type="int", dest="overlap")
+ parser.set_defaults(pathPrefix="RNAPATH", overlap=30)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 4:
+ print usage
+ sys.exit(0)
+
+ incontigfilename = args[0]
+ distalPairsfile = args[1]
+ outpathfilename = args[2]
+ outcontigfilename = args[3]
+
+ rnaPath(incontigfilename, distalPairsfile, outpathfilename,
+ outcontigfilename, options.pathPrefix, options.overlap)
+
+
+def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
+ outcontigfilename, pathPrefix="RNAPATH", overlap=30):
+ """ Stitch contigs into longer sequences along the paths found in the distal-pair graph.
+
+ Contigs are read from incontigfilename and paths are found with getPath().
+ For every path the contigs are joined in order, reverse-complementing
+ either side according to the most frequent pair orientation, and the
+ result is written to outcontigfilename as ">chr<pathPrefix><pathID>".
+ Contigs that are not part of any path are copied through unchanged.
+ A log of the decisions goes to outpathfilename and to stdout. overlap is
+ the number of leading bases of the next contig searched for the 20-base
+ end of the growing sequence.
+
+ NOTE(review): neither outpathfile nor outcontigfile is closed in this function.
+ """
+
+ outpathfile = open(outpathfilename, "w")
+
+ outheader = "#settings: %s" % " ".join(sys.argv)
+ print outheader
+ print >> outpathfile, outheader
+
+ contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
+ # the threshold from the input contigs is reused below so both N50 reports share one reference
+ halfSize = calculateN50(origSize)
+ print "building the adjacency graph"
+ pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)
+
+ print "found %d paths" % len(pathList)
+
+ newSizeList = []
+ pathID = 0
+ outcontigfile = open(outcontigfilename, "w")
+ for path in pathList:
+ pathID += 1
+ outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path)))
+ vertexNameList = []
+ for vertex in path:
+ vertexNameList.append(nameList[vertex])
+ pathDescription = string.join(vertexNameList, ",")
+
+ print >> outpathfile, pathDescription
+ currentVertex = path[0]
+ # currentSense records whether the most recently added contig was appended as-is ("+") or flipped ("-")
+ currentSense = "+"
+ assemblyList = currentVertex
+ sequence = contigDict[currentVertex]
+ for nextVertex in path[1:]:
+ # edgeSenseDict holds each pair under one ordering only; swap FR/RF when reading it reversed
+ if (currentVertex, nextVertex) in edgeSenseDict:
+ senseList = edgeSenseDict[currentVertex, nextVertex]
+ FR = senseList.count(("+", "-"))
+ RF = senseList.count(("-", "+"))
+ else:
+ senseList = edgeSenseDict[nextVertex, currentVertex]
+ # flip
+ FR = senseList.count(("-", "+"))
+ RF = senseList.count(("+", "-"))
+
+ FF = senseList.count(("+", "+"))
+ RR = senseList.count(("-", "-"))
+ if currentSense == "-":
+ # we had flipped the upstream piece! Must flip again
+ temp1 = FR
+ temp2 = FF
+ FR = RR
+ FF = RF
+ RR = temp1
+ RF = temp2
+
+ # the most frequent orientation wins; ties are resolved in the order FR, FF, RR, RF
+ if FR >= FF and FR >= RR and FR >= RF:
+ # we have FR - leave alone
+ sense1 = "+"
+ sense2 = "-"
+ assemblyList = ((assemblyList, "+"), (nextVertex, "+"))
+ seqleft = sequence[-20:]
+ seqright = contigDict[nextVertex][:overlap]
+ # if the tail of the left piece occurs near the start of the right piece, merge on that overlap; otherwise join with "NN"
+ if seqleft in seqright:
+ pos = seqright.index(seqleft)
+ offset = pos + 20
+ outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset)
+ print outstring
+ print >> outpathfile, outstring
+ sequence += contigDict[nextVertex][offset:]
+ else:
+ sequence += "NN" + contigDict[nextVertex]
+
+ currentSense = "+"
+ elif FF >= RR and FF >= RF:
+ # we have FF - flip seqright
+ sense1 = "+"
+ sense2 = "+"
+ assemblyList = ((assemblyList, "+"), (nextVertex, "-"))
+ seqleft = sequence[-20:]
+ seqright = complement(contigDict[nextVertex])[:overlap]
+ if seqleft in seqright:
+ pos = seqright.index(seqleft)
+ offset = pos + 20
+ outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+ print outstring
+ print >> outpathfile, outstring
+ sequence += complement(contigDict[nextVertex])[offset:]
+ else:
+ sequence += "NN" + complement(contigDict[nextVertex])
+
+ currentSense = "-"
+ elif RR >= RF:
+ # we have RR - flip seqleft
+ sense1 = "-"
+ sense2 = "-"
+ assemblyList = ((assemblyList, "-"), (nextVertex, "+"))
+ # NOTE(review): this takes the first 20 bases of the flipped sequence, whereas the RF branch takes the last 20 - confirm which is intended
+ seqleft = complement(sequence)[:20]
+ seqright = contigDict[nextVertex][:overlap]
+ if seqleft in seqright:
+ pos = seqright.index(seqleft)
+ offset = pos + 20
+ outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+ print outstring
+ print >> outpathfile, outstring
+ sequence = complement(sequence) + contigDict[nextVertex][offset:]
+ else:
+ sequence = complement(sequence) + "NN" + contigDict[nextVertex]
+
+ currentSense = "+"
+ else:
+ # we have RF - flip both
+ sense1 = "-"
+ sense2 = "+"
+ assemblyList = ((assemblyList, "-"), (nextVertex, "-"))
+ seqleft = complement(sequence)[-20:]
+ seqright = complement(contigDict[nextVertex])[:overlap]
+ if seqleft in seqright:
+ pos = seqright.index(seqleft)
+ offset = pos + 20
+ outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+ print outstring
+ print >> outpathfile, outstring
+ sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:]
+ else:
+ sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex])
+
+ currentSense = "-"
+
+ outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
+ print outstring
+ print >> outpathfile, outstring
+ currentVertex = nextVertex
+
+ outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
+ newSizeList.append(len(sequence))
+
+ # contigs that were not absorbed into any path are copied through unchanged
+ for vertex in contigDict:
+ if vertex in visitedDict:
+ continue
+
+ newSizeList.append(len(contigDict[vertex]))
+ outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
+
+ calculateN50(newSizeList, referenceMean=halfSize)
+
+
+def calculateN50(sizeList, referenceMean=None):
+ """ Print contig-count and N50 statistics for sizeList and return the threshold used.
+
+ sizeList is sorted in place into descending order, so the caller's list is
+ modified. When referenceMean is None it defaults to half of sum(sizeList).
+ The first size that pushes the running total past referenceMean is
+ reported as the N50. Returns referenceMean so a later call can reuse it.
+ """
+ if referenceMean is None:
+ totalSize = sum(sizeList)
+ # with the Python 2 print statements used here, "/" on ints is integer division
+ referenceMean = totalSize / 2
+
+ sizeList.sort()
+ sizeList.reverse()
+ currentTotalLength = 0
+ for size in sizeList:
+ if currentTotalLength + size > referenceMean:
+ print "#contigs", len(sizeList)
+ print "N50", size
+ break
+
+ currentTotalLength += size
+
+ # NOTE(review): indentation was lost in this patch; presumably this dump of the 50 largest sizes runs once after the loop - confirm
+ print sizeList[:50]
+
+ return referenceMean
+
+
+def getContigsFromFile(contigFileName):
+ nameList = []
+ origSize = []
+ contigNum = 0
+ currentChrom = ""
+ seq = ""
+ contigDict = {}
+
+ try:
+ incontigfile = open(contigFileName)
+ except IOError:
+ print "Error opening contig file: %s" % contigFileName
+ return contigNum, nameList, contigDict, origSize
+
+ for line in incontigfile:
+ if ">" in line:
+ if currentChrom !="":
+ nameList.append(currentChrom)
+ contigDict[contigNum] = seq
+ origSize.append(len(seq))
+ contigNum += 1
+
+ currentChrom = line.strip().split()[0][1:]
+ seq = ""
+ else:
+ seq += line.strip()
+
+ incontigfile.close()
+
+ return contigNum, nameList, contigDict, origSize
+
+
+def getPath(contigNum, distalPairsfile, nameList):
+ """ Build the contig adjacency matrix from the distal pairs file, prune it and extract paths.
+
+ contigNum sizes the square EdgeMatrix; distalPairsfile and nameList are
+ passed through to processDistalPairsFile(). Returns
+ (pathList, edgeSenseDict, visitedDict), where pathList and visitedDict
+ come from traverseGraph().
+ """
+ edgeMatrix = EdgeMatrix(contigNum)
+
+ # diagnostic output of the matrix dimensions; row 50 only exists when there are more than 50 rows
+ print len(edgeMatrix.edgeArray)
+ try:
+ print len(edgeMatrix.edgeArray[50])
+ except IndexError:
+ pass
+
+ print "processing distal pairs"
+ verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)
+
+ willVisitList = verticesWithEdges.keys()
+ willVisitList.sort()
+ print "visiting %d vertices" % len(willVisitList)
+
+ print "cleaning up graph of edges with weight 1"
+ verticesToDelete = []
+ for rindex in willVisitList:
+ # vertices missing from notSoloDict never had an edge counted more than once
+ if rindex not in notSoloDict:
+ # only the edge to the first recorded neighbor is cleared here
+ cindex = vertexEdges[rindex][0]
+ edgeMatrix.edgeArray[rindex][cindex] = 0
+ edgeMatrix.edgeArray[cindex][rindex] = 0
+ verticesToDelete.append(rindex)
+
+ # removal is deferred so willVisitList is not modified while it is being iterated
+ for vertex in verticesToDelete:
+ willVisitList.remove(vertex)
+
+ print "%d 1-edges zeroed out" % len(verticesToDelete)
+
+ zeroedEdge = 0
+ print "visiting %d vertices" % len(willVisitList)
+
+ leafList = []
+ print "picking top 2 edges per vertex - zero out others"
+ for rindex in willVisitList:
+ vertices = vertexEdges[rindex]
+ rEdges = []
+ # collect (weight, neighbor) for neighbors that survived the cleanup above
+ for avertex in vertices:
+ if avertex in willVisitList:
+ rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))
+
+ if len(rEdges) > 2:
+ # descending sort by weight; everything after the two heaviest edges is cleared
+ rEdges.sort()
+ rEdges.reverse()
+ zeroedEdge += len(rEdges[2:])
+ for (weight, cindex) in rEdges[2:]:
+ edgeMatrix.edgeArray[rindex][cindex] = 0
+ edgeMatrix.edgeArray[cindex][rindex] = 0
+ elif len(rEdges) == 1:
+ # a vertex with a single remaining edge of weight > 1 is used as a traversal start
+ if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
+ leafList.append(rindex)
+
+ print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge
+ pathList, visitedDict = traverseGraph(leafList, edgeMatrix)
+
+ return pathList, edgeSenseDict, visitedDict
+
+
+def traverseGraph(leafList, edgeMatrix):
+ """ Walk the graph from each leaf vertex and collect the resulting paths.
+
+ leafList is sorted in place. Each leaf not already recorded in visitedDict
+ is handed to edgeMatrix.visitLink(), which zeroes the edges it follows.
+ Returns (pathList, visitedDict); visitedDict is keyed by the vertices of
+ paths longer than one vertex (its values are unused empty strings).
+ """
+ pathList = []
+ visitedDict = {}
+ leafList.sort()
+ print "traveling through the graph"
+ for rindex in leafList:
+ # leaves already covered by an earlier path are skipped
+ if visitedDict.has_key(rindex):
+ pass
+ else:
+ path = edgeMatrix.visitLink(rindex)
+ if len(path) > 1:
+ for vertex in path:
+ visitedDict[vertex] = ""
+
+ # NOTE(review): indentation was lost in this patch; presumably only paths with more than one vertex are printed and kept - confirm
+ print path
+ pathList.append(path)
+
+ return pathList, visitedDict
+
+
+def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
+ """ Read the distal pairs file and accumulate edge counts and orientations between contigs.
+
+ Lines starting with "#" are skipped. Each remaining line is split on
+ whitespace: fields 1 and 4 (prefixed with "chr") name the two contigs and
+ fields 3 and 6 give their senses. Contig names are translated to row
+ indices through nameList; lines naming an unknown contig are reported
+ and skipped. edgeMatrix.edgeArray is incremented symmetrically.
+
+ Returns (verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict):
+ verticesWithEdges - dict keyed by every contig index that has an edge
+ vertexEdges - contig index -> list of distinct neighbor indices
+ notSoloDict - dict keyed by contig indices on an edge counted more than once
+ edgeSenseDict - (contigA, contigB) -> list of (senseA, senseB) tuples,
+ stored under whichever ordering of the pair was seen first
+ """
+ # cache of contig name -> row index so nameList.index() runs once per name
+ contigToRowLookup = {}
+ verticesWithEdges = {}
+ vertexEdges = {}
+ notSoloDict = {}
+ edgeSenseDict = {}
+
+ distalPairs = open(distalPairsfilename)
+ for line in distalPairs:
+ if line[0] == "#":
+ continue
+
+ fields = line.strip().split()
+ contA = "chr%s" % fields[1]
+ try:
+ contig1 = contigToRowLookup[contA]
+ except KeyError:
+ try:
+ contig1 = nameList.index(contA)
+ contigToRowLookup[contA] = contig1
+ except ValueError:
+ print "problem with end1: ", line
+ continue
+
+ sense1 = fields[3]
+
+ contB = "chr%s" % fields[4]
+ try:
+ contig2 = contigToRowLookup[contB]
+ except KeyError:
+ try:
+ contig2 = nameList.index(contB)
+ contigToRowLookup[contB] = contig2
+ except ValueError:
+ print "problem with end2: ", line
+ continue
+
+ sense2 = fields[6]
+
+ edgeMatrix.edgeArray[contig1][contig2] += 1
+ edgeMatrix.edgeArray[contig2][contig1] += 1
+ verticesWithEdges[contig1] = ""
+ verticesWithEdges[contig2] = ""
+ # keep one list per unordered pair; senses are swapped when the pair is stored in the reverse order
+ if (contig1, contig2) in edgeSenseDict:
+ edgeSenseDict[contig1, contig2].append((sense1, sense2))
+ elif (contig2, contig1) in edgeSenseDict:
+ edgeSenseDict[contig2, contig1].append((sense2, sense1))
+ else:
+ edgeSenseDict[contig1, contig2] = [(sense1, sense2)]
+
+ if contig1 in vertexEdges:
+ if contig2 not in vertexEdges[contig1]:
+ vertexEdges[contig1].append(contig2)
+ else:
+ vertexEdges[contig1] = [contig2]
+
+ if contig2 in vertexEdges:
+ if contig1 not in vertexEdges[contig2]:
+ vertexEdges[contig2].append(contig1)
+ else:
+ vertexEdges[contig2] = [contig1]
+
+ # an edge seen at least twice marks both of its ends as "not solo"
+ if edgeMatrix.edgeArray[contig1][contig2] > 1:
+ notSoloDict[contig1] = ""
+ notSoloDict[contig2] = ""
+
+ distalPairs.close()
+
+ return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
+
+
+class EdgeMatrix:
+ """ Square matrix of edge weights between vertices.
+
+ NOTE(review): this was described as a sparse matrix, but __init__
+ allocates a full dimension x dimension array.
+ """
+
+ def __init__(self, dimension):
+ """ Store dimension and allocate a zero-filled (dimension, dimension) int16 array as edgeArray.
+ """
+ self.dimension = dimension
+ self.edgeArray = zeros((self.dimension, self.dimension), int16)
+
+
+ def visitLink(self, fromVertex, ignoreList=[]):
+ """ Follow edges of weight > 1 starting at fromVertex and return the path as a list of vertices.
+
+ Vertices in ignoreList are not considered as the next step. Edges are
+ zeroed as they are followed, so the traversal consumes the matrix.
+ Returns an empty list when fromVertex has no eligible neighbor.
+ """
+ # ignoreList is only read here, so the shared default list is never mutated
+ returnPath = [fromVertex]
+ toVertex = []
+ for toindex in xrange(self.dimension):
+ if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
+ toVertex.append(toindex)
+
+ # both branches below return, so only the first eligible neighbor is ever followed
+ for vertex in toVertex:
+ # vertex has no edge other than the one back to fromVertex: the path ends here
+ if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
+ self.edgeArray[fromVertex][vertex] = 0
+ self.edgeArray[vertex][fromVertex] = 0
+ return returnPath + [vertex]
+ else:
+ # only the fromVertex -> vertex entry is cleared; the recursive call ignores fromVertex via ignoreList
+ self.edgeArray[fromVertex][vertex] = 0
+ try:
+ # NOTE(review): if the recursive call returns [], vertex itself is missing from the result - confirm this is intended
+ return returnPath + self.visitLink(vertex, returnPath)
+ except IOError:
+ return returnPath + [vertex]
+ return []
+
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+import sys
+import optparse
+
+print "%prog: version 1.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--prefix", dest="contigPrefix")
+ parser.add_option("--filter", dest="filterFileName")
+ parser.add_option("--min", type="int", dest="minSize")
+ parser.add_option("--keepcov", action="store_true", dest="keepCoverage")
+ parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ sys.exit(2)
+
+ infile = args[0]
+ outfile = args[1]
+
+ processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage)
+
+
+def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False):
+ """ Rewrite the ">NODE" records of inFileName into outFileName and print summary counts.
+
+ Each record is handed to writeNode(), which applies the filter list
+ (from getFilterList(filterFileName)), the minSize cutoff and the
+ contigPrefix / keepCoverage renaming. The totals accumulated in counts
+ are printed at the end.
+ """
+ infile = open(inFileName)
+ outfile = open(outFileName, "w")
+ filterList = getFilterList(filterFileName)
+
+ # the record currently being accumulated; completeID stays empty until the first header is read
+ node = {"contigPrefix": contigPrefix,
+ "completeID": "",
+ "currentSeq": ""
+ }
+
+ # running totals, updated in place by writeNode()
+ counts = {"acceptedSize": 0,
+ "nSize": 0,
+ "contigsAccepted": 0,
+ "filteredSize": 0
+ }
+
+ for line in infile:
+ if ">NODE" in line:
+ # a new header closes the previous record, so flush it before starting the next
+ writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
+ node["completeID"] = line.strip()[1:]
+ node["currentSeq"] = ""
+ else:
+ # sequence lines are kept with their newlines; writeNode() discounts them when measuring length
+ node["currentSeq"] += line
+
+ # flush the final record, which has no following header
+ writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
+
+ infile.close()
+ outfile.close()
+
+ print "%d contigs accepted" % counts["contigsAccepted"]
+ print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"])
+ print "%d bp accepted" % counts["acceptedSize"]
+ print "%d bp accepted N" % counts["nSize"]
+ print "%d bp filtered\n" % counts["filteredSize"]
+
+
+def getFilterList(filterFileName=""):
+ filterList = []
+
+ if filterFileName:
+ try:
+ filterFile = open(filterFileName)
+ except IOError:
+ return filterList
+
+ for line in filterFile:
+ if "NODE" in line:
+ fields = line.strip().split()
+ try:
+ exclude = fields[9]
+ except IndexError:
+ continue
+
+ if exclude not in filterList:
+ filterList.append(exclude)
+
+ filterFile.close()
+
+ return filterList
+
+
+def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False):
+ """ Write one accumulated record to outfile if it passes the filters, updating counts in place.
+
+ node supplies "completeID", "currentSeq" and "contigPrefix". A record is
+ considered only when its ID is longer than 5 characters and is not in
+ filterList. The output name is contigPrefix plus the second
+ "_"-separated field of the ID, with the last field appended when
+ keepCoverage is set.
+ """
+ completeID = node["completeID"]
+ currentSeq = node["currentSeq"]
+ # currentSeq still contains line breaks, which do not count as bases
+ sequenceLength = len(currentSeq) - currentSeq.count("\n")
+ if len(completeID) > 5 and completeID not in filterList:
+ fields = completeID.split("_")
+ newID = fields[1]
+ if keepCoverage:
+ newID = fields[1] + "_" + fields[-1].strip()
+
+ if sequenceLength >= minSize:
+ outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq))
+ counts["acceptedSize"] += sequenceLength
+ counts["nSize"] += currentSeq.count("N")
+ counts["contigsAccepted"] += 1
+ # NOTE(review): indentation was lost in this patch; presumably this else pairs with the minSize test, so only too-short records count as filtered - confirm
+ else:
+ counts["filteredSize"] += sequenceLength
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+"""
+ usage: python scatterfields.py infilename xaxisLabel xField yaxisLabel yField outImageName [--xmin xMin] [--ymin yMin]
+ [--xmax xMax] [--ymax yMax] [--doLogF1] [--doLogF2] [--arcsinh] [--order polyOrder] [--base logBase]
+ [--markGenes geneFile] [--markfold times] [--noregression] [--large] [--markdiag] [--title text] [--verbose]
+
+ Do a scatter plot of 2 fields from an input file.
+ fields are counted from 0.
+ use [--order polyOrder] to specify polynomial fits > 1
+ Supports very rudimentary compound fields for X value
+ using python's lambda functions (omit the keyword lambda)
+"""
+
+import matplotlib
+matplotlib.use("Agg")
+
+from pylab import *
+import math, cmath
+import sys
+import optparse
+
+alphaVal = 0.5
+
+print "%prog: version 3.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = __doc__
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--xmin", type="float", dest="forcexmin")
+ parser.add_option("--ymin", type="float", dest="forceymin")
+ parser.add_option("--xmax", type="float", dest="forcexmax")
+ parser.add_option("--ymax", type="float", dest="forceymax")
+ parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
+ parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
+ parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
+ parser.add_option("--order", type="int", dest="fitOrder")
+ parser.add_option("--base", type="int", dest="base")
+ parser.add_option("--markGenes", dest="markFile")
+ parser.add_option("--markfold", type="float", dest="foldChange")
+ parser.add_option("--noregression", action="store_false", dest="doRegression")
+ parser.add_option("--large", action="store_true", dest="plotLarge")
+ parser.add_option("--markdiag", action="store_true", dest="markDiag")
+ parser.add_option("--title", type="int", dest="figtitle")
+ parser.add_option("--verbose", action="store_true", dest="verbose")
+ parser.set_defaults(forcexmin=0.0, forceymin=0.0, forcexmax=-1, forceymax=-1, doLogF1=False,
+ doLogF2=False, doArcsinh=False, fitOrder=1, base=10, markFile=None,
+ foldChange=None, doRegression=True, plotLarge=False, markDiag=False,
+ figtitle="", verbose=False)
+
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 6:
+ print usage
+ sys.exit(1)
+
+ infile = open(args[0])
+ xaxis = args[1]
+ xField = args[2]
+ yaxis = args[3]
+ yField = int(args[4])
+ outfilename = args[5]
+
+ scatterfields(infile, xaxis, xField, yaxis, yField, outfilename, options.forcexmin,
+ options.forceymin, options.forcexmax, options.forceymax, options.doLogF1,
+ options.doLogF2, options.doArcsinh, options.fitOrder, options.base,
+ options.markFile, options.foldChange, options.doRegression, options.plotLarge,
+ options.markDiag, options.figtitle, options.verbose)
+
+
+def scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, forcexmin=0.0, forceymin=0.0,
+ forcexmax=-1, forceymax=-1, doLogF1=False, doLogF2=False, doArcsinh=False, fitOrder=1,
+ base=10, markFile=None, foldChange=None, doRegression=True, plotLarge=False,
+ markDiag=False, figtitle="", verbose=False):
+ """ Scatter-plot two fields of a whitespace-delimited file and save the figure to outfilename.
+
+ infilename is passed to open(). xField is either a column index or the
+ body of a lambda over the list of fields; yField is a column index.
+ markFile, when not None, is iterated line by line and then closed, so it
+ must be an open file of gene names to highlight. foldChange, when not
+ None, highlights points whose y/x ratio (or its inverse) exceeds it.
+ Lines whose fields cannot be converted to numbers are skipped. A
+ polynomial of degree fitOrder is fitted and R**2 is printed.
+ """
+
+ infile = open(infilename)
+ compoundField = False
+ try:
+ xField = int(xField)
+ except:
+ try:
+ # NOTE: eval() executes arbitrary code taken from the xField argument; only use with trusted input
+ compoundOp = "lambda %s" % xField
+ operator = eval(compoundOp)
+ compoundField = True
+ print "compound field %s" % xField
+ except:
+ pass
+
+ # NOTE(review): indentation was lost in this patch; presumably this check sits inside the outer except so a plain integer xField does not exit - confirm
+ if not compoundField:
+ print "expression %s not supported" % xField
+ sys.exit(1)
+
+ markedGenes = []
+ marking = False
+ if markFile is not None:
+ for line in markFile:
+ # use the first token of the line; an empty line falls back to an empty name
+ try:
+ markedGenes.append(line.strip().split()[0].upper())
+ except:
+ markedGenes.append(line.strip().upper())
+
+ markFile.close()
+ marking = True
+
+ markFold = False
+ if foldChange is not None:
+ markFold = True
+
+ newscores = []
+ oldscores = []
+
+ markednewscores = []
+ markedoldscores = []
+
+ markedfoldnewscores = []
+ markedfoldoldscores = []
+
+ ymax = 0.
+ xmax = 0.
+ for line in infile:
+ fields = line.strip().split()
+ gene = fields[0]
+ # lines whose x or y value cannot be computed are skipped
+ try:
+ if compoundField:
+ score = operator(fields)
+ else:
+ score = float(fields[xField])
+
+ newscore = float(fields[yField])
+ except:
+ continue
+
+ foldMarkThisScore = False
+ if markFold:
+ # 0.03 stands in for zero so that the ratios below stay defined
+ tempscore = score
+ if tempscore == 0:
+ tempscore = 0.03
+
+ tempratio = newscore / tempscore
+ if tempratio == 0:
+ tempratio2 = tempscore / 0.03
+ else:
+ tempratio2 = 1. / tempratio
+
+ if tempratio > foldChange or tempratio2 > foldChange:
+ foldMarkThisScore = True
+
+ # arcsinh takes precedence over the log transform; a failed log falls back to the forced axis minimum
+ if doArcsinh:
+ score = abs(cmath.asinh(score))
+ elif doLogF1:
+ try:
+ score = math.log(score, base)
+ except:
+ score = forcexmin
+
+ if score > xmax:
+ xmax = score
+
+ if doArcsinh:
+ newscore = abs(cmath.asinh(newscore))
+ elif doLogF2:
+ try:
+ newscore = math.log(newscore, base)
+ except:
+ newscore = forceymin
+
+ if newscore > ymax:
+ ymax = newscore
+
+ oldscores.append(score)
+ newscores.append(newscore)
+ if foldMarkThisScore:
+ markedfoldoldscores.append(score)
+ markedfoldnewscores.append(newscore)
+ if marking and gene.upper() not in markedGenes:
+ print gene, score, newscore, "unmarked"
+
+ if gene.upper() in markedGenes:
+ print gene, score, newscore, "overfold"
+
+ if verbose:
+ print len(markedfoldoldscores), line.strip()
+
+ if gene.upper() in markedGenes:
+ if not foldMarkThisScore:
+ print gene, score, newscore
+
+ markedoldscores.append(score)
+ markednewscores.append(newscore)
+
+ # NOTE(review): presumably debug output of the last line processed, printed after the loop - confirm
+ print score, newscore
+ print fields
+
+ # all points first (grey when fold-marking), then the highlighted subsets on top
+ if plotLarge and markFold:
+ plot(oldscores, newscores, "^", markersize=10., color="0.75", alpha=alphaVal)
+ elif plotLarge:
+ plot(oldscores, newscores, "b^", markersize=10., alpha=alphaVal)
+ elif markFold:
+ plot(oldscores, newscores, ",", color="0.75", alpha=alphaVal)
+ else:
+ plot(oldscores, newscores, "b,", alpha=alphaVal)
+
+ if len(markedfoldoldscores) > 0:
+ if plotLarge:
+ plot(markedfoldoldscores, markedfoldnewscores, "b^", markersize=10., alpha=alphaVal)
+ else:
+ plot(markedfoldoldscores, markedfoldnewscores, "b,", alpha=alphaVal)
+
+ if len(markedoldscores) > 0:
+ if plotLarge:
+ plot(markedoldscores, markednewscores, "r^", color="red", markersize=10., alpha=alphaVal)
+ else:
+ plot(markedoldscores, markednewscores, ".", color="red", markersize=4., alpha=alphaVal)
+
+ fitvalues = polyfit(oldscores, newscores, fitOrder)
+ print fitvalues
+ print len(oldscores)
+
+ meanObserved = float(sum(newscores)) / len(newscores)
+ # only linear and quadratic fits are evaluated; any fit with more than two coefficients is treated as quadratic
+ if len(fitvalues) == 2:
+ predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores]
+ else:
+ predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores]
+
+ SSt = 0.
+ SSe = 0.
+
+ for index in range(len(newscores)):
+ SSt += (newscores[index] - meanObserved) ** 2
+ SSe += (newscores[index] - predicted[index]) ** 2
+
+ rSquared = 1. - SSe / SSt
+ print "R**2 = %f" % rSquared
+
+ # x values are sorted and the fit re-evaluated so the regression curve is drawn left to right
+ oldscores.sort()
+ if len(fitvalues) == 2:
+ predicted = [(fitvalues[0] * x + fitvalues[1]) for x in oldscores]
+ else:
+ predicted = [(fitvalues[0] * x**2 + fitvalues[1] * x + fitvalues[2]) for x in oldscores]
+
+ if doRegression:
+ plot(oldscores, predicted, "-k", linewidth=2)
+
+ if figtitle == "":
+ figtitle = "%s vs %s (R^2: %.2f)" % (yaxis, xaxis, rSquared)
+
+ title(figtitle)
+
+ if markDiag:
+ # the diagonal spans from the smaller forced minimum to the largest observed or forced maximum
+ min = forcexmin
+ if forceymin < min:
+ min = forceymin
+
+ max = xmax
+ if ymax > max:
+ max = ymax
+
+ if forcexmax > max:
+ max = forcexmax
+
+ if forceymax > max:
+ max = forceymax
+
+ plot([min,max], [min,max], "-g", linewidth=2)
+
+ print forcexmin, forceymin
+
+ if doLogF2:
+ ylabel("log%s(%s)" % (str(base), yaxis))
+ else:
+ ylabel(yaxis)
+
+ if doLogF1:
+ xlabel("log%s(%s)" % (str(base), xaxis))
+ else:
+ xlabel(xaxis)
+
+ if xmax > 0:
+ xlim(forcexmin - 0.05, xmax)
+
+ if ymax > 0:
+ ylim(forceymin - 0.05, ymax)
+
+ # forced maxima override the data-driven limits only when both are given
+ if forcexmax > 0 and forceymax > 0:
+ xlim(forcexmin - 0.05, forcexmax)
+ ylim(forceymin - 0.05, forceymax)
+
+ gca().get_xaxis().tick_bottom()
+ gca().get_yaxis().tick_left()
+
+ savefig(outfilename, dpi=100)
+
+
+if __name__ == "__main__":  # Entry-point guard: main() runs only when this file is executed as a script.
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# siteintersects.py
+# ENRAGE
+#
+
+import sys
+
+print "%s: version 2.0" % sys.argv[0]  # Module-level statement: prints the version banner as soon as the file is executed or imported.
+
+
def main(argv=None):
    """Command-line entry point for siteintersects.

    argv is shaped like sys.argv: program name, the two site files and the
    output file, optionally followed by "--reject rejectfile1 rejectfile2"
    and/or "--expanded".  sys.argv is used when argv is empty or None.

    Prints the usage line and exits with status 1 when fewer than three
    positional arguments are given, or when --reject is not followed by two
    file names.
    """
    if not argv:
        argv = sys.argv

    usage = "usage: python %s sitefile1 sitefile2 outfile [--reject rejectfile1 rejectfile2] [--expanded]" % argv[0]
    if len(argv) < 4:
        print(usage)
        sys.exit(1)

    sitefilename1 = argv[1]
    sitefilename2 = argv[2]
    outfilename = argv[3]

    # siteintersects() opens the reject files itself, so only the names are
    # collected here.
    reject1filename = None
    reject2filename = None
    doReject = "--reject" in argv
    if doReject:
        rejectIndex = argv.index("--reject")
        rejectNames = argv[rejectIndex + 1:rejectIndex + 3]
        if len(rejectNames) < 2:
            print(usage)
            sys.exit(1)

        (reject1filename, reject2filename) = rejectNames

    doExpanded = "--expanded" in argv

    siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename, reject2filename, doReject, doExpanded)
+
+
def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
    """Report the sites of file 2 that coincide with a site of file 1.

    The first line of each input file is always skipped as a header; lines
    starting with "#" and blank lines are ignored.  With doExpanded the
    chromosome, start and stop are read from columns 1-3 of each line,
    otherwise from a leading "chrN:start-stop" field.

    A file-2 site matches a file-1 site on the same chromosome when the
    distance between their midpoints is strictly less than half the length
    of the file-1 site.  For every file-2 site with at least one match a
    single line (for its first match) is written to outfilename.

    With doReject the file-1 lines that were never matched are written to
    reject1filename and the file-2 lines without any match to
    reject2filename.

    The line counts of both files and the number of matching pairs are
    printed to stdout.

    Raises ValueError when doReject is set but a reject file name is missing.
    """
    if doReject and (reject1filename is None or reject2filename is None):
        raise ValueError("doReject requires both reject1filename and reject2filename")

    siteDict = {}
    file1Dict = {}
    infile1count = 0
    infile = open(sitefilename1)
    try:
        infile.readline()
        for line in infile:
            site = _parseSiteLine(line, doExpanded)
            if site is None:
                continue

            infile1count += 1
            (chrom, start, stop, rest) = site
            siteDict.setdefault(chrom, []).append((start, stop, rest))
            if doReject:
                file1Dict[str(site)] = line
    finally:
        infile.close()

    print("file1: %d" % infile1count)

    infile2count = 0
    commonSites = 0
    unique2List = []
    infile = open(sitefilename2)
    try:
        outfile = open(outfilename, "w")
        try:
            infile.readline()
            for line in infile:
                site = _parseSiteLine(line, doExpanded)
                if site is None:
                    continue

                infile2count += 1
                (chrom, start, stop, rest) = site
                mid = _siteMidpoint(start, stop)
                twoNotCommon = True
                # A chromosome unknown to file 1 simply has no candidates.
                for (rstart, rstop, rline) in siteDict.get(chrom, []):
                    rsize = abs(rstart - rstop) // 2
                    if abs(mid - _siteMidpoint(rstart, rstop)) < rsize:
                        # Every matching pair is counted, but only the first
                        # match of a file-2 site is written out.
                        commonSites += 1
                        if twoNotCommon:
                            outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, str(rest)))
                            twoNotCommon = False

                        if doReject:
                            file1Dict.pop(str((chrom, rstart, rstop, rline)), None)

                if doReject and twoNotCommon:
                    unique2List.append(line)
        finally:
            outfile.close()
    finally:
        infile.close()

    print("file2: %d" % infile2count)

    if doReject:
        reject1file = open(reject1filename, "w")
        try:
            for key in file1Dict:
                reject1file.write(file1Dict[key])
        finally:
            reject1file.close()

        reject2file = open(reject2filename, "w")
        try:
            for line in unique2List:
                reject2file.write(line)
        finally:
            reject2file.close()

    print(commonSites)


def _parseSiteLine(line, doExpanded):
    """Return (chrom, start, stop, rest) for a site line, or None for comment and blank lines."""
    if line.startswith("#"):
        return None

    fields = line.strip().split()
    if not fields:
        return None

    if doExpanded:
        chrom = fields[1][3:]
        start = fields[2]
        stop = fields[3]
        rest = fields[4:]
    else:
        (chrom, pos) = fields[0].split(":")
        chrom = chrom[3:]
        (start, stop) = pos.split("-")
        rest = fields[1:]

    return (chrom, int(start), int(stop), rest)


def _siteMidpoint(start, stop):
    """Return the integer midpoint of a site, measured from its start coordinate."""
    return start + abs(stop - start) // 2
+
+
+if __name__ == "__main__":  # Entry-point guard: main() runs only when this file is executed as a script.
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# stallCategory.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+import optparse
+
# "%prog" is only expanded inside optparse usage/help text, so the banner is
# built from the script name directly.
print("%s: version 1.1" % sys.argv[0])
+
+
def main(argv=None):
    """Command-line entry point for stallCategory.

    argv is shaped like sys.argv (program name first); sys.argv is used when
    argv is empty or None.  The three positional arguments are the two
    stalled-percent files and the transcript file, in that order.  Options:
    --out FILE, --statout FILE and --expression LEVEL (float, default 0.9).

    Prints the usage message and exits with status 1 when fewer than three
    positional arguments are supplied.
    """
    if not argv:
        argv = sys.argv

    usage = "usage: python %prog stalledPercentFile1 stalledPercentFile2 transcriptFile [--out oufile] [--statout statoutfile] [--expression level]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--out", dest="outFileName")
    parser.add_option("--statout", dest="statOutFileName")
    parser.add_option("--expression", type="float", dest="expressionLevel")
    parser.set_defaults(outFileName=None, statOutFileName=None, expressionLevel=0.9)
    (options, args) = parser.parse_args(argv[1:])

    if len(args) < 3:
        # print_usage() expands %prog; printing the raw usage string would not.
        parser.print_usage()
        sys.exit(1)

    # parse_args() was given argv[1:], so the positionals start at index 0.
    (infile1, infile2, transcriptFile) = args[:3]

    stallCategory(infile1, infile2, transcriptFile, options.outFileName, options.statOutFileName, options.expressionLevel)
+
+
def stallCategory(inFile1Name, inFile2Name, transcriptFileName, outFileName=None, statOutFileName=None, expressionLevel=0.9):
    """Assign every gene a five-letter stall category and count the categories.

    The category code is built from five tests, in this order:
      * expression > expressionLevel         -> "E" else "N"
      * total of file 1 > 5.0                -> "Y" else "N"
      * total of file 2 > 5.0                -> "Y" else "N"
      * ratio of file 1 > 15                 -> "H" else "L"
      * ratio of file 2 > 15                 -> "H" else "L"

    Genes of file 1 that are missing from the transcript file or from file 2
    are skipped.  Per-gene rows go to outFileName (tab separated, with a
    header) or, when it is None, to stdout (space separated).  Per-category
    counts go to statOutFileName or, when it is None, to stdout.
    """
    dictOne = _readStalledPercentFile(inFile1Name)
    # Only the second file replaces a zero promoter amount by 0.1.
    # NOTE(review): the asymmetry is inherited from the original code -
    # confirm that it is intended.
    dictTwo = _readStalledPercentFile(inFile2Name, floorZeroPromAmount=True)
    expressionDict = _readTranscriptFile(transcriptFileName)

    categoryList = []
    categoryDict = {}
    for atype in ["HH", "HL", "LH", "LL"]:
        for expression in ["E", "N"]:
            for cat1 in ["Y", "N"]:
                for cat2 in ["Y", "N"]:
                    category = expression + cat1 + cat2 + atype
                    categoryList.append(category)
                    categoryDict[category] = []

    writeOut = outFileName is not None
    outfile = None
    if writeOut:
        outfile = open(outFileName, "w")

    try:
        if writeOut:
            outfile.write("gene\texpression\tratio1\tpromAmount1\ttotal1\trestRPKM1\tratio2\tpromAmount2\ttotal2\trestRPKM2\tcategory\n")

        for gene in dictOne:
            # Skip notices are printed only when the rows go to a file, so
            # that they never mix with rows printed on stdout.
            if gene not in expressionDict:
                if writeOut:
                    print("%s is not in expressionDict - skipping" % gene)

                continue

            if gene not in dictTwo:
                if writeOut:
                    print("%s is not in the second stalled percent file - skipping" % gene)

                continue

            expression = expressionDict[gene]
            (ratio1, promAmount1, total1, restRPKM1) = dictOne[gene]
            (ratio2, promAmount2, total2, restRPKM2) = dictTwo[gene]
            category = _stallCategoryCode(expression, expressionLevel, total1, total2, ratio1, ratio2)
            categoryDict[category].append(gene)

            rowValues = (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category)
            if writeOut:
                outfile.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n" % rowValues)
            else:
                print("%s %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %s" % rowValues)
    finally:
        if outfile is not None:
            outfile.close()

    if statOutFileName is not None:
        statoutfile = open(statOutFileName, "w")
        try:
            statoutfile.write("ExpressionR1R2Stalled1Stalled2\tCount\n")
            for category in categoryList:
                statoutfile.write("%s\t%d\n" % (category, len(categoryDict[category])))
        finally:
            statoutfile.close()
    else:
        for category in categoryList:
            print("%s %d" % (category, len(categoryDict[category])))


def _readStalledPercentFile(fileName, floorZeroPromAmount=False):
    """Read a stalled-percent file into {gene: (ratio, promAmount, total, restRPKM)}.

    Lines containing "short" and blank lines are skipped.  Columns used:
    1 = gene, 2 = total, 3 = gene length, 4 + 5 = promoter amount, last = ratio.
    """
    geneDict = {}
    infile = open(fileName)
    try:
        for line in infile:
            if "short" in line:
                continue

            fields = line.strip().split()
            if not fields:
                continue

            promAmount = float(fields[4]) + float(fields[5])
            genelen = float(fields[3]) / 100
            if floorZeroPromAmount and promAmount == 0.:
                promAmount = 0.1

            # Totals are floored at 0.1.
            total = max(float(fields[2]), 0.1)
            restRPKM = (total * (1. - promAmount / 100.)) / (genelen - 0.6)
            ratio = float(fields[-1])
            geneDict[fields[1]] = (ratio, promAmount, total, restRPKM)
    finally:
        infile.close()

    return geneDict


def _readTranscriptFile(fileName):
    """Read "gene transcript transcriptsPerCell" lines into {gene: float(transcriptsPerCell)}."""
    expressionDict = {}
    infile = open(fileName)
    try:
        for line in infile:
            if not line.strip():
                continue

            (gene, transc, transcpercell) = line.strip().split()
            expressionDict[gene] = float(transcpercell)
    finally:
        infile.close()

    return expressionDict


def _stallCategoryCode(expression, expressionLevel, total1, total2, ratio1, ratio2):
    """Return the five-letter category code for one gene."""
    tests = ((expression > expressionLevel, "E", "N"),
             (total1 > 5.0, "Y", "N"),
             (total2 > 5.0, "Y", "N"),
             (ratio1 > 15, "H", "L"),
             (ratio2 > 15, "H", "L"))
    code = ""
    for (passed, yesLetter, noLetter) in tests:
        if passed:
            code += yesLetter
        else:
            code += noLetter

    return code
+
+
+if __name__ == "__main__":  # Entry-point guard: main() runs only when this file is executed as a script.
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 26, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import analyzego
+
+
+class TestAnalyzeGO(unittest.TestCase):
+ genome = "celegans"
+ prefix = "testGO"
+ inFileName = "testAnayzeGOInput.txt"
+
+ def setUp(self):
+ infile = open(self.inFileName, "w")
+ infile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.inFileName)
+ except OSError:
+ pass
+
+ try:
+ os.remove("%s.gostat" % self.prefix)
+ except OSError:
+ pass
+
+ try:
+ os.remove("%s.gozscore" % self.prefix)
+ except OSError:
+ pass
+
+ try:
+ os.remove("%s.gosig" % self.prefix)
+ except OSError:
+ pass
+
+
+ #TODO: write more tests
+ def testAnalyzeGO(self):
+ geneInfoList = []
+ analyzego.analyzeGO(self.genome, geneInfoList, self.prefix)
+ self.assertRaises(IOError, open, "%s.gostat" % self.prefix, "r")
+ self.assertRaises(IOError, open, "%s.gozscore" % self.prefix, "r")
+ self.assertRaises(IOError, open, "%s.gosig" % self.prefix, "r")
+
+ geneInfoList = ["worm\tgeneID"]
+ analyzego.analyzeGO(self.genome, geneInfoList, self.prefix)
+ statfile = open("%s.gostat" % self.prefix, "r")
+ stats = statfile.readlines()
+ print len(stats)
+
+ statfile.close()
+ scorefile = open("%s.gozscore" % self.prefix, "r")
+ scores = scorefile.readlines()
+ print len(scores)
+
+ scorefile.close()
+ sigfile = open("%s.gosig" % self.prefix, "r")
+ sigs = sigfile.readlines()
+ print len(sigs)
+
+ sigfile.close()
+
+
+ def testMain(self):
+ argv = ["analyzego", self.genome, self.inFileName, self.prefix]
+ analyzego.main(argv)
+
+
def suite():
    """Return a TestSuite holding every test method of TestAnalyzeGO."""
    return unittest.TestSuite([unittest.makeSuite(TestAnalyzeGO)])
+
+
+if __name__ == "__main__":  # Entry-point guard: the tests run only when this file is executed as a script.
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+# header line
+foo foo chr1 691
+foo2 foo2 chr1 81752
--- /dev/null
+'''
+Created on Aug 25, 2010
+
+@author: sau
+'''
+import unittest
+import string
+import os
+from Erange import chksnp
+
+dbPath = "/Users/sau/work/snpdb/hg18"  # Directory of the SNP databases used by TestChksnp. NOTE(review): absolute, machine-specific path - the tests presumably only pass where it exists.
+
class TestChksnp(unittest.TestCase):
    """ First entries from snpDB using select func, name, start, stop from snp where chrom="1" limit 4;
        unknown|rs10218492|690|691
        unknown|rs10218493|766|767
        unknown|rs10218527|789|790
        unknown|rs28853987|800|801

        Entry from altSnpDB not in snpDB
        unknown|rs17160650|81751|81752

        The file-based tests share two scratch files in the working directory
        (inputFileName and outputFileName).  tearDown removes both, so nothing
        is left behind even when an assertion fails part-way through a test.
    """

    snpDB = "%s/dbSNP128.db" % dbPath
    altSnpDB = "%s/snp129cDNA.db" % dbPath

    # Scratch files used by testChkSNPFile and testMain.
    inputFileName = "testChkSNP_input.txt"
    outputFileName = "testChkSNP_output.txt"

    def setUp(self):
        pass


    def tearDown(self):
        for fileName in (self.inputFileName, self.outputFileName):
            try:
                os.remove(fileName)
            except OSError:
                pass


    def _writeInputFile(self):
        """Write the header line plus the two SNP entries used by the file-based tests."""
        infile = open(self.inputFileName, "w")
        try:
            infile.write("# header line\n")
            infile.write("%s\n" % "\t".join(["foo", "foo", "chr1", "691"]))
            infile.write("%s\n" % "\t".join(["foo2", "foo2", "chr1", "81752"]))
        finally:
            infile.close()


    def _readOutputLines(self, count):
        """Return the first count lines of the output file."""
        outfile = open(self.outputFileName, "r")
        try:
            return [outfile.readline() for index in range(count)]
        finally:
            outfile.close()


    def testChkSNPFile(self):
        self._writeInputFile()

        # Primary database only: the second SNP is unknown to it.
        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName)
        result = ["foo\tfoo\tchr1\t691\trs10218492\tunknown\n",
                  "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A\n"]
        self.assertEquals(result, self._readOutputLines(2))
        os.remove(self.outputFileName)

        # With the alternate database the second SNP is annotated as well.
        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName, snpDBList=[self.altSnpDB])
        result = ["foo\tfoo\tchr1\t691\trs10218492\tunknown\n",
                  "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown\n"]
        self.assertEquals(result, self._readOutputLines(2))


    def testMain(self):
        self._writeInputFile()

        argv = ["chksnp", self.snpDB, self.inputFileName, self.outputFileName]
        chksnp.main(argv)
        result = ["foo\tfoo\tchr1\t691\trs10218492\tunknown\n",
                  "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A\n"]
        self.assertEquals(result, self._readOutputLines(2))


    def testChkSNP(self):
        snpPropertiesList = []
        dbList = [self.snpDB]
        self.assertEquals({}, chksnp.chkSNP(dbList, snpPropertiesList))

        snpPropertiesList = ["# header line"]
        snpPropertiesList.append("\t".join(["foo", "foo", "chr1", "691"]))
        snpPropertiesList.append("\t".join(["foo2", "foo2", "chr1", "81752"]))
        dbList = [self.snpDB, self.altSnpDB]
        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"}
        self.assertEquals(result, chksnp.chkSNP(dbList, snpPropertiesList))


    def testGetSNPLocationInfo(self):
        snpPropertiesList = ["\t".join(["foo", "foo", "chr1", "20"])]
        snpLocationList, snpDict = chksnp.getSNPLocationInfo(snpPropertiesList)
        self.assertEquals([("1", 20)], snpLocationList)
        self.assertEquals({("1", 20): "foo\tfoo\tchr1\t20"}, snpDict)

        # A leading header line must not change the result.
        snpPropertiesList = ["# header line"]
        snpPropertiesList.append("\t".join(["foo", "foo", "chr1", "20"]))
        snpLocationList, snpDict = chksnp.getSNPLocationInfo(snpPropertiesList)
        self.assertEquals([("1", 20)], snpLocationList)
        self.assertEquals({("1", 20): "foo\tfoo\tchr1\t20"}, snpDict)


    def testDoNotProcessLine(self):
        self.assertTrue(chksnp.doNotProcessLine("#anything"))
        self.assertFalse(chksnp.doNotProcessLine("line to process"))


    def testAnnotateSNPFromDB(self):
        snpLocationList = [("1", 691), ("1", 81752)]
        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A"}
        self.assertEquals(result, chksnp.annotateSNPFromDB(snpLocationList, snpDict, self.snpDB))

        snpLocationList = [("1", 691), ("1", 81752)]
        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
        result = {("1", 691): "foo\tfoo\tchr1\t691\tN\\A\tN\\A",
                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"}
        self.assertEquals(result, chksnp.annotateSNPFromDB(snpLocationList, snpDict, self.altSnpDB))


    def testAnnotateSNPFromDBList(self):
        snpLocationList = []
        snpDict = {}
        dbList = [self.snpDB]
        self.assertEquals({}, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))

        snpLocationList = [("1", 21)]
        snpDict = {("1", 21): "foo\tfoo\tchr1\t21"}
        dbList = [self.snpDB]
        result = {("1", 21): "foo\tfoo\tchr1\t21\tN\\A\tN\\A"}
        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))

        snpLocationList = [("1", 21)]
        snpDict = {("1", 21): "foo\tfoo\tchr1\t21"}
        dbList = [self.snpDB]
        result = {("1", 21): "foo\tfoo\tchr1\t21\tN\\A\tN\\A"}
        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=10000))

        snpLocationList = [("1", 691)]
        snpDict = {("1", 691): "foo\tfoo\tchr1\t691"}
        dbList = [self.snpDB]
        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown"}
        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))

        snpLocationList = [("1", 691), ("1", 81752)]
        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
        dbList = [self.snpDB]
        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A"}
        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))

        snpLocationList = [("1", 691), ("1", 81752)]
        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
        dbList = [self.snpDB, self.altSnpDB]
        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"}
        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))
+
+
def suite():
    """Return a TestSuite holding every test method of TestChksnp."""
    return unittest.TestSuite([unittest.makeSuite(TestChksnp)])
+
+
+if __name__ == "__main__":  # Entry-point guard: the tests run only when this file is executed as a script.
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 30, 2010
+
+@author: sau
+'''
+import unittest
+import os
+import string
+from array import array
+from Erange import commoncode
+from cistematic.genomes import Genome
+
+
+class TestCommoncode(unittest.TestCase):  # Unit tests for functions of Erange.commoncode.
+ logFile = "testLogFile"  # Scratch log file written by testWriteLog; tearDown removes it.
+ celegansChroms = ["I", "II", "III", "IV", "V", "X", "MtDNA"]  # Chromosome names accepted as keys of the feature dictionary in testGetFeaturesByChromDict.
+ genome = Genome("celegans")  # Created once, when the class body is executed, and shared by all tests.
+
+ def setUp(self):  # No per-test fixture is prepared.
+ pass
+
+
+ def tearDown(self):  # Remove the scratch log file after every test.
+ try:
+ os.remove(self.logFile)
+ except OSError:  # Raised when the file is absent (most tests never create it); ignored.
+ pass
+
+
+ def testGetReverseComplement(self):  # Single-base complements, plus KeyError for unsupported input.
+ self.assertEquals("T", commoncode.getReverseComplement("A"))
+ self.assertEquals("A", commoncode.getReverseComplement("T"))
+ self.assertEquals("C", commoncode.getReverseComplement("G"))
+ self.assertEquals("G", commoncode.getReverseComplement("C"))
+ self.assertEquals("N", commoncode.getReverseComplement("N"))  # "N" maps to itself.
+ self.assertRaises(KeyError, commoncode.getReverseComplement, "")  # The empty string is rejected.
+ self.assertRaises(KeyError, commoncode.getReverseComplement, "B")  # So is a letter outside A/C/G/T/N.
+
+
+ def testCountDuplicatesInList(self):  # Expects a list of (value, occurrence count) pairs.
+ testList = []
+ self.assertEquals([], commoncode.countDuplicatesInList(testList))
+
+ testList = [0, 1]
+ result = [(0, 1), (1, 1)]
+ self.assertEquals(result, commoncode.countDuplicatesInList(testList))
+
+ testList = [0, 1, 1]  # Adjacent duplicates.
+ result = [(0, 1), (1, 2)]
+ self.assertEquals(result, commoncode.countDuplicatesInList(testList))
+
+ testList = [0, 1, 2, 1]  # Non-adjacent duplicates are counted together.
+ result = [(0, 1), (1, 2), (2, 1)]
+ self.assertEquals(result, commoncode.countDuplicatesInList(testList))
+
+
+ def testWriteLog(self):  # Each writeLog call must add one line carrying "[messenger]" and the message.
+ messenger = "testMessenger"
+ message = "testMessage"
+
+ commoncode.writeLog(self.logFile, messenger, message)
+ file = open(self.logFile)  # NOTE(review): none of the handles opened in this test is closed explicitly.
+ line = file.readline()
+ fields = line.split()  # fields[0] and fields[1] are never checked - presumably a timestamp; confirm against writeLog.
+ self.assertEquals(fields[2], "[%s]" % messenger)
+ self.assertEquals(fields[3], message)
+ line = file.readline()
+ self.assertEquals("", line)  # Exactly one line was written.
+
+ messenger2 = "testMessenger2"
+ message2 = "testMessage2"
+
+ commoncode.writeLog(self.logFile, messenger2, message2)  # A second call must append, not overwrite.
+ file = open(self.logFile)
+ line = file.readline()
+ fields = line.split()
+ self.assertEquals(fields[2], "[%s]" % messenger)
+ self.assertEquals(fields[3], message)
+ line = file.readline()
+ fields = line.split()
+ self.assertEquals(fields[2], "[%s]" % messenger2)
+ self.assertEquals(fields[3], message2)
+ line = file.readline()
+ self.assertEquals("", line)
+
+ os.remove(self.logFile)  # Start again from a missing file.
+
+ commoncode.writeLog(self.logFile, messenger, message)
+ file = open(self.logFile)
+ line = file.readline()
+ fields = line.split()
+ self.assertEquals(fields[2], "[%s]" % messenger)
+ self.assertEquals(fields[3], message)
+ line = file.readline()
+ self.assertEquals("", line)
+
+ os.remove(self.logFile)
+
+ commoncode.writeLog(self.logFile, "", message)  # An empty messenger is logged as "[]".
+ file = open(self.logFile)
+ line = file.readline()
+ fields = line.split()
+ self.assertEquals(fields[2], "[]")
+ self.assertEquals(fields[3], message)
+ line = file.readline()
+ self.assertEquals("", line)
+
+ os.remove(self.logFile)
+
+ commoncode.writeLog(self.logFile, "", "")  # An empty message leaves only three whitespace-separated fields.
+ file = open(self.logFile)
+ line = file.readline()
+ fields = line.split()
+ self.assertEquals(fields[2], "[]")
+ self.assertEquals(3, len(fields))
+ line = file.readline()
+ self.assertEquals("", line)
+
+
+ def testGetMergedRegions(self):  # File-based variant: one region line in, one (start, stop, length) tuple out.
+ testfile = open("regionTestFile", "w")
+ regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
+ testfile.write(regionEntry)  # Written without a trailing newline.
+ testfile.close()
+ result = {"1": [(10, 20, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegions("regionTestFile"))
+ os.remove("regionTestFile")  # Not reached when the assertion above fails.
+
+
+ def testGetMergedRegionsFromList(self):  # Exercises getMergedRegionsFromList option by option on small tab-separated region lists.
+ self.assertEquals({}, commoncode.getMergedRegionsFromList([]))
+
+ regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")  # Single region: pad and fullChrom options.
+ regionList = [regionEntry]
+ result = {"1": [(10, 20, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+ result = {"1": [(5, 25, 20)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5))
+ result = {"1": [(12, 18, 6)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2))
+ result = {"chr1": [(10, 20, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True))
+
+ regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")  # Compact "chrom:start-stop" notation.
+ regionList = [regionEntry]
+ result = {"1": [(10, 20, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2))
+
+ regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")  # Two overlapping regions: doMerge and keepLabel.
+ regionList = [regionEntry]
+ regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
+ regionList.append(regionEntry)
+ result = {"1": [(10, 40, 30)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+ result = {"1": [(10, 20, 10), (15, 40, 25)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False))
+ result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True))
+
+ regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")  # chromField=2: everything before the chromosome becomes the label.
+ regionList = [regionEntry]
+ regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
+ regionList.append(regionEntry)
+ result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2))
+
+ regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")  # Two distant regions: maxDist, minHits and returnTop.
+ regionList = [regionEntry]
+ regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
+ regionList.append(regionEntry)
+ result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+ result = {"1": [(10, 2040, 2030)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000))
+ result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5))
+ result = {"1": [(2030, 2040, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12))
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1))
+
+ regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")  # Score stored in column 5; wrong scoreField values are probed below.
+ regionList = [regionEntry]
+ regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
+ regionList.append(regionEntry)
+ result = {"2": [(15, 40, 25)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+ self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
+ self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
+ self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
+
+ regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")  # keepPeak: the two extra trailing columns are carried into the result tuples.
+ regionList = [regionEntry]
+ result = {"1": [(10, 20, 10)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+ result = {"1": [(10, 20, 10, 3, 40)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+ result = {"1": [("1", 10, 20, 10, 3, 40)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+ regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
+ regionList.append(regionEntry)
+ result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+ regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")  # Overlaps region "1": the merged entry carries label "3" and the peak columns of "1".
+ regionList.append(regionEntry)
+ result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+ regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")  # Does not overlap region "2" (15-40), yet the expected result merges them.
+ regionList.append(regionEntry)
+ result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+ result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)],
+ "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)]
+ }
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False))
+
+ regionList = ["# comment"]  # Header lines: "pvalue" / "readShift" announce extra trailing columns.
+ regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
+ regionList.append(regionEntry)
+ result = {"1": [(10, 20, 10, 3, 40)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+ regionList = ["# pvalue"]
+ regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
+ regionList.append(regionEntry)
+ result = {"1": [(10, 20, 10, 3, 40)]}
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+ regionList = ["# readShift"]
+ regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
+ regionList.append(regionEntry)
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+ regionList = ["# pvalue readShift"]
+ regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
+ regionList.append(regionEntry)
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+ self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+ #Test fails - the header line is required if there are fields after the peak which isn't so good
+ #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
+
+
+ def testRegionsOverlap(self):  # regionsOverlap(start, stop, rstart, rstop) on overlapping, disjoint, negative and reversed intervals.
+ self.assertTrue(commoncode.regionsOverlap(100, 200, 1, 300))  # First region inside the second.
+ self.assertTrue(commoncode.regionsOverlap(100, 200, 150, 300))  # Partial overlap.
+ self.assertTrue(commoncode.regionsOverlap(100, 500, 1, 300))
+ self.assertTrue(commoncode.regionsOverlap(100, 200, 110, 160))  # Second region inside the first.
+
+ self.assertFalse(commoncode.regionsOverlap(100, 200, 250, 300))  # Disjoint on either side.
+ self.assertFalse(commoncode.regionsOverlap(100, 200, 1, 60))
+
+ self.assertFalse(commoncode.regionsOverlap(-200, -100, 1, 300))  # Negative versus positive coordinates.
+ self.assertFalse(commoncode.regionsOverlap(100, 200, -300, -1))
+
+ self.assertTrue(commoncode.regionsOverlap(-200, -100, -300, -1))
+
+ self.assertTrue(commoncode.regionsOverlap(-100, -200, -300, -1))  # Reversed start/stop order is accepted for either region.
+ self.assertTrue(commoncode.regionsOverlap(-200, -100, -1, -300))
+ self.assertTrue(commoncode.regionsOverlap(-100, -200, -1, -300))
+
+
+ def testRegionsAreWithinDistance(self):  # The last argument is the maximum distance allowed between the two regions.
+ self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 40, 50, 30))
+ self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 1, 5, 5))
+ self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 25, 50, 10))
+ self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 1, 5, 5))  # Same case as two lines above.
+
+ self.assertFalse(commoncode.regionsAreWithinDistance(10, 20, 100, 150, 5))
+ self.assertFalse(commoncode.regionsAreWithinDistance(100, 200, 10, 15, 5))
+
+ self.assertTrue(commoncode.regionsAreWithinDistance(20, 10, 30, 150, 10))  # Regions given with start > stop.
+ self.assertFalse(commoncode.regionsAreWithinDistance(20, 10, 100, 150, 5))
+ self.assertFalse(commoncode.regionsAreWithinDistance(10, 20, 150, 100, 5))
+
+
+ #TODO: write test
+ def testFindPeak(self):  # Pins the exact tuples findPeak returns for an empty hit list and for a single "+" hit.
+ hitList = []
+ result = ([], 0.0, array("f"), 0.0)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
+
+ hitList= [[4, "+", 0.5]]  # One hit: position 4, sense "+", third element 0.5.
+ result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
+ result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True))  # With doWeight every expected value is half of the unweighted one.
+ result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto"))
+ result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))  # returnShift appends the shift that was used (6 here).
+
+ hitList= [[4, "+", 0.5]]
+ result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
+
+ hitList= [[4, "+", 0.5]]
+ result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))  # leftPlus adds one more element to the returned tuple.
+ result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
+ self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True))
+
+
+ #TODO: write test
+ def testGetBestShiftForRegion(self):  # Pins the shift chosen for four hits, with and without a maxShift limit.
+ hitList = [[14, "-", 1.0], [16, "-", 1.0], [24, "+", 1.0], [26, "+", 10.0]]
+ self.assertEquals(74, commoncode.getBestShiftForRegion(hitList, 0, 100))
+ self.assertEquals(16, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=30))
+ self.assertEquals(0, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=10))
+
+
+ #TODO: write test
+ def testGetFeaturesByChromDict(self):  # Checks getFeaturesByChromDict against known values for the shared "celegans" genome.
+ firstFeatures = {"I": (4123, 4219, "Y74C9A.3", "R", "3UTR"),
+ "II": (1866, 1910, "2L52.1", "F", "CDS"),
+ "III": (1270, 1506, "cTel54X.1", "R", "CDS"),
+ "IV": (694, 1064, "Y38C1AB.4", "F", "CDS"),
+ "V": (1479, 1578, "cTel3X.1", "F", "CDS"),
+ "X": (3622, 4099, "CE7X_3.1", "F", "CDS"),
+ "MtDNA": (112, 543, "MTCE.3", "F", "CDS")
+ }
+ featureDict = commoncode.getFeaturesByChromDict(self.genome)
+ for chrom in featureDict.keys():  # Every key must be a known chromosome whose first feature matches the table above.
+ self.assertTrue(chrom in self.celegansChroms)
+ self.assertEquals(firstFeatures[chrom], featureDict[chrom][0])
+
+ restrictList = ["almost certainly not a value feature"]  # A restriction that matches nothing yields an empty dictionary.
+ featureDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList)
+ self.assertEquals({}, featureDict)
+
+ restrictList = ["Y74C9A.3"]  # Restricting to one gene leaves only its chromosome.
+ featureDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList)
+ self.assertEquals(["I"], featureDict.keys())
+ featureDict, complementDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList, regionComplement=True)  # regionComplement=True makes the call return a pair.
+ result = {"I": [(0, 4123, "nonExon1", "F", "nonExon"),
+ (4219, 4220, "nonExon2", "F", "nonExon"),
+ (4357, 5194, "nonExon3", "F", "nonExon"),
+ (5295, 6036, "nonExon4", "F", "nonExon"),
+ (6326, 9726, "nonExon5", "F", "nonExon"),
+ (9845, 10094, "nonExon6", "F", "nonExon"),
+ (10147, 10148, "nonExon7", "F", "nonExon"),
+ (10231, 250000000, "nonExon8", "F", "nonExon")]
+ }
+ self.assertEquals(result, complementDict)
+
+ regionDict = {"I": [("new feature", 100, 150, 50)]}  # An additional region is expected back as a "custom" feature, first on its chromosome.
+ featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
+ result = (100, 150, "new feature", "+", "custom")
+ self.assertEquals(result, featureDict["I"][0])
+
+
+ def testGetLocusByChromDict(self):
+ firstLoci = {"I": (4123, 10231, "Y74C9A.3", 6108),
+ "II": (1866, 4662, "2L52.1", 2796),
+ "III": (1270, 2916, "cTel54X.1", 1646),
+ "IV": (694, 14925, "Y38C1AB.4", 14231),
+ "V": (1479, 3038, "cTel3X.1", 1559),
+ "X": (3622, 7153, "CE7X_3.1", 3531),
+ "MtDNA": (112, 548, "MTCE.3", 436)
+ }
+
+ self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, useCDS=False))
+ self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=1, downstream=1, useCDS=False))
+ self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=-1, downstream=-1, useCDS=False, lengthCDS=1))
+ self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstreamSpanTSS=True, lengthCDS=1))
+ self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, downstream=1, lengthCDS=1))
+ self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=1, lengthCDS=-1))
+
+ locusDict = commoncode.getLocusByChromDict(self.genome)
+ for chrom in locusDict.keys():
+ self.assertTrue(chrom in self.celegansChroms)
+ self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
+
+ regionDict = {"I": [("new region", 100, 150, 50)]}
+ locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
+ self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
+ locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
+ self.assertEquals((100, 150, "new region", 50, "+"), locusDict["I"][0])
+
+ # Long Test
+ #locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, useCDS=False, upstream=100)
+ #self.assertEquals((150, 250, "new region", 100), locusDict["I"][0])
+
+ # Long Test
+ #locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, useCDS=False, downstream=10)
+ #self.assertEquals((90, 100, "new region", 10), locusDict["I"][0])
+
+
+ def testComputeRegionBins(self):
+ regionsByChromDict = {}
+ hitDict = {}
+ bins = 4
+ readlen = 10
+ result = ({}, {})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ result = ({"regionID": [0.0, 0.0, 0.0, 0.0]}, {})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[1, "+", 1.0]]}
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+ "2": [("regionID2", 1, 1000, 1000, "F")]
+ }
+ hitDict = {"1": [[1, "+", 1.0]],
+ "2": [[1, "+", 1.0]]
+ }
+ result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5]]}
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [15, "+", 1.0]]}
+ result = ({"regionID": [2.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [200, "+", 2.0]]}
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[1, "+", 1.0]]}
+ regionList = ["regionID"]
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[1, "+", 1.0]]}
+ regionList = ["empty region"]
+ result = ({"empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+ "2": [("regionID2", 1, 1000, 1000, "F")]
+ }
+ hitDict = {"1": [[1, "+", 1.0]],
+ "2": [[1, "+", 1.0]]
+ }
+ regionList = ["regionID", "regionID2"]
+ result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+ "2": [("regionID2", 1, 1000, 1000, "F")]
+ }
+ hitDict = {"1": [[1, "+", 1.0]],
+ "2": [[1, "+", 1.0]]
+ }
+ regionList = ["empty region", "regionID2"]
+ result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+ "2": [("regionID2", 1, 1000, 1000, "F")]
+ }
+ hitDict = {"1": [[1, "+", 1.0]],
+ "2": [[1, "+", 1.0]]
+ }
+ regionList = ["regionID2"]
+ result = ({"regionID2": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[1, "+", 1.0]]}
+ result = ({"regionID": [2.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, normalizedTag=2.0))
+
+ regionsByChromDict = {"1": [(1, 100, "regionID", 100, "F")]}
+ hitDict = {"1": [[1, "+", 1.0]]}
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, defaultRegionFormat=False))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0]]}
+ fixedFirstBin = 20
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0]]}
+ fixedFirstBin = 5
+ result = ({"regionID": [0.0, 1.0, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ fixedFirstBin = 20
+ result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[80, "+", 1.0], [85, "+", 0.5]]}
+ fixedFirstBin = 5
+ result = ({"regionID": [0.0, 1.5, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ binLength = 25
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ binLength = 50
+ result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ binLength = 15
+ result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+ hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+ binLength = 15
+ result = ({"regionID": [1.0, 0.0, 0.7, 0.5]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+ regionsByChromDict = {"1": [("regionID", 1, 100, 100, "R")]}
+ hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+ result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+ result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
+ fixedFirstBin = 10
+ result = ({"regionID": [0.0, 2.2, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+ fixedFirstBin = 20
+ result = ({"regionID": [0.5, 1.7, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+ binLength = 50
+ result = ({"regionID": [0.5, 1.7, 0.0, 0.0]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+ binLength = 10
+ result = ({"regionID": [0.0, 0.5, 0.0, 1.7]}, {"regionID": 100})
+ self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestCommoncode))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+ # Presumably an IDE template line: uncommenting it would hand unittest a
+ # single test name instead of running everything.
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Runs all unit tests for Erange.
+Functionality will eventually be incorporated into unittest in Python 2.7+
+Uses test suites until then
+
+Created on Sep 8, 2010
+
+@author: sau
+'''
+
+import sys
+import unittest
+import testAnalyzeGO
+import testChksnp
+import testCommoncode
+import testGeneMrnaCounts
+#import testGetFasta
+import testGetNovelSNPs
+import testGetSNPGeneInfo
+import testGetSNPs
+import testMakeBamFromRds
+import testmakebedfromrds
+#import testMakeGraphs
+import testMakeRdsFromBam
+import testMakeSNPTrack
+import testMarkLinkers
+import testPeaksToRegion
+import testProcessVelvet
+import testReadDataset
+import testRnaAToIFilter
+import testRnaEditing
+import testRNAPATH
+import testTranscripts
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ suite = unittest.TestSuite()
+ suite.addTest(testAnalyzeGO.suite())
+ suite.addTest(testChksnp.suite())
+ suite.addTest(testCommoncode.suite())
+ suite.addTest(testGeneMrnaCounts.suite())
+ #suite.addTest(testGetFasta.suite())
+ suite.addTest(testGetNovelSNPs.suite())
+ suite.addTest(testGetSNPGeneInfo.suite())
+ suite.addTest(testGetSNPs.suite())
+ suite.addTest(testMakeBamFromRds.suite())
+ suite.addTest(testmakebedfromrds.suite())
+ #suite.addTest(testMakeGraphs.suite())
+ suite.addTest(testMakeRdsFromBam.suite())
+ suite.addTest(testMakeSNPTrack.suite())
+ suite.addTest(testMarkLinkers.suite())
+ suite.addTest(testPeaksToRegion.suite())
+ suite.addTest(testProcessVelvet.suite())
+ suite.addTest(testReadDataset.suite())
+ suite.addTest(testRnaAToIFilter.suite())
+ suite.addTest(testRnaEditing.suite())
+ suite.addTest(testRNAPATH.suite())
+ #suite.addTest(testTranscripts.suite())
+
+ unittest.TextTestRunner(verbosity=2).run(suite)
+
+# Run the combined suite when this file is executed directly as a script.
+if __name__ == '__main__':
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 19, 2010
+
+@author: sau
+
+Located feature 728439 by:
+ from Erange.commoncode import getFeaturesByChromDict
+ genome = Genome(self.genomeName)
+ featuresByChromDict = getFeaturesByChromDict(genome)
+ print featuresByChromDict["1"][:3]
+
+'''
+import unittest
+import os
+from Erange import geneMrnaCounts
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+from Erange.commoncode import readDataset
+
+
+class TestGeneMrnaCounts(unittest.TestCase):
+ idb = geneinfoDB(cache=True)
+ testDBName = "testRDS.rds"
+ genomeName = "hsapiens"
+ outfilename = "testGeneMrnaCounts.txt"
+
+ def setUp(self):
+ self.rds = readDataset(self.testDBName, initialize=True, datasetType="RNA", verbose=False)
+
+
+ def tearDown(self):
+ del(self.rds)
+ os.remove(self.testDBName)
+
+
+ def testGeneMrnaCounts(self):
+ geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename)
+ outfile = open(self.outfilename, "r")
+ for line in outfile:
+ fields = line.split("\t")
+ self.assertEquals("0\n", fields[2])
+
+ outfile.close()
+ os.remove(self.outfilename)
+
+ rdsEntryList = [("testRead", "chr1", 18700, 18800, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename)
+ possibleCounts = ["0\n", "1\n"]
+ outfile = open(self.outfilename, "r")
+ for line in outfile:
+ fields = line.split("\t")
+ self.assertTrue(fields[2] in possibleCounts)
+
+ outfile.close()
+ os.remove(self.outfilename)
+
+ geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename,
+ markGID=True, trackStrand=True)
+
+ possibleCounts = ["0\n", "1\n"]
+ outfile = open(self.outfilename, "r")
+ for line in outfile:
+ fields = line.split("\t")
+ self.assertTrue(fields[2] in possibleCounts)
+
+ outfile.close()
+ os.remove(self.outfilename)
+ reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+ self.assertEquals("728439", reads["1"][0]["flag"])
+
+ geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename,
+ countFeats=True, markGID=True, cachePages=150000)
+
+ possibleCounts = ["0\n", "1\n"]
+ outfile = open(self.outfilename, "r")
+ for line in outfile:
+ fields = line.split("\t")
+ self.assertTrue(fields[2] in possibleCounts)
+
+ outfile.close()
+ os.remove(self.outfilename)
+ reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+ self.assertEquals("728439", reads["1"][0]["flag"])
+
+
+ def testCountFeatures(self):
+ testDict = {}
+ self.assertEquals(0, geneMrnaCounts.countFeatures(testDict))
+
+ testDict = {"chr1": []}
+ self.assertEquals(0, geneMrnaCounts.countFeatures(testDict))
+
+ #TODO: This is likely not the result we want
+ testDict = {"chr1": "not a list"}
+ self.assertEquals(10, geneMrnaCounts.countFeatures(testDict))
+
+ testDict = {"chr1": 10}
+ self.assertEquals(0, geneMrnaCounts.countFeatures(testDict))
+
+ testDict = {"chr1": 10,
+ "chr2": ["f1"]}
+ self.assertEquals(1, geneMrnaCounts.countFeatures(testDict))
+
+ testDict = {"chr1": ["f1", "f2"]}
+ self.assertEquals(2, geneMrnaCounts.countFeatures(testDict))
+
+ testDict = {"chr1": ["f1", "f2"],
+ "chr2": []}
+ self.assertEquals(2, geneMrnaCounts.countFeatures(testDict))
+
+ testDict = {"chr1": ["f1", "f2"],
+ "chr2": ["f1"]}
+ self.assertEquals(3, geneMrnaCounts.countFeatures(testDict))
+
+
+ def testGetGeneSymbol(self):
+ # Case: Null/None inputs
+ gid = ""
+ searchGID = False
+ geneInfoDict = {}
+ idb = None
+ genomeName = ""
+ geneAnnotDict = {}
+ self.assertEquals("LOC", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+ # Case: symbol is in geneInfoDict
+ gid = "1"
+ searchGID = False
+ geneInfoDict = {"1": [["gene1", "wrong name"], ["wrong name 2"]]}
+ idb = None
+ genomeName = "test"
+ geneAnnotDict = {("test", "1"): ["wrong name 3"]}
+ self.assertEquals("gene1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+ # Case: symbol not in geneInfoDict, is in geneAnnotDict
+ gid = "1"
+ searchGID = False
+ geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]}
+ idb = None
+ genomeName = "test"
+ geneAnnotDict = {("test", "1"): ["gene1"]}
+ self.assertEquals("gene1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+ # Case: symbol not in geneInfoDict or geneAnnotDict - non-null/None inputs
+ gid = "1"
+ searchGID = False
+ geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]}
+ idb = None
+ genomeName = "test"
+ geneAnnotDict = {("test", "0"): ["wrong name 3"]}
+ self.assertEquals("LOC1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+ # Case: using search, gid not in idb
+ gid = "almostCertainlyNotInTheIDB"
+ searchGID = True
+ geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]}
+ idb = self.idb
+ genomeName = "human"
+ geneAnnotDict = {("human", "0"): ["wrong name 3"]}
+ self.assertEquals("LOCalmostCertainlyNotInTheIDB", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+ # Case: using search
+ # sql to get gid: select gID from gene_info where genome="human" and locustag !="-" and locustag != symbol limit 5;
+ gid = "RP11-177A2.3"
+ searchGID = True
+ geneInfoDict = {"27": [["correct"], ["wrong name 2"]]}
+ idb = self.idb
+ genomeName = "human"
+ geneAnnotDict = {("human", "0"): ["wrong name 3"]}
+ self.assertEquals("correct", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+
+ def testWriteOutputFile(self):
+ genome = Genome(self.genomeName)
+ gidList = ["RP11-177A2.3"]
+ gidCount = {"RP11-177A2.3": 1}
+ geneMrnaCounts.writeOutputFile(self.outfilename, genome, gidList, gidCount, searchGID=False)
+
+ outfile = open(self.outfilename, "r")
+ line = outfile.readline()
+ result = "RP11-177A2.3\tLOCRP11-177A2.3\t1\n"
+ self.assertEquals(result, line)
+ outfile.close()
+ os.remove(self.outfilename)
+
+ genome = Genome("hsapiens")
+ gidList = ["RP11-177A2.3"]
+ gidCount = {"something else": 1}
+ geneMrnaCounts.writeOutputFile(self.outfilename, genome, gidList, gidCount, searchGID=False)
+
+ outfile = open(self.outfilename, "r")
+ line = outfile.readline()
+ result = "RP11-177A2.3\tLOCRP11-177A2.3\t0\n"
+ self.assertEquals(result, line)
+ outfile.close()
+ os.remove(self.outfilename)
+
+ def testMain(self):
+ argv = ["geneMRNACounts", self.genomeName, self.testDBName, self.outfilename]
+ geneMrnaCounts.main(argv)
+ outfile = open(self.outfilename, "r")
+ for line in outfile:
+ fields = line.split("\t")
+ self.assertEquals("0\n", fields[2])
+
+ outfile.close()
+ os.remove(self.outfilename)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestGeneMrnaCounts))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+ # Presumably an IDE template line: uncommenting it would hand unittest a
+ # single test name instead of running everything.
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 27, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import getfasta
+#from Erange import ReadDataset
+from Erange.commoncode import readDataset
+
+testDBName = "testRDS.rds"
+
+
+class TestGetFasta(unittest.TestCase):
+
+
+ def setUp(self):
+ self.regionDict = {}
+ self.minHitThresh = -1
+ self.maxsize = 3000
+ self.outfilename = "testFileForTestGetFasta.fa"
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.outfilename)
+ except OSError:
+ print "fasta file does not exist"
+
+ try:
+ os.remove(testDBName)
+ except OSError:
+ print "RDS file does not exist"
+
+
+ def testGetDefaultRegion(self):
+ self.assertEquals({}, getfasta.getDefaultRegion(self.regionDict, self.maxsize))
+
+ regionDict = {"1": [],
+ "2": []
+ }
+ result = {"2": [],
+ "1": []
+ }
+ self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10)],
+ "2": []
+ }
+ result = {"2": [],
+ "1": [{"start": 10, "length": 10, "topPos": [-1]}]
+ }
+ self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10)],
+ "2": [(11, 21, 11)]
+ }
+ result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}],
+ "1": [{"start": 10, "length": 10, "topPos": [-1]}]
+ }
+ self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10), (100, 4000, 3900)],
+ "2": [(11, 21, 11)]
+ }
+ result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}],
+ "1": [{"start": 10, "length": 10, "topPos": [-1]}]
+ }
+ self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10), (100, 4000, 3900), (50, 60, 10)],
+ "2": [(11, 21, 11)]
+ }
+ result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}],
+ "1": [{"start": 10, "length": 10, "topPos": [-1]},
+ {"start": 50, "length": 10, "topPos": [-1]}]
+ }
+ self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+
+ def testGetRegionUsingPeaks(self):
+ self.assertEquals({}, getfasta.getRegionUsingPeaks(self.regionDict, self.minHitThresh, self.maxsize))
+
+ regionDict = {"1": [],
+ "2": []
+ }
+ result = {"2": [],
+ "1": []
+ }
+ self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10, 15, 1)],
+ "2": []
+ }
+ result = {"2": [],
+ "1": [{"start": 10, "length": 10, "topPos": [5]}]
+ }
+ self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+ result = {"2": [],
+ "1": []
+ }
+ self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, 3, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10, 15, 1)],
+ "2": [(11, 21, 11, 18, 1)]
+ }
+ result = {"2": [{"start": 11, "length": 11, "topPos": [7]}],
+ "1": [{"start": 10, "length": 10, "topPos": [5]}]
+ }
+ self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10, 15, 1), (100, 4000, 3900, 111, 1)],
+ "2": [(11, 21, 11, 18, 1)]
+ }
+ result = {"2": [{"start": 11, "length": 11, "topPos": [7]}],
+ "1": [{"start": 10, "length": 10, "topPos": [5]}]
+ }
+ self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+ regionDict = {"1": [(10, 20, 10, 15, 1), (100, 4000, 3900, 111, 1), (50, 60, 10, 59, 1)],
+ "2": [(11, 21, 11, 18, 1)]
+ }
+ result = {"2": [{"start": 11, "length": 11, "topPos": [7]}],
+ "1": [{"start": 10, "length": 10, "topPos": [5]},
+ {"start": 50, "length": 10, "topPos": [9]}]
+ }
+ self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+
+ #TODO: write test. This seems to not make sense. We are always returning a "topPos" of range(rlen).
+ # need to check to see if the issue might be with commoncode.findPeak as there is a lot of questionable
+ # logic in that one
+ def testGetRegionUsingRDS(self):
+ rds = readDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
+ rds.insertMetadata([("readsize", "100")])
+ rdsEntryList = [("testRead", "chr1", 10, 100, "+", 1.0, "", "")]
+ rds.insertUniqs(rdsEntryList)
+ self.assertEquals({}, getfasta.getRegionUsingRDS(self.regionDict, rds, self.minHitThresh, self.maxsize))
+
+ regionDict = {"1": [],
+ "2": []
+ }
+ result = {"2": [],
+ "1": []
+ }
+ self.assertEquals(result, getfasta.getRegionUsingRDS(regionDict, rds, self.minHitThresh, self.maxsize))
+
+ # Ack with a capital ACK.
+ regionDict = {"1": [(1, 600, 5)],
+ "2": []
+ }
+ result = {"1": [{"start": 1, "length": 5, "topPos": [0, 1, 2, 3, 4]}],
+ "2": []
+ }
+ self.assertEquals(result, getfasta.getRegionUsingRDS(regionDict, rds, self.minHitThresh, self.maxsize))
+
+ del(rds)
+
+
+ def testWriteFastaFile(self):
+ ncregions = {}
+ getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+ for line in open(self.outfilename):
+ self.assertEquals("", line)
+
+ ncregions = {"1": [],
+ "2": []
+ }
+ getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+ for line in open(self.outfilename):
+ self.assertEquals("", line)
+
+ ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]}],
+ "2": []
+ }
+ getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+ fastaFile = open(self.outfilename)
+ self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+ self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+ self.assertEquals("", fastaFile.readline())
+
+ ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]}],
+ "2": [{"start": 18000, "length": 50, "topPos": [30]}]
+ }
+ getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+ fastaFile = open(self.outfilename)
+ self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+ self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+ self.assertEquals(">chr2:17980-18081\n", fastaFile.readline())
+ self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline())
+ self.assertEquals("", fastaFile.readline())
+
+ ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6, 20]}],
+ "2": [{"start": 18000, "length": 50, "topPos": [30]}]
+ }
+ getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+ fastaFile = open(self.outfilename)
+ self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+ self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+ self.assertEquals(">chr2:17980-18081\n", fastaFile.readline())
+ self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline())
+ self.assertEquals("", fastaFile.readline())
+
+ ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]},
+ {"start": 15000, "length": 50, "topPos": [2]}
+ ],
+ "2": [{"start": 18000, "length": 50, "topPos": [30]}]
+ }
+ getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+ fastaFile = open(self.outfilename)
+ self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+ self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+ self.assertEquals(">chr1:14952-15053\n", fastaFile.readline())
+ self.assertEquals("AGTGAATGAGGGAAAGGGCAGGGCCCGGGACTGGGGAATCTGTAGGGTCAATGGAGGAGTTCAGAGAAGGTGCAACATTTCTGACCCCCTACAAGGTGCTT\n", fastaFile.readline())
+ self.assertEquals(">chr2:17980-18081\n", fastaFile.readline())
+ self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline())
+ self.assertEquals("", fastaFile.readline())
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestGetFasta))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+ # Presumably an IDE template line: uncommenting it would hand unittest a
+ # single test name instead of running everything.
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 26, 2010
+
+@author: sau
+'''
+import unittest
+
+
+# Placeholder test case: it defines the usual hooks and one test method, but
+# none of them exercises any code yet.
+class TestGetNovelSNPs(unittest.TestCase):
+
+
+ def setUp(self):
+ # No fixtures are created.
+ pass
+
+
+ def tearDown(self):
+ # Nothing to clean up.
+ pass
+
+
+ def testName(self):
+ # Empty test body; it passes unconditionally.
+ pass
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestGetNovelSNPs))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+ # Presumably an IDE template line: uncommenting it would hand unittest a
+ # single test name instead of running everything.
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 26, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import getSNPGeneInfo
+
+
+class TestGetSNPGeneInfo(unittest.TestCase):
+
+
+ def setUp(self):
+ self.geneDict = {}
+ self.snpDict = {}
+ self.rpkmDict = {}
+ self.withSense = False
+
+
+ def tearDown(self):
+ pass
+
+
+ def testDoNotProcessLine(self):
+ self.assertTrue(getSNPGeneInfo.doNotProcessLine("#anything"))
+ self.assertFalse(getSNPGeneInfo.doNotProcessLine("line to process"))
+
+
+ def testGetSNPGeneInfoList(self):
+ geneInfoList = getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)
+ self.assertEquals([], geneInfoList)
+
+ badGeneDict = {"badEntry": "foo"}
+ self.assertRaises(ValueError, getSNPGeneInfo.getSNPGeneInfoList, badGeneDict, self.snpDict, self.rpkmDict, self.withSense)
+
+ self.geneDict[("gene1", "ID1")] = {"position": [("1", 1)], "sense": "+"}
+ self.assertRaises(KeyError, getSNPGeneInfo.getSNPGeneInfoList, self.geneDict, self.snpDict, self.rpkmDict, self.withSense)
+
+ self.snpDict[("1", 1)] = "chr1\tpos 1\n"
+ result = [{"symbol": "gene1",
+ "rpkm": "N\\A",
+ "geneID": "ID1",
+ "snpDescription": "chr1\tpos 1" }
+ ]
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+ self.rpkmDict["ID1"] = 300
+ result = [{"symbol": "gene1",
+ "rpkm": "300",
+ "geneID": "ID1",
+ "snpDescription": "chr1\tpos 1" }
+ ]
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+ self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 1)], "sense": "+"}
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+ result = [{"symbol": "gene1",
+ "sense": "+",
+ "rpkm": "300",
+ "geneID": "ID1",
+ "snpDescription": "chr1\tpos 1" }
+ ]
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, True))
+
+ self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 10)], "sense": "+"}
+ self.snpDict[("1", 10)] = "chr1\tpos 10\n"
+ result = [{"symbol": "gene1",
+ "rpkm": "300",
+ "geneID": "ID1",
+ "snpDescription": "chr1\tpos 10" },
+ {"symbol": "gene1",
+ "rpkm": "300",
+ "geneID": "ID1",
+ "snpDescription": "chr1\tpos 1" }
+ ]
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+
+ #TODO: write test
+ def testGetSNPGeneInfo(self):
+ pass
+
+
+ def testGetSNPGeneOutputList(self):  # drives getSNPGeneOutputList through progressively richer fixture dicts
+ geneOutputList = getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)
+ self.assertEquals([], geneOutputList)  # the dicts as prepared by setUp (not visible here) are expected to produce no rows
+
+ badGeneDict = {"badEntry": "foo"}  # key is a plain string, not a (symbol, geneID) tuple like the entries below
+ self.assertRaises(ValueError, getSNPGeneInfo.getSNPGeneOutputList, badGeneDict, self.snpDict, self.rpkmDict, self.withSense)
+
+ self.geneDict[("gene1", "ID1")] = {"position": [("1", 1)], "sense": "+"}
+ self.assertRaises(KeyError, getSNPGeneInfo.getSNPGeneOutputList, self.geneDict, self.snpDict, self.rpkmDict, self.withSense)  # position ("1", 1) has no snpDict entry yet
+
+ self.snpDict[("1", 1)] = "chr1\tpos 1\n"
+ result = ["chr1\tpos 1\tgene1\tID1\tN\\A"]  # expected row drops the description's trailing newline; rpkm column reads N\A while rpkmDict lacks ID1
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+ self.rpkmDict["ID1"] = 300
+ result = ["chr1\tpos 1\tgene1\tID1\t300"]  # the numeric rpkm is expected as text in the last column
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+ self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 1)], "sense": "+"}  # same position listed twice
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))  # still a single row expected
+
+ result = ["chr1\tpos 1\tgene1\tID1\t300\t+"]  # passing True as the last argument adds the sense column
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, True))
+
+ self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 10)], "sense": "+"}
+ self.snpDict[("1", 10)] = "chr1\tpos 10\n"
+ result = ["chr1\tpos 10\tgene1\tID1\t300",  # two distinct positions: the pos 10 row is expected before the pos 1 row
+ "chr1\tpos 1\tgene1\tID1\t300"
+ ]
+ self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+
+ #TODO: write test
+ def testWriteSNPGeneInfo(self):  # placeholder for writeSNPGeneInfo coverage (see the TODO above)
+ pass  # no assertions yet, so this test always passes
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestGetSNPGeneInfo))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Jun 4, 2010
+
+@author: sau
+'''
+import os, unittest
+from Erange.commoncode import readDataset
+from Erange import getSNPs
+
+
+class TestGetSNPs(unittest.TestCase):
+
+ def setUp(self):
+ self.rdsDNA = readDataset("testDNARDSForUnitTests.rds", True, "DNA", verbose=True)
+
+ uniqueInsertList = [("uniqueID1", "chr1", 10, 20, "+", 1.0, "", ""),
+ ("uniqueID2", "chr1", 100, 200, "+", 1.0, "", ""),
+ ("uniqueID3", "chr1", 1000, 2000, "+", 1.0, "", "G10A")]
+
+ multiInsertList = [("multiID1", "chr2", 1010, 1020, "+", 0.5, "", ""),
+ ("multiID1", "chr2", 1010, 1020, "+", 0.5, "", ""),
+ ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""),
+ ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""),
+ ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""),
+ ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", "")]
+
+ self.rdsDNA.insertUniqs(uniqueInsertList)
+ self.rdsDNA.insertMulti(multiInsertList)
+
+
+ def tearDown(self):
+ os.remove("./testDNARDSForUnitTests.rds")
+ self.rdsDNA = None
+
+
+ def testGetMatchDict(self):
+ uniqueTestDict = getSNPs.getMatchDict(self.rdsDNA, "chr1", withSplices=False)
+
+ self.assertEqual(uniqueTestDict[10][0], 20, "incorrect result for unique chr position 10")
+ self.assertEqual(uniqueTestDict[100][0], 200, "incorrect result for unique chr position 100")
+ self.assertEqual(uniqueTestDict[1000][0], 2000, "incorrect result for unique chr position 1000")
+
+ self.assertRaises(KeyError, getSNPs.getMatchDict, self.rdsDNA, "chr2", withSplices=False)
+
+
+ def testGetMismatchDict(self):
+ mismatchDict = getSNPs.getMismatchDict(self.rdsDNA, "chr1")
+ result = {1009: {"totalBaseDict": {"A-G": 1},
+ "uniqueReadCount": 1,
+ "uniqBaseDict": {"A-G": 1},
+ "back": "1000:A-G", "totalCount": 1
+ }
+ }
+ self.assertEquals(result, mismatchDict)
+
+
+ #TODO: write unit test
+ def testGetSNPs(self):
+ pass
+
+
+ #TODO: write unit test
+ def testWriteSNPsToFile(self):
+ pass
+
+
+ def testDoNotProcessChromosome(self):
+ self.assertFalse(getSNPs.doNotProcessChromosome(True, "chr1"))
+ self.assertFalse(getSNPs.doNotProcessChromosome(False, "chr1"))
+ self.assertFalse(getSNPs.doNotProcessChromosome(False, "badName"))
+ self.assertTrue(getSNPs.doNotProcessChromosome(True, "badName"))
+ self.assertTrue(getSNPs.doNotProcessChromosome(True, ""))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestGetSNPs))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Jun 4, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import MakeBamFromRds
+
+
+class TestMakeBamFromRds(unittest.TestCase):
+
+
+ def setUp(self):
+ pass
+
+
+ def tearDown(self):
+ pass
+
+
+ def testGetMismatches(self):
+ mismatchString = "3A10T"
+ self.assertEqual(mismatchString, MakeBamFromRds.getMismatches("A3G, T10A"))
+
+ mismatchString = ""
+ self.assertEqual(mismatchString, MakeBamFromRds.getMismatches(""))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestMakeBamFromRds))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Jul 28, 2010
+
+@author: sau
+'''
+
+import os, unittest
+from Erange import makeGraphs
+
+testFileName = "/tmp/testEdgeFileForUnitTests.txt"  # scratch file written and removed by testGetEdgesFromFile; the hard-coded /tmp assumes a POSIX-style temp directory
+
+class TestMakeGraphs(unittest.TestCase):
+
+ def setUp(self):
+ pass
+
+
+ def tearDown(self):
+ pass
+
+
+ def testGetEdges(self):
+ nodeList = []
+ self.assertEquals({}, makeGraphs.getEdges(nodeList))
+
+ nodeEntry = "ex_node1\tex_node2\t1"
+ nodeList.append(nodeEntry)
+ result = {"ex_node1": [("ex_node2", 1)],
+ "ex_node2": [("ex_node1", 1)]}
+ self.assertEquals(result, makeGraphs.getEdges(nodeList))
+
+ nodeEntry = "ex_node1\tex_node3\t2"
+ nodeList.append(nodeEntry)
+ result = {"ex_node1": [("ex_node2", 1), ("ex_node3", 2)],
+ "ex_node2": [("ex_node1", 1)],
+ "ex_node3": [("ex_node1", 2)]
+ }
+ self.assertEquals(result, makeGraphs.getEdges(nodeList))
+
+ result = {"node1": [("node2", 1), ("node3", 2)],
+ "node2": [("node1", 1)],
+ "node3": [("node1", 2)]
+ }
+ self.assertEquals(result, makeGraphs.getEdges(nodeList, shorten=True))
+
+ nodeEntry = "ex:node1\tex:node2\t1"
+ nodeList = [nodeEntry]
+ result = {"ex:node1": [("ex:node2", 1)],
+ "ex:node2": [("ex:node1", 1)]}
+ self.assertEquals(result, makeGraphs.getEdges(nodeList, shorten=True))
+
+ nodeEntry = "badLine"
+ nodeList = [nodeEntry]
+ self.assertEquals({}, makeGraphs.getEdges(nodeList))
+ nodeEntry = "node1\tnode2\t1"
+ nodeList.append(nodeEntry)
+ result = {"node1": [("node2", 1)],
+ "node2": [("node1", 1)]}
+ self.assertEquals(result, makeGraphs.getEdges(nodeList))
+
+
+ def testGetEdgesFromFile(self):
+ self.edgeFile = open(testFileName, "w")
+ self.edgeFile.write("node1\tnode2\t1")
+ self.edgeFile.close()
+
+ result = {"node1": [("node2", 1)],
+ "node2": [("node1", 1)]}
+ self.assertEquals(result, makeGraphs.getEdgesFromFile(testFileName))
+
+ os.remove(testFileName)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestMakeGraphs))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Jun 10, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import MakeRdsFromBam
+
+class TestMakeRdsFromBam(unittest.TestCase):
+
+
+ def testGetSpliceBounds(self):
+ start, startR, stopL, stopR = MakeRdsFromBam.getSpliceBounds(0, 10, [(1,2), (3,6), (1,2)])
+
+ self.assertEqual(start, 0, "incorrect start position for 262")
+ self.assertEqual(startR, 8, "incorrect right start position for 262")
+ self.assertEqual(stopL, 2, "incorrect left stop position for 262")
+ self.assertEqual(stopR, 10, "incorrect right stop position for 262")
+
+
+ def testGetMismatches(self):
+ querySequence = "GATTACA"
+
+ resultString = "A3T"
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", querySequence, "+"))
+ resultString = "T3A"
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", querySequence, "-"))
+ resultString = "T7A"
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("6T", querySequence, "+"))
+
+ resultString = "A3T,T7A"
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A3T0", querySequence, "+"))
+
+ resultString = ""
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^T", querySequence, "+"))
+
+ resultString = "T5A"
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^TT2T", querySequence, "+"))
+ resultString = "A5T"
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^TT2T", querySequence, "-"))
+
+ resultString = "A3N"
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", "", "+"))
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A"))
+
+ resultString = ""
+ self.assertEquals(resultString, MakeRdsFromBam.getMismatches("badMismatchTagData", querySequence, "+"))
+
+
+ def testIsSpliceEntry(self):
+ self.assertTrue(MakeRdsFromBam.isSpliceEntry([(1,6), (3, 4), (1, 2)]))
+ self.assertFalse(MakeRdsFromBam.isSpliceEntry([(1,6), (2, 4), (1, 2)]))
+ self.assertFalse(MakeRdsFromBam.isSpliceEntry([]))
+ self.assertFalse(MakeRdsFromBam.isSpliceEntry(""))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestMakeRdsFromBam))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 25, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import makeSNPtrack
+
+
+class TestMakeSNPTrack(unittest.TestCase):
+
+ baseColor = {"A": "200, 0, 255",
+ "T": "200, 0, 255",
+ "C": "200, 0, 255",
+ "G": "200, 0, 255"
+ }
+
+ specialColors = {"A-G": "255, 0, 0",
+ "T-C": "0, 0, 255"
+ }
+
+
+ def setUp(self):
+ pass
+
+
+ def tearDown(self):
+ pass
+
+
+ def testGetHeader(self):
+ track = "test track"
+ header = "track name=%s description=%s visibility=2 itemRgb=\"On\"\n" % (track, track)
+ self.assertEquals(header, makeSNPtrack.getHeader(track))
+
+
+ def testDoNotProcessLine(self):
+ self.assertTrue(makeSNPtrack.doNotProcessLine("#anything"))
+ self.assertFalse(makeSNPtrack.doNotProcessLine("line to process"))
+
+
+ def testGetBedOutputLine(self):
+ chromosome = "chr1"
+ readStart = 10
+ readStop = 11
+ readName = "A"
+ score = "0"
+ sense = "+"
+ color = self.baseColor[readName]
+ snpPropertiesList = ["0", "1", chromosome, 11, "4", "5", "6", readName]
+ outline = "%s\t%d\t%d\t%s\t%s\t%s\t-\t-\t\t%s\n" % (chromosome, readStart, readStop, readName, score, sense, color)
+ self.assertEquals(outline, makeSNPtrack.getBedOutputLine(snpPropertiesList))
+
+ snpPropertiesList = ["0", "1", chromosome, 11, "4", "5", "6"]
+ self.assertRaises(IndexError, makeSNPtrack.getBedOutputLine, snpPropertiesList)
+
+ snpPropertiesList = []
+ self.assertRaises(IndexError, makeSNPtrack.getBedOutputLine, snpPropertiesList)
+
+ snpPropertiesList = ["0", "1", chromosome, "some string", "4", "5", "6", readName]
+ self.assertRaises(ValueError, makeSNPtrack.getBedOutputLine, snpPropertiesList)
+
+
+ def testGetSNPColor(self):
+ for base in self.baseColor.keys():
+ self.assertEquals(self.baseColor[base], makeSNPtrack.getSNPColor(base))
+
+ for base in self.specialColors.keys():
+ self.assertEquals(self.specialColors[base], makeSNPtrack.getSNPColor(base))
+
+ defaultColor = "200, 0, 255"
+ self.assertEquals(defaultColor, makeSNPtrack.getSNPColor(""))
+ self.assertEquals(defaultColor, makeSNPtrack.getSNPColor("V"))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestMakeSNPTrack))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Sep 15, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange.chiapet import markLinkers
+
+
+class TestMarkLinkers(unittest.TestCase):
+ linkerFileName = "/Users/sau/Eclipse/erange/source/Erange/chiapet/linkers.fa"
+ inFileName = "linkerTestIn.txt"
+ outFileName = "linkerTestOut.txt"
+
+ def setUp(self):
+ infile = open(self.inFileName, "w")
+ infile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.inFileName)
+ except OSError:
+ pass
+
+ try:
+ os.remove(self.outFileName)
+ except OSError:
+ pass
+
+
+ def testMarkLinkers(self):
+ markLinkers.markLinkers(self.linkerFileName, self.inFileName, self.outFileName)
+ output = open(self.outFileName)
+ for line in output:
+ self.assertEquals("", line)
+
+ output.close()
+ os.remove(self.outFileName)
+
+ infile = open(self.inFileName, "w")
+ print >> infile, ""
+ print >> infile, "@Linker1"
+ print >> infile, "........................GTTGGATAAGATATCGCGG....."
+ print >> infile, "@NoLinker"
+ print >> infile, "GATTACA.GATTACA.GATTACA.GATTACA.GATTACA.GATTACA."
+ print >> infile, "@Linker2"
+ print >> infile, "........................GTTGGAATGTATATCGCGG....."
+ print >> infile, "@Linker1Short"
+ print >> infile, "..............GTTGGAATGTATATCGCGG..............."
+ print >> infile, "@Linker2Short"
+ print >> infile, "..............GTTGGAATGTATATCGCGG..............."
+ infile.close()
+
+ markLinkers.markLinkers(self.linkerFileName, self.inFileName, self.outFileName)
+ output = open(self.outFileName)
+ self.assertEquals(">L1_Linker1\n", output.readline())
+ self.assertEquals("....................\n", output.readline())
+ self.assertEquals(">NA_NoLinker\n", output.readline())
+ self.assertEquals("GATTACA.GATTACA.GATT\n", output.readline())
+ self.assertEquals(">NA_NoLinker\n", output.readline())
+ self.assertEquals("GATTACA.GATTACA.GATT\n", output.readline())
+ self.assertEquals(">NA_Linker2\n", output.readline())
+ self.assertEquals("....................\n", output.readline())
+ self.assertEquals(">L2_Linker2\n", output.readline())
+ self.assertEquals("....................\n", output.readline())
+ self.assertEquals(">NA_Linker1Short\n", output.readline())
+ self.assertEquals("..............GTTGGA\n", output.readline())
+ self.assertEquals(">NA_Linker1Short\n", output.readline())
+ self.assertEquals("..............GTTGGA\n", output.readline())
+ self.assertEquals(">NA_Linker2Short\n", output.readline())
+ self.assertEquals("..............GTTGGA\n", output.readline())
+ self.assertEquals(">NA_Linker2Short\n", output.readline())
+ self.assertEquals("..............GTTGGA\n", output.readline())
+
+ output.close()
+ #TODO: Check that we really do want to output the same line
+ #multiple times in the case where neither linker is detected.
+ #See if downstream there is a real reason for doing it this way
+ #or if it was handled as a bug introduced at this stage of the
+ #analysis.
+
+
+ def testGetLinkerInformation(self):
+ linkerDict, linkerList = markLinkers.getLinkerInformation([])
+ resultDict = {}
+ resultList = []
+ self.assertEquals(resultDict, linkerDict)
+ self.assertEquals(resultList, linkerList)
+
+ linkerData = [">linker_b.1",
+ "GTTGGATAAGATATCGCGG",
+ ">linker_b.2",
+ "GTTGGAATGTATATCGCGG"
+ ]
+ linkerDict, linkerList = markLinkers.getLinkerInformation(linkerData)
+ resultDict = {"linker_b.1": "GTTGGATAAG",
+ "linker_b.2": "GTTGGAATGT"
+ }
+ resultList = ["linker_b.1", "linker_b.2"]
+ self.assertEquals(resultDict, linkerDict)
+ self.assertEquals(resultList, linkerList)
+
+
+ def testGetLinkerInformationFromFile(self):
+ linkerDict, linkerList = markLinkers.getLinkerInformationFromFile("bad file name")
+ resultDict = {}
+ resultList = []
+ self.assertEquals(resultDict, linkerDict)
+ self.assertEquals(resultList, linkerList)
+
+ linkerDict, linkerList = markLinkers.getLinkerInformationFromFile(self.linkerFileName)
+ resultDict = {"linker_b.1": "GTTGGATAAG",
+ "linker_b.2": "GTTGGAATGT"
+ }
+ resultList = ["linker_b.1", "linker_b.2"]
+ self.assertEquals(resultDict, linkerDict)
+ self.assertEquals(resultList, linkerList)
+
+
+ def testMain(self):
+ argv = ["markLinkers", self.linkerFileName, self.inFileName, self.outFileName]
+ markLinkers.main(argv)
+ output = open(self.outFileName)
+ for line in output:
+ self.assertEquals("", line)
+
+ output.close()
+ os.remove(self.outFileName)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestMarkLinkers))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Oct 4, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import peakstoregion
+
+inFileName = "testPeaksToRegionInFile.txt"  # peak list written by setUp, relative to the current working directory
+outFileName = "testPeaksToRegionOutFile.txt"  # region file produced by the code under test; removed in tearDown
+
+
+class TestPeaksToRegion(unittest.TestCase):
+
+
+ def setUp(self):
+ self.inFile = open(inFileName, "w")
+ self.inFile.write("stuff\tpeak1\tchr1\t1000\t1.3\n")
+ self.inFile.write("stuff\tpeak2\tchr1\t800\t9.7\n")
+ self.inFile.write("stuff\tpeak3\tchr2\t1000\t3.0\n")
+ self.inFile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(outFileName)
+ except OSError:
+ pass
+
+ try:
+ os.remove(inFileName)
+ except OSError:
+ pass
+
+
+ def testPeaksToRegion(self):
+ peakstoregion.peakstoregion(inFileName, outFileName)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("peak1\tchr1\t500\t1500\t1.3\n", results[0])
+ self.assertEquals("peak2\tchr1\t300\t1300\t9.7\n", results[1])
+ self.assertEquals("peak3\tchr2\t500\t1500\t3.0\n", results[2])
+
+
+ def testMain(self):
+ argv = ["peakstoregion", inFileName, outFileName]
+ peakstoregion.main(argv)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("peak1\tchr1\t500\t1500\t1.3\n", results[0])
+ self.assertEquals("peak2\tchr1\t300\t1300\t9.7\n", results[1])
+ self.assertEquals("peak3\tchr2\t500\t1500\t3.0\n", results[2])
+
+ argv = ["peakstoregion", inFileName, outFileName, 600, 2, 3, 1, -1]
+ peakstoregion.main(argv)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("peak1\tchr1\t400\t1600\t1.3\n", results[0])
+ self.assertEquals("peak2\tchr1\t200\t1400\t9.7\n", results[1])
+ self.assertEquals("peak3\tchr2\t400\t1600\t3.0\n", results[2])
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestPeaksToRegion))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Sep 15, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange.rnapath import processvelvet
+
+
+class TestProcessVelvet(unittest.TestCase):
+ inFileName = "testProcessVelvetIn.txt"
+ filterFileName = "testProcessVelvetFilter.txt"
+ outFileName = "testProcessVelvetOut.txt"
+
+
+ def setUp(self):
+ infile = open(self.inFileName, "w")
+ infile.close()
+ filter = open(self.filterFileName, "w")
+ filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODE1-1_0\n")
+ filter.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.inFileName)
+ except OSError:
+ pass
+
+ try:
+ os.remove(self.filterFileName)
+ except OSError:
+ pass
+
+ try:
+ os.remove(self.outFileName)
+ except OSError:
+ pass
+
+
+ def testProcessVelvet(self):
+ processvelvet.processvelvet(self.inFileName, self.outFileName)
+ outfile = open(self.outFileName)
+ for line in outfile:
+ self.assertEquals("", line)
+
+ os.remove(self.outFileName)
+
+ infile = open(self.inFileName, "w")
+ print >> infile, ">NODE1-1_0"
+ print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+ infile.close()
+
+ processvelvet.processvelvet(self.inFileName, self.outFileName)
+ outfile = open(self.outFileName)
+ self.assertEquals(">chr0\n", outfile.readline())
+ self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ os.remove(self.outFileName)
+ processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName)
+ outfile = open(self.outFileName)
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ os.remove(self.outFileName)
+
+ infile = open(self.inFileName, "w")
+ print >> infile, ">NODE1-1_1"
+ print >> infile, "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ print >> infile, ">NODE1-1_0"
+ print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+ infile.close()
+
+ processvelvet.processvelvet(self.inFileName, self.outFileName)
+ outfile = open(self.outFileName)
+ self.assertEquals(">chr1\n", outfile.readline())
+ self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+ self.assertEquals(">chr0\n", outfile.readline())
+ self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ os.remove(self.outFileName)
+ processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName)
+ outfile = open(self.outFileName)
+ self.assertEquals(">chr1\n", outfile.readline())
+ self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ os.remove(self.outFileName)
+
+ infile = open(self.inFileName, "w")
+ print >> infile, ">NODE1-1_1"
+ print >> infile, "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ print >> infile, ">NODE1-1_0"
+ print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+ print >> infile, ">NODE1-1_2"
+ print >> infile, "GATTACA"
+ infile.close()
+
+ processvelvet.processvelvet(self.inFileName, self.outFileName)
+ outfile = open(self.outFileName)
+ self.assertEquals(">chr1\n", outfile.readline())
+ self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+ self.assertEquals(">chr0\n", outfile.readline())
+ self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline())
+ self.assertEquals(">chr2\n", outfile.readline())
+ self.assertEquals("GATTACA\n", outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ os.remove(self.outFileName)
+ processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName)
+ outfile = open(self.outFileName)
+ self.assertEquals(">chr1\n", outfile.readline())
+ self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+ self.assertEquals(">chr2\n", outfile.readline())
+ self.assertEquals("GATTACA\n", outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ os.remove(self.outFileName)
+ processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName, minSize=10)
+ outfile = open(self.outFileName)
+ self.assertEquals(">chr1\n", outfile.readline())
+ self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+
+
+ def testGetFilterList(self):
+ self.assertEquals([], processvelvet.getFilterList())
+ self.assertEquals(["NODE1-1_0"], processvelvet.getFilterList(self.filterFileName))
+ self.assertEquals([], processvelvet.getFilterList("whatfile?"))
+
+ filter = open(self.filterFileName, "a")
+ filter.write("some fields without the key trigger string\n")
+ filter.close()
+ self.assertEquals(["NODE1-1_0"], processvelvet.getFilterList(self.filterFileName))
+
+ filter = open(self.filterFileName, "a")
+ filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODEAnything\n")
+ filter.close()
+ self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName))
+
+ filter = open(self.filterFileName, "a")
+ filter.write("0\tNODEWrongField\n")
+ filter.close()
+ self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName))
+
+ filter = open(self.filterFileName, "a")
+ filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODEAnything\n")
+ filter.close()
+ self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName))
+
+
+ def testWriteNode(self):
+ node = {"contigPrefix": "chr",
+ "completeID": "",
+ "currentSeq": ""
+ }
+
+ counts = {"acceptedSize": 0,
+ "nSize": 0,
+ "contigsAccepted": 0,
+ "filteredSize": 0
+ }
+
+ filterList = []
+
+ outfile = open(self.outFileName, "w")
+ processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+ outfile.close()
+ results = open(self.outFileName)
+ self.assertEquals("", results.readline())
+ results.close()
+ os.remove(self.outFileName)
+
+ node["completeID"] = "<5"
+ node["currentSeq"] = "GATTACA\n"
+ outfile = open(self.outFileName, "w")
+ processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+ self.assertEquals(counts["filteredSize"], 7)
+ counts["filteredSize"] = 0
+ outfile.close()
+ results = open(self.outFileName)
+ self.assertEquals("", results.readline())
+ results.close()
+ os.remove(self.outFileName)
+
+ node["completeID"] = "NODE1_1"
+ node["currentSeq"] = "GATTACA\n"
+ filterList = ["NODE1_1"]
+ outfile = open(self.outFileName, "w")
+ processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+ self.assertEquals(counts["filteredSize"], 7)
+ counts["filteredSize"] = 0
+ outfile.close()
+ results = open(self.outFileName)
+ self.assertEquals("", results.readline())
+ results.close()
+ os.remove(self.outFileName)
+
+ node["completeID"] = "NODE1_1"
+ node["currentSeq"] = "GATTACA\n"
+ filterList = []
+ outfile = open(self.outFileName, "w")
+ processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+ self.assertEquals(counts["acceptedSize"], 7)
+ outfile.close()
+ results = open(self.outFileName)
+ self.assertEquals(">chr1\n", results.readline())
+ self.assertEquals("GATTACA\n", results.readline())
+ self.assertEquals("", results.readline())
+ results.close()
+ os.remove(self.outFileName)
+
+
+ def testMain(self):
+ argv = ["processVelvet", self.inFileName, self.outFileName]
+ processvelvet.main(argv)
+ outfile = open(self.outFileName)
+ for line in outfile:
+ self.assertEquals("", line)
+
+ os.remove(self.outFileName)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestProcessVelvet))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Sep 10, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange.rnapath import RNAPATH
+
+compDict = {"A": "T",  # expected complement for each symbol; used by testCompNT and testComplement
+ "T": "A",
+ "G": "C",
+ "C": "G",
+ "S": "S",  # from here on the keys look like IUPAC ambiguity codes -- TODO confirm against RNAPATH.compNT
+ "W": "W",
+ "R": "Y",
+ "Y": "R",
+ "M": "K",
+ "K": "M",
+ "H": "D",
+ "D": "H",
+ "B": "V",
+ "V": "B",
+ "N": "N",
+ "a": "t",  # lower-case input is expected to give a lower-case complement
+ "t": "a",
+ "g": "c",
+ "c": "g",
+ "n": "n",
+ "z": "z"  # "z" is expected to map to itself
+}
+
+
+class TestRNAPATH(unittest.TestCase):
+ incontigfilename = "contigIn.txt"
+ distalPairsfile = "distalPair.txt"
+ outpathfilename = "rnapathOut.txt"
+ outcontigfilename = "contigOut.txt"
+
+ def setUp(self):
+ inContigs = open(self.incontigfilename, "w")
+ inContigs.close()
+
+ distal = open(self.distalPairsfile, "w")
+ distal.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(self.incontigfilename)
+ except OSError:
+ pass
+
+ try:
+ os.remove(self.distalPairsfile)
+ except OSError:
+ pass
+
+ try:
+ os.remove(self.outpathfilename)
+ except OSError:
+ pass
+
+ try:
+ os.remove(self.outcontigfilename)
+ except OSError:
+ pass
+
+
+ def testCompNT(self):
+ for nt in compDict.keys():
+ self.assertEquals(compDict[nt], RNAPATH.compNT(nt))
+
+ self.assertEquals("N", RNAPATH.compNT("5"))
+ self.assertEquals("N", RNAPATH.compNT("anything"))
+
+
+ def testComplement(self):
+ self.assertEquals("", RNAPATH.complement(""))
+ for nt in compDict.keys():
+ self.assertEquals(compDict[nt], RNAPATH.complement(nt))
+
+ self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA"))
+ self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA", 7))
+ self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA", -75632))
+ self.assertEquals("TGTA", RNAPATH.complement("GATTACA", 4))
+
+ #TODO: do we want to return when length > seqlength? This is
+ # the current return and it seems very wrong we only N fill
+ # after going more then seqlength in negative direction
+ self.assertEquals("TGTAATCTG", RNAPATH.complement("GATTACA", 9))
+ self.assertEquals("TGTAATCTGTAATCNNNNN", RNAPATH.complement("GATTACA", 19))
+
+ #TODO: write test
+ def testRnaPath(self):
+ RNAPATH.rnaPath(self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename)
+ outfile = open(self.outpathfilename)
+ self.assertTrue("#settings:" in outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ outcontig = open(self.outcontigfilename)
+ self.assertEquals(0, len(outcontig.readlines()))
+ outcontig.close()
+
+ #infile = open(self.incontigfilename, "w")
+ #infile.write(">chr1 stuff\n")
+ #infile.write("GATTACA\n")
+ #infile.close()
+ #RNAPATH.rnaPath(self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename)
+ #outfile = open(self.outpathfilename)
+ #self.assertTrue("#settings:" in outfile.readline())
+ #self.assertEquals("", outfile.readline())
+ #outfile.close()
+
+
+ #TODO: write test
+ def testGetPath(self):
+ pass
+
+
+ #TODO: write test
+ def testTraverseGraph(self):
+ leafList = []
+ edgeMatrix = RNAPATH.EdgeMatrix(0)
+ pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix)
+ self.assertEquals([], pathList)
+ self.assertEquals({}, visitedDict)
+
+ leafList = [1]
+ edgeMatrix = RNAPATH.EdgeMatrix(3)
+ edgeMatrix.edgeArray[2][1] = 3
+ edgeMatrix.edgeArray[1][2] = 3
+ pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix)
+ self.assertEquals([ [1, 2] ], pathList)
+ self.assertEquals({1: "", 2: ""}, visitedDict)
+
+ leafList = [1, 2]
+ edgeMatrix = RNAPATH.EdgeMatrix(3)
+ edgeMatrix.edgeArray[2][1] = 3
+ edgeMatrix.edgeArray[1][2] = 3
+ pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix)
+ self.assertEquals([ [1, 2] ], pathList)
+ self.assertEquals({1: "", 2: ""}, visitedDict)
+
+
+ #TODO: write test
+ def testGetContigsFromFile(self):
+ contigNum, nameList, contigDict, origSize = RNAPATH.getContigsFromFile(self.incontigfilename)
+ self.assertEquals(0, contigNum)
+ self.assertEquals([], nameList)
+ self.assertEquals({}, contigDict)
+ self.assertEquals([], origSize)
+
+
+ #TODO: check for boundary condition and special cases
+ def testEdgeMatrix(self):
+ edgeMatrix = RNAPATH.EdgeMatrix(0)
+ result = "[]"
+ self.assertEquals(result, str(edgeMatrix.edgeArray))
+
+ edgeMatrix = RNAPATH.EdgeMatrix(3)
+ result = "[[0 0 0]\n [0 0 0]\n [0 0 0]]"
+ self.assertEquals(result, str(edgeMatrix.edgeArray))
+ self.assertEquals([], edgeMatrix.visitLink(0))
+
+ edgeMatrix.edgeArray[0][1] = 1
+ self.assertEquals([], edgeMatrix.visitLink(0))
+
+ edgeMatrix.edgeArray[0][1] = 2
+ result = [0]
+ self.assertEquals(result, edgeMatrix.visitLink(0))
+
+ edgeMatrix.edgeArray[2][1] = 2
+ result = []
+ self.assertEquals(result, edgeMatrix.visitLink(0))
+ edgeMatrix.edgeArray[2][1] = 2
+ result = []
+ self.assertEquals(result, edgeMatrix.visitLink(1))
+ edgeMatrix.edgeArray[2][1] = 2
+ result = [2]
+ self.assertEquals(result, edgeMatrix.visitLink(2))
+
+ edgeMatrix.edgeArray[2][1] = 3
+ edgeMatrix.edgeArray[1][2] = 3
+ result = [1, 2]
+ self.assertEquals(result, edgeMatrix.visitLink(1))
+
+
+ def testMain(self):
+ argv = ["RNAPATH", self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename]
+ RNAPATH.main(argv)
+ outfile = open(self.outpathfilename)
+ self.assertTrue("#settings:" in outfile.readline())
+ self.assertEquals("", outfile.readline())
+ outfile.close()
+ outcontig = open(self.outcontigfilename)
+ self.assertEquals(0, len(outcontig.readlines()))
+ outcontig.close()
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestRNAPATH))
+
+ return suite
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Jul 21, 2010
+
+@author: sau
+'''
+import unittest
+import os
+import sqlite3 as sqlite
+from Erange import ReadDataset
+
+testDBName = "testRDS.rds"  # DNA dataset file created in setUp and removed in tearDown (relative to the working directory)
+rnaTestDBName = "testRDSRNA.rds"  # RNA counterpart, created and removed alongside it
+
+class TestReadDataset(unittest.TestCase):
+
+
+ def setUp(self):
+ self.rds = ReadDataset.ReadDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
+ self.rnaRds = ReadDataset.ReadDataset(rnaTestDBName, initialize=True, datasetType="RNA", verbose=False)
+
+
+ def tearDown(self):
+ del(self.rds)
+ os.remove(testDBName)
+ del(self.rnaRds)
+ os.remove(rnaTestDBName)
+
+
+ #TODO: rename and integrate
+ def testZeeNewStuff(self):
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", ""),
+ ("dup start", "chr1", 1, 150, "+", 1.0, "", ""),
+ ("new read", "chr1", 80, 100, "+", 1.0, "", ""),
+ ("testRead", "chr2", 201, 400, "+", 1.0, "", ""),
+ ("dup start", "chr2", 201, 450, "+", 1.0, "", ""),
+ ("new read", "chr2", 280, 400, "+", 1.0, "", ""),
+ ("three up", "chr3", 1, 80, "+", 1.0, "", ""),
+ ("three two", "chr3", 201, 230, "+", 1.0, "", "")
+ ]
+ self.rds.insertUniqs(rdsEntryList)
+ dbcon = sqlite.connect(testDBName)
+ sql = dbcon.cursor()
+ sql.execute("select chrom,start from uniqs group by chrom,start having ( count(start) > 1 and count(chrom) > 1)")
+ result = [("chr1", 1), ("chr2", 201)]
+ for eachEntry in sql.fetchall():
+ self.assertTrue(eachEntry in result)
+
+ sql.execute("select chrom,start from uniqs group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+ result = [("chr1", 80), ("chr2", 280), ("chr3", 1), ("chr3", 201)]
+ for eachEntry in sql.fetchall():
+ self.assertTrue(eachEntry in result)
+
+ sql.execute("select * from uniqs group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union select * from uniqs group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+ result = [(2, "dup start", "chr1", 1, 150, "+", 1.0, "", ""),
+ (3, "new read", "chr1", 80, 100, "+", 1.0, "", ""),
+ (5, "dup start", "chr2", 201, 450, "+", 1.0, "", ""),
+ (6, "new read", "chr2", 280, 400, "+", 1.0, "", ""),
+ (7, "three up", "chr3", 1, 80, "+", 1.0, "", ""),
+ (8, "three two", "chr3", 201, 230, "+", 1.0, "", "")
+ ]
+ for eachEntry in sql.fetchall():
+ self.assertTrue(eachEntry in result)
+
+ sql.execute("select chrom,start from uniqs where start > 100 group by chrom,start having ( count(start) > 1 and count(chrom) > 1) order by chrom,start")
+ result = [("chr2", 201)]
+ for eachEntry in sql.fetchall():
+ self.assertTrue(eachEntry in result)
+
+
+ rdsEntryList = [("testMultiRead", "chr1", 1, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 1, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr2", 80, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr2", 1, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr2", 5000, 25000, "+", 0.5, "", ""),
+ ("testMultiRead", "chr3", 1, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr3", 70, 500, "+", 0.5, "", "")
+ ]
+ self.rds.insertMulti(rdsEntryList)
+ sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) > 1 and count(chrom) > 1)")
+ result = [("chr1", 1), ("chr2", 201), ("chr3", 1)]
+ for eachEntry in sql.fetchall():
+ self.assertTrue(eachEntry in result)
+
+ sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+ result = [("chr1", 80),
+ ("chr2", 1), ("chr2", 80), ("chr2", 280), ("chr2", 5000),
+ ("chr3", 70), ("chr3", 201)
+ ]
+ for eachEntry in sql.fetchall():
+ self.assertTrue(eachEntry in result)
+
+ sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+ result = sql.fetchall()
+ result = [("chr1", 1), ("chr1", 80),
+ ("chr2", 1), ("chr2", 80), ("chr2", 201), ("chr2", 280), ("chr2", 5000),
+ ("chr3", 1), ("chr3", 70), ("chr3", 201)
+ ]
+ for eachEntry in sql.fetchall():
+ self.assertTrue(eachEntry in result)
+
+ result = {"1": [{"start": 1, "sense": "+"}, {"start": 80, "sense": "+"}],
+ "3": [{"start": 1, "sense": "+"}, {"start": 70, "sense": "+"}, {"start": 201, "sense": "+"}],
+ "2": [{"start": 1, "sense": "+"}, {"start": 80, "sense": "+"}, {"start": 201, "sense": "+"}, {"start": 280, "sense": "+"}, {"start": 5000, "sense": "+"}]
+ }
+ self.assertEquals(result, self.rds.getReadsDict(combine5p=True, doMulti=True))
+
+ print self.rds.getReadsDict(combine5p=True, doMulti=True, withWeight=True)
+
+ def testReadDatasetBuiltIns(self):
+ # Initialize an existing rds file
+ self.assertRaises(sqlite.OperationalError, ReadDataset.ReadDataset, testDBName, initialize=True, datasetType="DNA", verbose=True)
+ self.assertEquals(0, len(self.rds))
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(1, len(self.rds))
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+ self.assertEquals(2, len(self.rds))
+
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.assertRaises(sqlite.OperationalError, self.rds.insertSplices, rdsEntryList)
+ self.rnaRds.insertSplices(rdsEntryList)
+ self.assertEquals(2, len(self.rds))
+ self.assertEquals(1, len(self.rnaRds))
+
+
+ def testInsertUniqs(self):
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(1, len(self.rds))
+
+ rdsEntryList = [("testRead2", "chr1", 200, 300, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(2, len(self.rds))
+
+
+ def testInsertMulti(self):
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+ self.assertEquals(1, len(self.rds))
+
+
+ def testInsertSplices(self):
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.assertRaises(sqlite.OperationalError, self.rds.insertSplices, rdsEntryList)
+ self.rnaRds.insertSplices(rdsEntryList)
+ self.assertEquals(0, len(self.rds))
+ self.assertEquals(1, len(self.rnaRds))
+
+
+ def testGetChromosomes(self):
+ result = []
+ self.assertEqual(result, self.rds.getChromosomes(table="uniqs", fullChrom=True))
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ result = ["chr1"]
+ self.assertEqual(result, self.rds.getChromosomes(table="uniqs", fullChrom=True))
+
+ self.assertRaises(sqlite.OperationalError, self.rds.getChromosomes, table="badTableName")
+
+
+ #TODO: write unit test
+ def testAttachDB(self):
+ # Placeholder: passes unconditionally; nothing is exercised yet.
+ pass
+
+
+ #TODO: write unit test
+ def testDetachDB(self):
+ # Placeholder: passes unconditionally; nothing is exercised yet.
+ pass
+
+
+ #TODO: write unit test
+ def testImportFromDB(self):
+ # Placeholder: passes unconditionally; nothing is exercised yet.
+ pass
+
+
+ def testGetTables(self):
+ result = ["metadata", "uniqs", "multi"]
+ self.assertEquals(result, self.rds.getTables())
+
+ result = ["metadata", "uniqs", "multi", "splices"]
+ self.assertEquals(result, self.rnaRds.getTables())
+
+
+ def testHasIndex(self):
+ self.assertFalse(self.rds.hasIndex())
+ self.rds.buildIndex()
+ self.assertTrue(self.rds.hasIndex())
+
+
+ def testGetMetadata(self):
+ returnDict = self.rds.getMetadata()
+ self.assertTrue(returnDict.has_key("rdsVersion"))
+ self.assertEquals(returnDict["dataType"], "DNA")
+
+ result = {"dataType": "RNA"}
+ self.assertEquals(result, self.rnaRds.getMetadata("dataType"))
+
+ result = {}
+ self.assertEquals(result, self.rds.getMetadata("badMetaDataName"))
+
+
+ def testGetReadSize(self):
+ self.assertRaises(ReadDataset.ReadDatasetError, self.rds.getReadSize)
+
+ self.rds.insertMetadata([("readsize", "100")])
+ self.assertEquals(100, self.rds.getReadSize())
+
+ self.rds.updateMetadata("readsize", 100)
+ self.assertEquals(100, self.rds.getReadSize())
+
+ self.rds.updateMetadata("readsize", "100 import")
+ self.assertEquals(100, self.rds.getReadSize())
+
+ self.rds.updateMetadata("readsize", "badReadSize")
+ self.assertRaises(ValueError, self.rds.getReadSize)
+
+
+ def testGetDefaultCacheSize(self):
+ self.assertEquals(100000, self.rds.getDefaultCacheSize())
+
+
+ def testGetMaxCoordinate(self):
+ self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr1"))
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1"))
+ self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr2"))
+ self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr1", doUniqs=False))
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rnaRds.insertMulti(rdsEntryList)
+ self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1"))
+ self.assertEquals(101, self.rnaRds.getMaxCoordinate("chr1", doMulti=True))
+
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1"))
+ self.assertEquals(101, self.rnaRds.getMaxCoordinate("chr1", doMulti=True))
+ self.assertEquals(1150, self.rnaRds.getMaxCoordinate("chr1", doSplices=True))
+
+
+ def testGetReadsDict(self):
+ self.assertEquals({}, self.rds.getReadsDict())
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ reads = self.rds.getReadsDict()
+ self.assertEquals(1, len(reads))
+ self.assertEquals(1, len(reads["1"]))
+ read = reads["1"][0]
+ self.assertEquals(["start", "sense"], read.keys())
+ self.assertEquals(1, read["start"])
+ self.assertEquals("+", read["sense"])
+
+ reads = self.rds.getReadsDict(bothEnds=True, noSense=False, fullChrom=True,
+ withWeight=True, withFlag=True, withMismatch=True, withID=True,
+ withChrom=True, readIDDict=True)
+ self.assertEquals(1, len(reads))
+ self.assertEquals(1, len(reads["testRead"]))
+ read = reads["testRead"][0]
+ self.assertEquals(["readID", "weight", "stop", "mismatch","start", "flag","sense", "chrom"], read.keys())
+ self.assertEquals("testRead", read["readID"])
+ self.assertEquals(1.0, read["weight"])
+ self.assertEquals(100, read["stop"])
+ self.assertEquals("", read["mismatch"])
+ self.assertEquals(1, read["start"])
+ self.assertEquals("", read["flag"])
+ self.assertEquals("+", read["sense"])
+ self.assertEquals("chr1", read["chrom"])
+
+ self.assertEquals({}, self.rds.getReadsDict(hasMismatch=True))
+ self.assertEquals({}, self.rds.getReadsDict(strand="-"))
+ self.assertEquals(1, len(self.rds.getReadsDict(strand="+")))
+
+ rdsEntryList = [("testRead2", "chr1", 201, 300, "-", 1.0, "A", "G22A")]
+ self.rds.insertUniqs(rdsEntryList)
+ reads = self.rds.getReadsDict()
+ self.assertEquals(1, len(reads))
+ reads = self.rds.getReadsDict()
+ self.assertEquals(2, len(reads["1"]))
+ read = reads["1"][1]
+ self.assertEquals(201, read["start"])
+ reads = self.rds.getReadsDict(strand="+")
+ self.assertEquals(1, len(reads))
+ read = reads["1"][0]
+ self.assertEquals("+", read["sense"])
+ reads = self.rds.getReadsDict(strand="-")
+ self.assertEquals(1, len(reads))
+ reads = self.rds.getReadsDict(start=199)
+ self.assertEquals(1, len(reads["1"]))
+ reads = self.rds.getReadsDict(hasMismatch=True)
+ self.assertEquals(1, len(reads["1"]))
+
+ rdsEntryList = [("testMultiRead", "chr2", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr2", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+ reads = self.rds.getReadsDict()
+ self.assertEquals(1, len(reads))
+ reads = self.rds.getReadsDict(doMulti=True)
+ self.assertEquals(2, len(reads))
+ reads = self.rds.getReadsDict(doUniqs=False, doMulti=True)
+ self.assertFalse(reads.has_key("1"))
+
+
+ def testGetSplicesDict(self):
+ self.assertRaises(sqlite.OperationalError, self.rds.getSplicesDict)
+
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ reads = self.rnaRds.getSplicesDict()
+ self.assertEquals(1, len(reads))
+ self.assertEquals(1, len(reads["1"]))
+ read = reads["1"][0]
+ result = ["startR", "stopL", "sense", "startL", "stopR"]
+ self.assertEquals(result, read.keys())
+ self.assertEquals(1000, read["startL"])
+ self.assertEquals("+", read["sense"])
+ reads = self.rnaRds.getSplicesDict(splitRead=True)
+ self.assertEquals(2, len(reads["1"]))
+ self.assertEquals(1000, reads["1"][0]["startL"])
+ self.assertFalse(reads["1"][0].has_key("startR"))
+ self.assertFalse(reads["1"][0].has_key("stopR"))
+ self.assertEquals(1150, reads["1"][1]["startR"])
+ self.assertFalse(reads["1"][1].has_key("startL"))
+ self.assertFalse(reads["1"][1].has_key("stopL"))
+ self.assertEquals(reads["1"][0]["sense"], reads["1"][1]["sense"])
+
+ reads = self.rnaRds.getSplicesDict(noSense=False, fullChrom=True,
+ withWeight=True, withFlag=True, withMismatch=True, withID=True,
+ withChrom=True, readIDDict=True)
+ self.assertEquals(1, len(reads))
+ self.assertEquals(1, len(reads["testSpliceRead"]))
+ read = reads["testSpliceRead"][0]
+ result = ["readID", "weight", "startR", "mismatch","stopR", "stopL", "flag", "startL", "sense", "chrom"]
+ self.assertEquals(result, read.keys())
+ self.assertEquals("testSpliceRead", read["readID"])
+ self.assertEquals(1.0, read["weight"])
+ self.assertEquals(1150, read["startR"])
+ self.assertEquals("", read["mismatch"])
+ self.assertEquals(1200, read["stopR"])
+ self.assertEquals(1100, read["stopL"])
+ self.assertEquals("", read["flag"])
+ self.assertEquals(1000, read["startL"])
+ self.assertEquals("+", read["sense"])
+ self.assertEquals("chr1", read["chrom"])
+
+ self.assertEquals({}, self.rnaRds.getSplicesDict(hasMismatch=True))
+ self.assertEquals({}, self.rnaRds.getSplicesDict(strand="-"))
+ self.assertEquals(1, len(self.rnaRds.getSplicesDict(strand="+")))
+
+ rdsEntryList = [("testSpliceRead2", "chr1", 2000, 2100, 2150, 2200, "-", 1.0, "A", "G20T")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ reads = self.rnaRds.getSplicesDict()
+ self.assertEquals(1, len(reads))
+ reads = self.rnaRds.getSplicesDict()
+ self.assertEquals(2, len(reads["1"]))
+ read = reads["1"][1]
+ self.assertEquals(2000, read["startL"])
+ reads = self.rnaRds.getSplicesDict(strand="+")
+ self.assertEquals(1, len(reads))
+ read = reads["1"][0]
+ self.assertEquals("+", read["sense"])
+ reads = self.rnaRds.getSplicesDict(strand="-")
+ self.assertEquals(1, len(reads))
+ reads = self.rnaRds.getSplicesDict(start=1199)
+ self.assertEquals(1, len(reads["1"]))
+ reads = self.rnaRds.getSplicesDict(hasMismatch=True)
+ self.assertEquals(1, len(reads["1"]))
+
+ rdsEntryList = [("testSpliceRead3", "chr2", 2000, 2100, 2150, 2200, "-", 1.0, "A", "G20T")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ reads = self.rnaRds.getSplicesDict()
+ self.assertEquals(2, len(reads))
+ self.assertEquals(2, len(reads["1"]))
+ self.assertEquals(1, len(reads["2"]))
+ reads = self.rnaRds.getSplicesDict(withID=True, chrom="chr2")
+ self.assertFalse(reads.has_key("1"))
+ self.assertEquals("testSpliceRead3", reads["2"][0]["readID"])
+
+
+ def testGetCounts(self):
+ self.assertEquals(0, self.rds.getCounts())
+ self.assertEquals((0, 0, 0), self.rds.getCounts(multi=True, reportCombined=False))
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(1, self.rds.getCounts())
+ self.assertEquals((1, 0, 0), self.rds.getCounts(multi=True, reportCombined=False))
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+ self.assertEquals(2, self.rds.getCounts(multi=True))
+ self.assertEquals((1, 1, 0), self.rds.getCounts(multi=True, reportCombined=False))
+
+ self.assertEquals(1, self.rds.getCounts(chrom="chr1"))
+ self.assertEquals(0, self.rds.getCounts(chrom="chr2"))
+ self.assertEquals(1, self.rds.getCounts(rmin=1))
+ self.assertEquals(1, self.rds.getCounts(rmin=1, rmax=1000))
+ self.assertEquals(1, self.rds.getCounts(rmax=1000))
+ self.assertEquals(0, self.rds.getCounts(rmin=1000))
+ self.assertEquals(0, self.rds.getCounts(rmax=0))
+ self.assertEquals(1, self.rds.getCounts(sense="+"))
+ self.assertEquals(0, self.rds.getCounts(sense="-"))
+
+ self.assertEquals(0, self.rnaRds.getCounts())
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ self.assertEquals(1, self.rnaRds.getCounts(splices=True))
+
+
+ def testGetTotalCounts(self):
+ self.assertEquals(0, self.rds.getTotalCounts())
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(1, self.rds.getTotalCounts())
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+
+ self.assertEquals(2, self.rds.getTotalCounts())
+ self.assertEquals(2, self.rds.getTotalCounts(chrom="chr1"))
+ self.assertEquals(0, self.rds.getTotalCounts(chrom="chr2"))
+ self.assertEquals(2, self.rds.getTotalCounts(rmin=1))
+ self.assertEquals(2, self.rds.getTotalCounts(rmax=1000))
+ self.assertEquals(1, self.rds.getTotalCounts(rmin=101, rmax=1000))
+ self.assertEquals(1, self.rds.getTotalCounts(rmin=1, rmax=100))
+ self.assertEquals(0, self.rds.getTotalCounts(rmin=1000))
+ self.assertEquals(0, self.rds.getTotalCounts(rmax=0))
+
+
+ def testGetTableEntryCount(self):
+ # Exercises getTableEntryCount() on the uniqs, multi and splices tables
+ # with the chrom, rmin/rmax, restrict and distinct arguments.
+ table = "uniqs"
+ self.assertEquals(0, self.rds.getTableEntryCount(table))
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(1, self.rds.getTableEntryCount(table))
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+
+ # Inserting multi reads must not change the uniqs count.
+ self.assertEquals(1, self.rds.getTableEntryCount(table))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, chrom="chr1"))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, chrom="chr2"))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=1))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, rmax=1000))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=101, rmax=1000))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=1000))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, rmax=0))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, restrict=" sense ='+' "))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, restrict=" sense ='-' "))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, distinct=True))
+
+ # The two multi rows inserted above are expected to count as one entry.
+ table="multi"
+ self.assertEquals(1, self.rds.getTableEntryCount(table))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, chrom="chr1"))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, chrom="chr2"))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=1))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, rmax=1000))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=101, rmax=1000))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=1000))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, rmax=0))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, restrict=" sense ='+' "))
+ self.assertEquals(0, self.rds.getTableEntryCount(table, restrict=" sense ='-' "))
+ self.assertEquals(1, self.rds.getTableEntryCount(table, distinct=True))
+
+ # For the splices table, rmin / rmax / distinct are expected to work only
+ # with startField="startL"; without it the calls must raise OperationalError.
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ table="splices"
+ self.assertEquals(1, self.rnaRds.getTableEntryCount(table))
+ self.assertEquals(1, self.rnaRds.getTableEntryCount(table, chrom="chr1"))
+ self.assertEquals(0, self.rnaRds.getTableEntryCount(table, chrom="chr2"))
+ self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmin=1, startField="startL"))
+ self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, rmin=1)
+ self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmax=2000, startField="startL"))
+ self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, rmax=2000)
+ self.assertEquals(0, self.rnaRds.getTableEntryCount(table, rmax=999, startField="startL"))
+ self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmin=1000, startField="startL"))
+ self.assertEquals(0, self.rnaRds.getTableEntryCount(table, rmax=0, startField="startL"))
+ self.assertEquals(1, self.rnaRds.getTableEntryCount(table, restrict=" sense ='+' "))
+ self.assertEquals(0, self.rnaRds.getTableEntryCount(table, restrict=" sense ='-' "))
+ self.assertEquals(1, self.rnaRds.getTableEntryCount(table, distinct=True, startField="startL"))
+ self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, distinct=True)
+
+
+ def testGetUniqsCount(self):
+ self.assertEquals(0, self.rds.getUniqsCount())
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(1, self.rds.getUniqsCount())
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+
+ self.assertEquals(1, self.rds.getUniqsCount())
+ self.assertEquals(1, self.rds.getUniqsCount(chrom="chr1"))
+ self.assertEquals(0, self.rds.getUniqsCount(chrom="chr2"))
+ self.assertEquals(1, self.rds.getUniqsCount(rmin=1))
+ self.assertEquals(1, self.rds.getUniqsCount(rmax=1000))
+ self.assertEquals(0, self.rds.getUniqsCount(rmin=101, rmax=1000))
+ self.assertEquals(0, self.rds.getUniqsCount(rmin=1000))
+ self.assertEquals(0, self.rds.getUniqsCount(rmax=0))
+ self.assertEquals(1, self.rds.getUniqsCount(restrict=" sense ='+' "))
+ self.assertEquals(0, self.rds.getUniqsCount(restrict=" sense ='-' "))
+ self.assertEquals(1, self.rds.getUniqsCount(distinct=True))
+
+
+ def testGetSplicesCount(self):
+ self.assertEquals(0, self.rnaRds.getSplicesCount())
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ self.assertEquals(0, self.rnaRds.getSplicesCount())
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rnaRds.insertMulti(rdsEntryList)
+ self.assertEquals(0, self.rnaRds.getSplicesCount())
+
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+
+ self.assertEquals(1, self.rnaRds.getSplicesCount())
+ self.assertEquals(1, self.rnaRds.getSplicesCount(chrom="chr1"))
+ self.assertEquals(0, self.rnaRds.getSplicesCount(chrom="chr2"))
+ self.assertEquals(1, self.rnaRds.getSplicesCount(rmin=1))
+ self.assertEquals(1, self.rnaRds.getSplicesCount(rmax=2000))
+ self.assertEquals(0, self.rnaRds.getSplicesCount(rmax=999))
+ self.assertEquals(1, self.rnaRds.getSplicesCount(rmin=1000))
+ self.assertEquals(0, self.rnaRds.getSplicesCount(rmax=0))
+ self.assertEquals(1, self.rnaRds.getSplicesCount(restrict=" sense ='+' "))
+ self.assertEquals(0, self.rnaRds.getSplicesCount(restrict=" sense ='-' "))
+ self.assertEquals(1, self.rnaRds.getSplicesCount(distinct=True))
+
+
+ def testGetMultiCount(self):
+ self.assertEquals(0, self.rds.getMultiCount())
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.assertEquals(0, self.rds.getMultiCount())
+
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rds.insertMulti(rdsEntryList)
+
+ self.assertEquals(1, self.rds.getMultiCount())
+ self.assertEquals(1, self.rds.getMultiCount(chrom="chr1"))
+ self.assertEquals(0, self.rds.getMultiCount(chrom="chr2"))
+ self.assertEquals(1, self.rds.getMultiCount(rmin=1))
+ self.assertEquals(1, self.rds.getMultiCount(rmax=1000))
+ self.assertEquals(0, self.rds.getMultiCount(rmin=1, rmax=100))
+ self.assertEquals(0, self.rds.getMultiCount(rmin=1000))
+ self.assertEquals(0, self.rds.getMultiCount(rmax=0))
+ self.assertEquals(1, self.rds.getMultiCount(restrict=" sense ='+' "))
+ self.assertEquals(0, self.rds.getMultiCount(restrict=" sense ='-' "))
+ self.assertEquals(1, self.rds.getMultiCount(distinct=True))
+
+
+ def testGetReadIDs(self):
+ # Exercises getReadIDs() on the RNA dataset with the multi, limit, paired,
+ # uniqs and splices arguments as reads are added step by step.
+ self.assertEquals([], self.rnaRds.getReadIDs())
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ result = ["testRead"]
+ self.assertEquals(result, self.rnaRds.getReadIDs())
+
+ # Multi read IDs are only expected in the result when multi=True is passed.
+ rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+ self.rnaRds.insertMulti(rdsEntryList)
+ result = ["testRead"]
+ self.assertEquals(result, self.rnaRds.getReadIDs())
+ result = ["testMultiRead", "testRead"]
+ self.assertEquals(result, self.rnaRds.getReadIDs(multi=True))
+
+ # limit=1 is expected to return only the first ID of the corresponding full list.
+ rdsEntryList = [("testRead2", "chr1", 201, 300, "+", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ result = ["testRead", "testRead2"]
+ self.assertEquals(result, self.rnaRds.getReadIDs())
+ result = ["testRead"]
+ self.assertEquals(result, self.rnaRds.getReadIDs(limit=1))
+ result = ["testMultiRead"]
+ self.assertEquals(result, self.rnaRds.getReadIDs(multi=True, limit=1))
+
+ # With paired=True the ID "testPair/1" is expected to be reported as "testPair".
+ rdsEntryList = [("testPair/1", "chr1", 301, 400, "+", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ result = ["testPair", "testRead", "testRead2"]
+ self.assertEquals(result, self.rnaRds.getReadIDs(paired=True))
+
+ # Splice read IDs are only expected when splices=True; without paired=True
+ # the "/1" suffix is kept.
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ result = ["testSpliceRead"]
+ self.assertEquals(result, self.rnaRds.getReadIDs(uniqs=False, splices=True))
+ result = ["testPair/1", "testRead", "testRead2", "testSpliceRead"]
+ self.assertEquals(result, self.rnaRds.getReadIDs(splices=True))
+
+
+ def testGetMismatches(self):
+ # Exercises getMismatches(): it is expected to raise ReadDatasetError until
+ # a "readsize" metadata entry exists, then to report one
+ # [start, position, base, base] list per mismatch, keyed by chromosome.
+ self.assertRaises(ReadDataset.ReadDatasetError, self.rds.getMismatches)
+ self.rds.insertMetadata([("readsize", "5")])
+
+ # A read with an empty mismatch field contributes an empty list for its chromosome.
+ rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ result = {"chr1": []}
+ self.assertEquals(result, self.rds.getMismatches())
+
+ # Mismatch "C3T" on a read starting at 1 is expected as [1, 3, "T", "C"].
+ rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "C3T")]
+ self.rds.insertUniqs(rdsEntryList)
+ result = {"chr1": [[1, 3, "T", "C"]]}
+ self.assertEquals(result, self.rds.getMismatches())
+ result = {"chr2": []}
+ self.assertEquals(result, self.rds.getMismatches(mischrom="chr2"))
+
+ # The same mismatch on a read starting at 10 is expected at position 12.
+ rdsEntryList = [("testRead", "chr1", 10, 15, "+", 1.0, "", "C3T")]
+ self.rds.insertUniqs(rdsEntryList)
+ result = {"chr1": [[1, 3, "T", "C"], [10, 12, "T", "C"]]}
+ self.assertEquals(result, self.rds.getMismatches())
+
+ rdsEntryList = [("testRead", "chr2", 10, 15, "+", 1.0, "", "C3T")]
+ self.rds.insertUniqs(rdsEntryList)
+ result = {"chr1": [[1, 3, "T", "C"], [10, 12, "T", "C"]],
+ "chr2": [[10, 12, "T", "C"]]}
+ self.assertEquals(result, self.rds.getMismatches())
+
+ # Splice read on the RNA dataset: mismatch "C41T" is expected at position 1040.
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "C41T")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ self.rnaRds.insertMetadata([("readsize", "150")])
+ result = {"chr1": [[1000, 1040, "T", "C"]]}
+ #TODO: This test case fails. If there are only splice entries for a chromosome it shouldn't
+ # be necessary to specify the chromosome.
+ #self.assertEquals(result, self.rnaRds.getMismatches())
+ self.assertEquals(result, self.rnaRds.getMismatches(mischrom="chr1"))
+
+
+ #TODO: needs fixing up
+ def testGetChromProfile(self):
+ # Exercises getChromProfile("chr1") and compares chromProfile.tolist()
+ # with the expected per-position values.
+ chromProfile = self.rds.getChromProfile("chr1")
+ result = []
+ self.assertEquals(result, chromProfile.tolist())
+
+ # Without a "readsize" metadata entry the profile is still expected to be empty.
+ rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ chromProfile = self.rds.getChromProfile("chr1")
+ result = []
+ self.assertEquals(result, chromProfile.tolist())
+
+ self.rds.insertMetadata([("readsize", "5")])
+ chromProfile = self.rds.getChromProfile("chr1")
+ result = [0.0, 1.0, 1.0, 1.0, 1.0]
+ self.assertEquals(result, chromProfile.tolist())
+
+ rdsEntryList = [("testRead2", "chr1", 7, 11, "+", 1.0, "", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ # This doesn't seem to make sense: the default behavior is to only get the first readlen bases.
+ chromProfile = self.rds.getChromProfile("chr1")
+ result = [0.0, 1.0, 1.0, 1.0, 1.0]
+ self.assertEquals(result, chromProfile.tolist())
+
+ # As it stands this doesn't seem right either. Getting an IndexError at currentpos 5.
+ chromProfile = self.rds.getChromProfile("chr1", cstop=11)
+ result = [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+ self.assertEquals(result, chromProfile.tolist())
+
+
+ def testInsertMetadata(self):
+ result = {}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+ self.rds.insertMetadata([("testMeta", "100")])
+ result = {"testMeta": "100"}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+ self.rds.insertMetadata([("testMeta", "200")])
+ result = {"testMeta:2": "200", "testMeta": "100"}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+
+ def testUpdateMetadata(self):
+ result = {}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+ self.rds.insertMetadata([("testMeta", "100")])
+ result = {"testMeta": "100"}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+ self.rds.updateMetadata("testMeta", "200")
+ result = {"testMeta": "200"}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+ self.rds.updateMetadata("testMeta", "300", "200")
+ result = {"testMeta": "300"}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+ self.rds.updateMetadata("testMeta", "200", "200")
+ result = {"testMeta": "300"}
+ self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+
+ def testFlagReads(self):
+ # Exercises flagReads() on the RNA dataset; after each call the flags are
+ # read back for chromosome "1" through the getRDSFlags() helper.
+ readData = self.rnaRds.getReadsDict(withFlag=True)
+ self.assertEquals({}, readData)
+
+ rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ result = [""]
+ flags = self.getRDSFlags("1", self.rnaRds)
+ self.assertEquals(result, flags)
+
+ # An empty region tuple is expected to raise sqlite.ProgrammingError.
+ regions = [()]
+ self.assertRaises(sqlite.ProgrammingError, self.rnaRds.flagReads, regions)
+
+ regions = [("test", "chr1", "0", "1000")]
+ self.rnaRds.flagReads(regions)
+ result = ["test"]
+ flags = self.getRDSFlags("1", self.rnaRds)
+ self.assertEquals(result, flags)
+
+ # Region "test2" begins at 600, past the only read (start 1), so the
+ # existing flag is expected to stay "test".
+ regions = [("test2", "chr1", "600", "1000")]
+ self.rnaRds.flagReads(regions)
+ result = ["test"]
+ flags = self.getRDSFlags("1", self.rnaRds)
+ self.assertEquals(result, flags)
+
+ rdsEntryList = [("testRead2", "chr1", 101, 200, "+", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ regions = [("test2", "chr1", "101", "1000")]
+ self.rnaRds.flagReads(regions)
+ result = ["test", "test2"]
+ flags = self.getRDSFlags("1", self.rnaRds)
+ self.assertEquals(result, flags)
+
+ # With default arguments the multi reads are expected to keep empty flags.
+ rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "", ""),
+ ("testMultiRead", "chr1", 201, 300, "+", 0.5, "", "")]
+ self.rnaRds.insertMulti(rdsEntryList)
+ regions = [("test", "chr1", "0", "1000")]
+ self.rnaRds.flagReads(regions)
+ result = ["test", "test", "", ""]
+ flags = self.getRDSFlags("1", self.rnaRds, doMulti=True)
+ self.assertEquals(result, flags)
+
+ # uniqs=False, multi=True: only the multi reads are expected to change.
+ regions = [("multi", "chr1", "1", "1000")]
+ self.rnaRds.flagReads(regions, uniqs=False, multi=True)
+ result = ["test", "test", "multi", "multi"]
+ flags = self.getRDSFlags("1", self.rnaRds, doMulti=True)
+ self.assertEquals(result, flags)
+
+ # With default arguments the splice read is expected to keep an empty flag.
+ # getRDSFlags(..., splice=True) lists the splice flags first.
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+ regions = [("test", "chr1", "0", "1500")]
+ self.rnaRds.flagReads(regions)
+ result = ["", "test", "test", "multi", "multi"]
+ flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+ self.assertEquals(result, flags)
+
+ # splices=True: the splice read's flag is expected to carry separate L: and R: parts.
+ regions = [("splice", "chr1", "1", "1500")]
+ self.rnaRds.flagReads(regions, uniqs=False, multi=False, splices=True)
+ result = [" L:splice R:splice", "test", "test", "multi", "multi"]
+ flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+ self.assertEquals(result, flags)
+
+ # Regions now carry a fifth element ("+" or "-") and a non-default sense
+ # argument is passed: a "+" region is expected to leave the "-" read unflagged.
+ # NOTE(review): the argument value suggests the default sense is "both" - confirm.
+ rdsEntryList = [("testNegSense", "chr1", 301, 400, "-", 1.0, "", "")]
+ self.rnaRds.insertUniqs(rdsEntryList)
+ regions = [("test", "chr1", "0", "1500", "+")]
+ self.rnaRds.flagReads(regions, sense="anythingBut'Both'")
+ result = ["test", "test", ""]
+ flags = self.getRDSFlags("1", self.rnaRds)
+ self.assertEquals(result, flags)
+
+ # A "-" region is expected to flag only the "-" read.
+ regions = [("neg", "chr1", "0", "1500", "-")]
+ self.rnaRds.flagReads(regions, sense="anythingBut'Both'")
+ result = ["test", "test", "neg"]
+ flags = self.getRDSFlags("1", self.rnaRds)
+ self.assertEquals(result, flags)
+
+
+ def getRDSFlags(self, chromosome, rds, doMulti=False, splice=False):
+ if splice:
+ readData = rds.getSplicesDict(withFlag=True)
+ else:
+ readData = rds.getReadsDict(withFlag=True, doMulti=doMulti)
+
+ flags = []
+ for read in readData[chromosome]:
+ flags.append(read["flag"])
+
+ if splice:
+ nonSplice = self.getRDSFlags(chromosome, rds, doMulti, splice=False)
+ for flag in nonSplice:
+ flags.append(flag)
+
+ return flags
+
+
+ def testSetFlags(self):
+ rdsEntryList = [("test", "chr1", 1, 100, "+", 1.0, "uniq", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.rnaRds.insertUniqs(rdsEntryList)
+ rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""),
+ ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")]
+ self.rnaRds.insertMulti(rdsEntryList)
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "splice", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+
+ result = ["reset"]
+ self.rds.setFlags("reset")
+ flags = self.getRDSFlags("1", self.rds)
+ self.assertEquals(result, flags)
+
+ result = ["splice", "uniq", "resetMulti", "resetMulti"]
+ self.rnaRds.setFlags("resetMulti", uniqs=False, splices=False)
+ flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+ self.assertEquals(result, flags)
+
+ result = ["resetAll", "resetAll", "resetAll", "resetAll"]
+ self.rnaRds.setFlags("resetAll")
+ flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+ self.assertEquals(result, flags)
+
+
+ def testResetFlags(self):
+ rdsEntryList = [("test", "chr1", 1, 100, "+", 1.0, "uniq", "")]
+ self.rds.insertUniqs(rdsEntryList)
+ self.rnaRds.insertUniqs(rdsEntryList)
+ rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""),
+ ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")]
+ self.rnaRds.insertMulti(rdsEntryList)
+ rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "splice", "")]
+ self.rnaRds.insertSplices(rdsEntryList)
+
+ self.rds.resetFlags()
+ result = [""]
+ flags = self.getRDSFlags("1", self.rds)
+ self.assertEquals(result, flags)
+
+ self.rnaRds.resetFlags()
+ result = ["", "", ""]
+ flags = self.getRDSFlags("1", self.rnaRds, doMulti=True)
+ self.assertEquals(result, flags)
+
+ self.rnaRds.resetFlags()
+ result = ["", ""]
+ flags = self.getRDSFlags("1", self.rnaRds, splice=True)
+ self.assertEquals(result, flags)
+
+
+ def testReweighMultireads(self):
+ rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""),
+ ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")]
+ self.rds.insertMulti(rdsEntryList)
+ readData = ("0.25", "chr1", "201", "testMultiRead")
+ self.rds.reweighMultireads([readData])
+ readDict = self.rds.getReadsDict(withWeight=True, doMulti=True)
+ read = readDict["1"][0]
+ self.assertEquals(0.25, read["weight"])
+
+
+ #TODO: write unit test
+ def testSetSynchronousPragma(self):
+ """Placeholder with no assertions; it always passes until a real test is written."""
+ pass
+
+
+ #TODO: write unit test
+ def testSetDBcache(self):
+ """Placeholder with no assertions; it always passes until a real test is written."""
+ pass
+
+
+ #TODO: write unit test
+ def testExecute(self):
+ """Placeholder with no assertions; it always passes until a real test is written."""
+ pass
+
+
+ #TODO: write unit test
+ def testExecuteCommit(self):
+ """Placeholder with no assertions; it always passes until a real test is written."""
+ pass
+
+
+ def testBuildIndex(self):
+ self.assertFalse(self.rds.hasIndex())
+ self.rds.buildIndex()
+ self.assertTrue(self.rds.hasIndex())
+
+
+ def testDropIndex(self):
+ self.assertFalse(self.rds.hasIndex())
+ self.rds.buildIndex()
+ self.assertTrue(self.rds.hasIndex())
+ self.rds.dropIndex()
+ self.assertFalse(self.rds.hasIndex())
+
+ self.assertFalse(self.rnaRds.hasIndex())
+ self.rnaRds.buildIndex()
+ self.assertTrue(self.rnaRds.hasIndex())
+ self.rnaRds.dropIndex()
+ self.assertFalse(self.rnaRds.hasIndex())
+
+
+ #TODO: write unit test
+ def testMemSync(self):
+ """Placeholder with no assertions; it always passes until a real test is written."""
+ pass
+
+
+ #TODO: write unit test
+ def testCopyDBEntriesToMemory(self):
+ """Placeholder with no assertions; it always passes until a real test is written."""
+ pass
+
+
+ #TODO: write unit test
+ def testCopySpliceDBEntriesToMemory(self):
+ """Placeholder with no assertions; it always passes until a real test is written."""
+ pass
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestReadDataset))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 25, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import rnaAToIFilter
+
+
+class TestRnaAToIFilter(unittest.TestCase):
+
+
+ def setUp(self):
+ pass
+
+
+ def tearDown(self):
+ pass
+
+
+ def testRnaAToIFilter(self):
+ snpPropertiesList = []
+ self.assertEquals([], rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 7 8 9 10 11 12 13"]
+ result = []
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"]
+ result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"]
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0\t1\t2\t3\t4\t5\t6\tA-G\t8\t9\t10\t11\t12\tF"]
+ result = ["0\t1\t2\t3\t4\t5\t6\tA-G\t8\t9\t10\t11\t12\tF"]
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 R"]
+ result = []
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"]
+ result = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"]
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 F"]
+ result = []
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+ "0 1 2 3 4 5 6 7 8 9 10 11 12 13"
+ ]
+ result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"]
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+ "0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"
+ ]
+ result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+ "0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"
+ ]
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+ "0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"
+ ]
+ result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+ "0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"
+ ]
+ self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+ snpPropertiesList = ["invalid entry"]
+ self.assertRaises(IndexError, rnaAToIFilter.rnaAToIFilter, snpPropertiesList)
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestRnaAToIFilter))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testRnaAToIFilter']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Aug 23, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import rnaEditing
+
+
+class TestRnaEditing(unittest.TestCase):
+
+
+ def setUp(self):
+ pass
+
+
+ def tearDown(self):
+ pass
+
+
+ def testGetGenesWithMultipleSNPs(self):
+ snpList = []
+ self.assertEquals([], rnaEditing.getGenesWithMultipleSNPs(snpList))
+
+ snpList = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp1"],
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp2"],
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp3"]
+ ]
+
+ result = ["snp3", "snp2", "snp1"]
+ self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList))
+ result = []
+ self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList, minCount=2))
+
+ snpList.append([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp3"])
+ result = ["snp3"]
+ self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList, minCount=2))
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestRnaEditing))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Oct 4, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import transcripts
+
+# Scratch files in the current directory: the input is written in setUp,
+# the output is produced by the code under test, and tearDown removes both.
+inFileName = "testTranscriptsInFile.txt"
+outFileName = "testTranscriptsOutFile.txt"
+
+
+class TestTranscripts(unittest.TestCase):
+
+
+ def setUp(self):
+ self.inFile = open(inFileName, "w")
+ self.inFile.write("line1\t3.5\n")
+ self.inFile.write("line2\t1.5\n")
+ self.inFile.write("line3\tpadding\t3.5\n")
+ self.inFile.close()
+
+
+ def tearDown(self):
+ try:
+ os.remove(outFileName)
+ except OSError:
+ pass
+
+ try:
+ os.remove(inFileName)
+ except OSError:
+ pass
+
+
+ def testTranscripts(self):
+ transcripts.transcripts(inFileName, outFileName)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("line1\t700000.0\t2.3\n", results[0])
+ self.assertEquals("line2\t300000.0\t1.0\n", results[1])
+ self.assertEquals("line3\t700000.0\t2.3\n", results[2])
+
+ def testMain(self):
+ argv = ["transcripts.py", inFileName, outFileName]
+ transcripts.main(argv)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("line1\t700000.0\t2.3\n", results[0])
+ self.assertEquals("line2\t300000.0\t1.0\n", results[1])
+ self.assertEquals("line3\t700000.0\t2.3\n", results[2])
+
+ argv = ["transcripts.py", inFileName, outFileName, "--transcriptome", "400000"]
+ transcripts.main(argv)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("line1\t1400000.0\t4.7\n", results[0])
+ self.assertEquals("line2\t600000.0\t2.0\n", results[1])
+ self.assertEquals("line3\t1400000.0\t4.7\n", results[2])
+
+ argv = ["transcripts.py", inFileName, outFileName, "--cells", "5e5"]
+ transcripts.main(argv)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("line1\t700000.0\t4.7\n", results[0])
+ self.assertEquals("line2\t300000.0\t2.0\n", results[1])
+ self.assertEquals("line3\t700000.0\t4.7\n", results[2])
+
+ argv = ["transcripts.py", inFileName, outFileName, "--efficiency", "0.15"]
+ transcripts.main(argv)
+ output = open(outFileName)
+ results = output.readlines()
+ output.close()
+ self.assertEquals(3, len(results))
+ self.assertEquals("line1\t700000.0\t4.7\n", results[0])
+ self.assertEquals("line2\t300000.0\t2.0\n", results[1])
+ self.assertEquals("line3\t700000.0\t4.7\n", results[2])
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestTranscripts))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+'''
+Created on Jun 4, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import makebedfromrds
+
+
+class TestMakeBedFromRds(unittest.TestCase):
+
+ def testGetSenseColor(self):
+ senseColor = makebedfromrds.getSenseColor('+', .5)
+ self.assertEqual(senseColor, makebedfromrds.MULTI_PLUS_COLOR, "incorrect color for low weight and plus sense color")
+
+ senseColor = makebedfromrds.getSenseColor('-', .5)
+ self.assertEqual(senseColor, makebedfromrds.MULTI_MINUS_COLOR, "incorrect color for low weight and non-plus sense")
+
+ senseColor = makebedfromrds.getSenseColor('+', 5)
+ self.assertEqual(senseColor, makebedfromrds.PLUS_COLOR, "incorrect color for high weight and plus sense")
+
+ senseColor = makebedfromrds.getSenseColor('-', 5)
+ self.assertEqual(senseColor, makebedfromrds.MINUS_COLOR, "incorrect color for high weight and non-plus sense")
+
+
+ def testGetMultiSenseColor(self):
+ senseColor = makebedfromrds.getMultiSenseColor('+')
+ self.assertEqual(senseColor, makebedfromrds.MULTI_PLUS_COLOR, "incorrect color for plus sense")
+
+ senseColor = makebedfromrds.getMultiSenseColor('-')
+ self.assertEqual(senseColor, makebedfromrds.MULTI_MINUS_COLOR, "incorrect color for non-plus sense")
+
+
+ def testGetSingleSenseColor(self):
+ senseColor = makebedfromrds.getSingleSenseColor('+')
+ self.assertEqual(senseColor, makebedfromrds.PLUS_COLOR, "incorrect color for plus sense")
+
+ senseColor = makebedfromrds.getSingleSenseColor('-')
+ self.assertEqual(senseColor, makebedfromrds.MINUS_COLOR, "incorrect color for non-plus sense")
+
+
+ def testGetReadSizes(self):
+ numPieces = 3
+ startList = [0, 1, 2]
+ stopList = [3, 4, 5]
+ readSizes = makebedfromrds.getReadSizes(numPieces, startList, stopList)
+ self.assertEqual(readSizes, "3,3,3", "incorrect read size list")
+
+ readSizes = makebedfromrds.getReadSizes(1, startList, stopList)
+ self.assertEquals(readSizes, "3", "incorrect read size list for numPieces=1")
+
+ self.assertRaises(IndexError, makebedfromrds.getReadSizes, numPieces, [], stopList)
+ self.assertRaises(IndexError, makebedfromrds.getReadSizes, numPieces, startList, [])
+ self.assertRaises(IndexError, makebedfromrds.getReadSizes, 4, startList, stopList)
+
+
+ def testGetReadCoords(self):
+ numPieces = 3
+ startList = [0, 1, 2]
+ readCoords = makebedfromrds.getReadCoords(numPieces, startList)
+ self.assertEqual(readCoords, "0,1,2", "incorrect read coords list")
+
+ readCoords = makebedfromrds.getReadCoords(1, startList)
+ self.assertEqual(readCoords, "0", "incorrect read coords list for numPieces=1")
+
+ self.assertRaises(IndexError, makebedfromrds.getReadCoords, numPieces, [])
+ self.assertRaises(IndexError, makebedfromrds.getReadCoords, 4, startList)
+
+
+ def testGetSpliceColor(self):
+ lpart = 1
+ rpart = 2
+ leftweight = 0.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+ self.assertEqual(aColor, makebedfromrds.SPLICE_COLOR, "incorrect first color for hacktype 1 splice")
+ self.assertEqual(bColor, makebedfromrds.SPLICE_COLOR, "incorrect second color for hacktype 1 splice")
+
+ lpart = 0
+ rpart = 0
+ leftweight = 1.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+ self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 left unique")
+ self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 left unique")
+
+ lpart = 0
+ rpart = 0
+ leftweight = 0.0
+ rightweight = 1.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+ self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 right unique")
+ self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 right unique")
+
+ lpart = 0
+ rpart = 0
+ leftweight = 1.0
+ rightweight = 1.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+ self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 left and right unique")
+ self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 left and right unique")
+
+ lpart = 0
+ rpart = 0
+ leftweight = 0.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+ self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for hacktype 1 multi")
+ self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for hacktype 1 multi")
+
+ lpart = 1
+ rpart = 1
+ leftweight = 0.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+ self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for hacktype 1 lpart + rpart = 2")
+ self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for hacktype 1 lpart + rpart = 2")
+
+ lpart = 2
+ rpart = 0
+ leftweight = 0.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+ self.assertEqual(aColor, makebedfromrds.SPLICE_COLOR, "incorrect first color for left splice")
+ self.assertEqual(bColor, makebedfromrds.SPLICE_COLOR, "incorrect second color for left splice")
+
+ lpart = 0
+ rpart = 0
+ leftweight = 1.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+ self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for left unique")
+ self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for left unique")
+
+ lpart = 0
+ rpart = 0
+ leftweight = 0.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+ self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for multi splice")
+ self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for multi splice")
+
+ lpart = 1
+ rpart = 0
+ leftweight = 0.0
+ rightweight = 0.0
+ aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+ self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for lpart = 1 multi splice")
+ self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for lpart = 1 multi splice")
+
+
+ def testDoNotOutputChromosome(self):
+ self.assertTrue(makebedfromrds.doNotOutputChromosome("chrM", True), "chrM is output when enforceChr=True")
+ self.assertTrue(makebedfromrds.doNotOutputChromosome("chrM", False), "chrM is output when enforceChr=False")
+ self.assertFalse(makebedfromrds.doNotOutputChromosome("chrAny", True), "chr is not output when enforceChr=True")
+ self.assertFalse(makebedfromrds.doNotOutputChromosome("chrAny", False), "chr is not output when enforceChr=False")
+ self.assertTrue(makebedfromrds.doNotOutputChromosome("Bad", True), "bad name chr is output when enforceChr=True")
+ self.assertFalse(makebedfromrds.doNotOutputChromosome("Bad", False), "bad name chr is not output when enforceChr=True")
+
+
+def suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(TestMakeBedFromRds))
+
+ return suite
+
+
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
--- /dev/null
+#
+# transcripts.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 1/25/08.
+#
+""" usage: python %s rpkmFile outFile [--transcriptome size] [--cells count] [--efficiency fraction]
+ where transcriptome size is in Gbp, cell count is in arbitrary units and efficiency is a fraction
+"""
+
+import sys, optparse
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ print "%prog: version 3.0"
+ usage = "usage: python %prog rpkmFile outFile [options]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--transcriptome", type="float", dest="tSize",
+ help="transcriptome size in Gbp [default 200000.0]")
+ parser.add_option("--cells", type="float", dest="cellCount",
+ help="arbitrary units [default 1e6]")
+ parser.add_option("--efficiency", type="float", dest="efficiency",
+ help="fraction [default 0.3]")
+ parser.set_defaults(tSize=200000.0, cellCount=1e6, efficiency=0.3)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 2:
+ print usage
+ sys.exit(1)
+
+ infile = args[0]
+ outfile = args[1]
+
+ transcripts(infile, outfile, options.tSize, options.cellCount, options.efficiency)
+
+
+def transcripts(infilename, outfilename, tSize=200000, cellCount=1e6, efficiency=0.3):
+ infile = open(infilename)
+ outfile = open(outfilename, "w")
+ for line in infile:
+ fields = line.strip().split()
+ rpkm = float(fields[-1])
+ transcripts = rpkm * tSize
+ transPerCell = transcripts / cellCount / efficiency
+ outfile.write("%s\t%.1f\t%.1f\n" % (fields[0], transcripts, transPerCell))
+ infile.close()
+ outfile.close()
+
+# Script entry point: hand the real command line to main().
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# trimquery.py
+# ENRAGE
+#
+# Created by Ali Mortazavi on 8/12/08.
+#
+
+import sys, optparse
+from cistematic.core import complement
+
+print "%prog: version 2.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %prog length infile outfile [--fastq] [--fromback] [--paired] [--flip] [--filter maxN]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--fastq", action="store_true", dest="fastq")
+ parser.add_option("--fromback", action="store_true", dest="fromBack")
+ parser.add_option("--paired", action="store_true", dest="paired")
+ parser.add_option("--flip", action="store_true", dest="flipseq")
+ parser.add_option("--filter", type="int", dest="maxN")
+ parser.set_defaults(fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ print "\t where paired fragments are separated by a : when given the -paired flag"
+ sys.exit(1)
+
+ length = int(args[0])
+ infile = args[1]
+ outfile = args[2]
+
+ trimreads(length, infile, outfile, options.fastq, options.fromBack, options.paired, options.flipseq, options.maxN)
+
+
+def trimreads(length, inFileName, outFileName, fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None):
+ """Trim every sequence line of inFileName and write the result to outFileName.
+
+ length   -- number of characters to keep; negated when fromBack is set so
+             that the slices below count from the end of the line
+ fastq    -- header lines start with "@" or "+" instead of ">"
+ fromBack -- write line[length:] (seq1) instead of line[:length] (seq2)
+ paired   -- write "seq1:seq2" and skip lines shorter than 2 * length
+ flipseq  -- pass both slices through cistematic's complement(); on any
+             exception the unmodified slice is kept
+ maxN     -- when given, lines containing more than maxN "N" characters
+             are dropped and counted
+
+ Progress and summary counts are printed to stdout.
+ """
+ # NOTE(review): infile is never closed in this function; only outfile is.
+ infile = open(inFileName)
+ outfile = open(outFileName, "w")
+
+ # NOTE(review): the diff has lost its indentation, so it is unclear whether
+ # "index = 0" belongs to the "if paired:" block. If it does, index is unbound
+ # for unpaired input and "index += 1" below would fail -- confirm.
+ if paired:
+ pairedlength = 2 * length
+ index = 0
+
+ if fromBack:
+ length = -1 * length
+
+ filtering = False
+ if maxN is not None:
+ filtering = True
+ print "filtering out reads with more than %d Ns" % maxN
+ else:
+ maxN = 2
+
+ # when fromBack is set, the length reported here is already negative
+ print "trimming reads from %s to %d bp and saving them in %s" % (inFileName, length, outFileName)
+
+ filtered = 0
+ header = ""
+ for line in infile:
+ line = line.strip()
+ if len(line) == 0:
+ continue
+
+ firstChar = line[0]
+ # header lines are remembered and emitted in front of the next sequence line
+ if (not fastq and firstChar == ">") or (fastq and firstChar in ["@", "+"]):
+ header = line + "\n"
+ else:
+ if filtering:
+ if line.count("N") > maxN:
+ filtered += 1
+ continue
+
+ seq1 = line[length:]
+ seq2 = line[:length]
+ if flipseq:
+ # best effort: fall back to the raw slice if complement() fails
+ try:
+ tempseq1 = seq1
+ seq1 = complement(tempseq1)
+ except:
+ seq1 = tempseq1
+
+ try:
+ tempseq2 = seq2
+ seq2 = complement(tempseq2)
+ except:
+ seq2 = tempseq2
+
+ if paired:
+ if len(line) < pairedlength:
+ continue
+
+ outfile.write("%s%s:%s\n" % (header, seq1, seq2))
+ else:
+ if fromBack:
+ outfile.write("%s%s\n" % (header, seq1))
+ else:
+ outfile.write("%s%s\n" % (header, seq2))
+
+ index += 1
+ if index % 1000000 == 0:
+ print ".",
+
+ sys.stdout.flush()
+
+ outfile.close()
+ print "returned %d reads" % index
+ if filtering:
+ print "%d additional reads filtered" % filtered
+
+
+# Script entry point: hand the real command line to main().
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# utrChanges.py
+# ENRAGE
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+import sys
+from commoncode import getMergedRegions, getLocusByChromDict
+from cistematic.genomes import Genome
+
+# Version banner; as a module-level statement it also prints on import.
+print "%s: version 1.3" % sys.argv[0]
+
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ if len(argv) < 4:
+ print "usage: python %s genome acceptedfile outfile" % argv[0]
+ sys.exit(1)
+
+ genome = argv[1]
+ acceptfile = argv[2]
+ outfile = argv[3]
+
+ utrChanges(genome, acceptfile, outfile)
+
+
+def utrChanges(genome, acceptfile, outFileName):
+ acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+ outfile = open(outFileName, "w")
+
+ hg = Genome(genome)
+
+ origLocusByChromDict = getLocusByChromDict(hg, keepSense = True)
+ newLocusByChromDict = getLocusByChromDict(hg, additionalRegionsDict = acceptDict, keepSense = True)
+
+ new3utr = 0
+ new5utr = 0
+ changedGene = 0
+
+ for chrom in origLocusByChromDict:
+ for (gstart, gstop, gid, glen, sense) in origLocusByChromDict[chrom]:
+ for (newstart, newstop, newgid, newlen, newsense) in newLocusByChromDict[chrom]:
+ if gid == newgid:
+ changedBoundary = False
+ new3p = "F"
+ new5p = "F"
+ if newstart < gstart:
+ if sense == "R":
+ new3utr += 1
+ new3p = "T"
+ changedBoundary = True
+ elif sense == "F":
+ new5utr += 1
+ new5p = "T"
+ changedBoundary = True
+ else:
+ print sense
+
+ if newstop > gstop:
+ if sense == "R":
+ new5utr += 1
+ new5p = "T"
+ changedBoundary = True
+ elif sense == "F":
+ new3utr += 1
+ new3p = "T"
+ changedBoundary = True
+ else:
+ print sense
+
+ if changedBoundary:
+ changedGene += 1
+ outfile.write("%s\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\t%s\n" % (gid, chrom, gstart, gstop, sense, chrom, newstart, newstop, new5p, new3p))
+
+ continue
+
+ outfile.close()
+ print "%d new 5'utr" % new5utr
+ print "%d new 3'utr" % new3utr
+ print "%s affected genes" % changedGene
+
+
+# Script entry point: hand the real command line to main().
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
--- /dev/null
+#
+# weightMultireads.py
+# ENRAGE
+#
+
+# Created by Ali Mortazavi on 10/02/08.
+#
+
+try:
+ import psyco
+ psyco.full()
+except:
+ pass
+
+from commoncode import readDataset
+import sys, time, string, optparse
+
+print "%prog: version 3.1"
+
+def main(argv=None):
+ if not argv:
+ argv = sys.argv
+
+ usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]"
+
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--radius", type="int", dest="radius")
+ parser.add_option("--noradius", action="store_false", dest="doRadius")
+ parser.add_option("--usePairs", type="int", dest="pairDist")
+ parser.add_option("--verbose", action="store_true", dest="verbose")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.set_defaults(radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 1:
+ print usage
+ sys.exit(1)
+
+ rdsfile = args[0]
+
+ weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages)
+
+
+def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
+ """Reassign the weights of multireads stored in an RDS file.
+
+ Two passes are made over the dataset opened from rdsfile:
+
+ 1. Pair pass (only when pairDist is given): for read pairs with one
+    uniquely mapped mate and one multiread mate, the multiread location
+    closest to the unique mate (within pairDist) is given almost all of
+    the weight via RDS.reweighMultireads.
+ 2. Radius pass (when doRadius is true): every remaining multiread
+    location is weighted by (unique read count within +/- radius of the
+    read midpoint) + 1, normalised over the locations of that read.
+
+ radius     -- window half-width in bp; 100 when not given. Passing a
+               value also forces doRadius to True.
+ cachePages -- when given, the dataset is opened with cache=True, the DB
+               cache is enlarged if it exceeds the default, and the cache
+               is written back with saveCacheDB at the end.
+ verbose    -- print timestamps and intermediate counts.
+ """
+
+ if radius is not None:
+ doRadius = True
+ else:
+ radius = 100
+
+ usePairs = False
+ if pairDist is not None:
+ usePairs = True
+
+ # NOTE(review): indentation was lost in this diff. If the next line sits
+ # outside the "if" above, it raises TypeError whenever pairDist is None
+ # (the default) -- confirm against the real file.
+ tooFar = pairDist * 10
+
+ doCache = False
+ if cachePages is not None:
+ doCache = True
+ else:
+ cachePages = 1
+
+ RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+ readlen = RDS.getReadSize()
+ halfreadlen = readlen / 2
+
+ if cachePages > RDS.getDefaultCacheSize():
+ RDS.setDBcache(cachePages)
+
+ if verbose:
+ print time.ctime()
+
+ multiIDs = RDS.getReadIDs(uniqs=False,multi=True)
+ if verbose:
+ print "got multiIDs ", time.ctime()
+
+ fixedPair = 0
+ fixedReads = []
+ if usePairs:
+ print "doing pairs with pairDist = %d" % pairDist
+ # uidDict / midDict: mainID -> pair suffixes seen among unique / multi reads
+ uidDict = {}
+ midDict = {}
+ jointList = []
+ bothMultiList = []
+ mainIDList = []
+ guDict = {}
+ muDict = {}
+
+ if RDS.dataType == "RNA":
+ uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=True)
+ else:
+ uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=False)
+
+ if verbose:
+ print "got uniqIDs ", time.ctime()
+
+ # read IDs are expected in the form "mainID/pairID"
+ for readID in uniqIDs:
+ (mainID, pairID) = readID.split("/")
+ try:
+ uidDict[mainID].append(pairID)
+ except:
+ uidDict[mainID] = [pairID]
+ mainIDList.append(mainID)
+
+ if verbose:
+ print "uidDict all ", len(uidDict), time.ctime()
+
+ # pairs whose two mates are both unique need no fixing
+ for mainID in mainIDList:
+ if len(uidDict[mainID]) == 2:
+ del uidDict[mainID]
+
+ if verbose:
+ print "uidDict first candidates ", len(uidDict), time.ctime()
+
+ # multiread IDs are expected in the form "mainID/pairID::multiplicity"
+ for readID in multiIDs:
+ (frontID, multiplicity) = readID.split("::")
+ (mainID, pairID) = frontID.split("/")
+ try:
+ if pairID not in midDict[mainID]:
+ midDict[mainID].append(pairID)
+ except:
+ midDict[mainID] = [pairID]
+
+ if verbose:
+ print "all multis ", len(midDict), time.ctime()
+
+ # keep only unique mates whose partner shows up among the multireads
+ mainIDList = uidDict.keys()
+ for mainID in mainIDList:
+ if mainID not in midDict:
+ del uidDict[mainID]
+
+ if verbose:
+ print "uidDict actual candidates ", len(uidDict), time.ctime()
+
+ # jointList: one multi mate plus a unique mate; bothMultiList: two multi mates
+ for readID in midDict:
+ listLen = len(midDict[readID])
+ if listLen == 1:
+ if readID in uidDict:
+ jointList.append(readID)
+ elif listLen == 2:
+ bothMultiList.append(readID)
+
+ if verbose:
+ print "joint ", len(jointList), time.ctime()
+ print "bothMulti ", len(bothMultiList), time.ctime()
+
+ del uidDict
+ del midDict
+ del mainIDList
+ del uniqIDs
+
+ uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
+ if verbose:
+ print "got uniq dict ", len(uniqDict), time.ctime()
+
+ if RDS.dataType == "RNA":
+ spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
+ if verbose:
+ print "got splice dict ", len(spliceDict), time.ctime()
+
+ # take the unique mate from the read table, else from the splice table (RNA only)
+ for readID in jointList:
+ try:
+ guDict[readID] = uniqDict[readID][0]
+ except:
+ if RDS.dataType == "RNA":
+ guDict[readID] = spliceDict[readID][0]
+
+ del uniqDict
+ # NOTE(review): spliceDict is only assigned when RDS.dataType == "RNA"; for
+ # any other data type this del looks like it raises NameError -- confirm.
+ del spliceDict
+ if verbose:
+ print "guDict actual ", len(guDict), time.ctime()
+
+ multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
+ if verbose:
+ print "got multi dict ", len(multiDict), time.ctime()
+
+ for readID in jointList:
+ muDict[readID] = multiDict[readID]
+
+ for readID in bothMultiList:
+ muDict[readID] = multiDict[readID]
+
+ del multiDict
+ if verbose:
+ print "muDict actual ", len(muDict), time.ctime()
+
+ RDS.setSynchronousPragma("OFF")
+ for readID in jointList:
+ # a 3-tuple is a plain read; the 6-tuple fallback is a spliced read
+ try:
+ (ustart, uchrom, upair) = guDict[readID]
+ ustop = ustart + readlen
+ except:
+ (ustart, lstop, rstart, ustop, uchrom, upair) = guDict[readID]
+
+ muList = muDict[readID]
+ muLen = len(muList)
+ # distance of each multiread location to the unique mate; tooFar = no match
+ bestMatch = [tooFar] * muLen
+ found = False
+ for index in range(muLen):
+ (mstart, mchrom, mpair) = muList[index]
+ if uchrom != mchrom:
+ continue
+
+ if abs(mstart - ustart) < pairDist:
+ bestMatch[index] = abs(mstart - ustart)
+ found = True
+ elif abs(mstart - ustop) < pairDist:
+ bestMatch[index] = abs(mstart - ustop)
+ found = True
+
+ if found:
+ theMatch = -1
+ theDist = tooFar
+ reweighList = []
+ for index in range(muLen):
+ if theDist > bestMatch[index]:
+ theMatch = index
+ theDist = bestMatch[index]
+
+ # NOTE(review): mpair here is whatever the scan loop above unpacked last,
+ # not necessarily the pair ID of the best match -- confirm this is intended.
+ theID = string.join([readID, mpair], "/")
+ # the matched location gets nearly all the weight, the others 1/(100*muLen) each
+ for index in range(muLen):
+ if index == theMatch:
+ score = 1 - (muLen - 1) / (100. * (muLen))
+ else:
+ score = 1 / (100. * muLen)
+
+ start = muList[index][0]
+ chrom = "chr%s" % muList[index][1]
+ reweighList.append((round(score,3), chrom, start, theID))
+
+ # NOTE(review): "> 0" means a best match at index 0 is never reweighted;
+ # presumably ">= 0" was intended -- confirm.
+ if theMatch > 0:
+ RDS.reweighMultireads(reweighList)
+ fixedPair += 1
+ if verbose and fixedPair % 10000 == 1:
+ print "fixed %d" % fixedPair
+ print guDict[readID]
+ print muDict[readID]
+ print reweighList
+
+ fixedReads.append(theID)
+
+ RDS.setSynchronousPragma("ON")
+
+ del guDict
+ del muDict
+ print "fixed %d pairs" % fixedPair
+ print time.ctime()
+
+ skippedReads = 0
+ if doRadius:
+ print "doing uniq read radius with radius = %d" % radius
+ multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
+ print "got multiDict"
+ RDS.setSynchronousPragma("OFF")
+ rindex = 0
+ for readID in multiIDs:
+ theID = readID
+ # fixedReads is a list, so this membership test is a linear scan per read
+ if theID in fixedReads:
+ skippedReads += 1
+ continue
+
+ if "::" in readID:
+ (readID, multiplicity) = readID.split("::")
+
+ scores = []
+ coords = []
+ for read in multiDict[readID]:
+ (start, weight, rID, chrom) = read
+ achrom = "chr%s" % chrom
+ regionStart = start + halfreadlen - radius
+ regionStop = start + halfreadlen + radius
+ uniqs = RDS.getCounts(achrom, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
+ # +1 keeps every location at a non-zero share
+ scores.append(uniqs + 1)
+ coords.append((achrom, start, theID))
+
+ total = float(sum(scores))
+ reweighList = []
+ for index in range(len(scores)):
+ reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
+
+ RDS.reweighMultireads(reweighList)
+ rindex += 1
+ if rindex % 10000 == 0:
+ print rindex
+
+ RDS.setSynchronousPragma("ON")
+ if verbose:
+ print "skipped ", skippedReads
+
+ # NOTE(review): rindex is only assigned inside the doRadius branch; if this
+ # print sits outside that branch it fails when doRadius is False -- confirm.
+ print "reweighted ", rindex
+
+ if doCache:
+ RDS.saveCacheDB(rdsfile)
+
+ if verbose:
+ print "finished", time.ctime()
+
+
+# Script entry point: hand the real command line to main().
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file