import re
import optparse
import random
+import string
import pysam
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
def main(argv=None):
if not argv:
argv = sys.argv
- verstring = "MakeBamFromRds: version 1.0"
+ verstring = "makeBamFromRds: version 1.0"
print verstring
doPairs = False
usage = "usage: python %prog rdsFile bamFile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--nouniq", action="store_false", dest="withUniqs")
- parser.add_option("--nomulti", action="store_false", dest="withMulti")
- parser.add_option("--splices", action="store_true", dest="doSplices")
- parser.add_option("--flag", dest="withFlag")
- parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
- parser.add_option("--pairs", action="store_true", dest="doPairs")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
- parser.add_option("--chrom", action="append", dest="chromList")
- parser.add_option("--fasta", dest="fastaFileName")
- parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
- doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
- doCache=False, cachePages=100000, fastaFileName="",
- chromList=[])
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
options.chromList, options.fastaFileName)
+def getParser(usage):  # Build the makeBamFromRds optparse parser and seed its defaults through the commoncode config helpers.
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--nouniq", action="store_false", dest="withUniqs")  # store_false: giving the flag turns withUniqs off
+ parser.add_option("--nomulti", action="store_false", dest="withMulti")  # store_false: giving the flag turns withMulti off
+ parser.add_option("--splices", action="store_true", dest="doSplices")
+ parser.add_option("--flag", dest="withFlag")
+ parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+ parser.add_option("--pairs", action="store_true", dest="doPairs")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+ parser.add_option("--chrom", action="append", dest="chromList")  # append: may be repeated, one chromosome per use
+ parser.add_option("--fasta", dest="fastaFileName")
+ # Look every default up under `section`; the last argument of each helper is presumably the fallback used when the entry is absent (helpers live in commoncode, not shown here).
+ configParser = getConfigParser()
+ section = "MakeBamFromRds"  # NOTE(review): capitalized, unlike the "makeRdsFromBam" section used by the sibling script; confirm it matches the config file
+ withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
+ withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+ doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+ doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
+ withFlag = getConfigOption(configParser, section, "withFlag", "")
+ useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
+ enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+ doCache = getConfigBoolOption(configParser, section, "doCache", False)  # no add_option above targets doCache, so only the config can change it
+ cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+ fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
+ # Register the looked-up values as optparse defaults; options given on the command line still take precedence.
+ parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
+ doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
+ doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
+ chromList=[])  # chromList always starts empty; it is not read from the config
+
+ return parser
+
+
def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
doSplices=False, doPairs=False, withFlag="",
useFlagLike=False, enforceChr=False, allChrom=True,
sys.exit(1)
print "\nsample:"
- RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
if cachePages > RDS.getDefaultCacheSize():
RDS.setDBcache(cachePages)
outfile = pysam.Samfile(outfilename, "wb", header=header)
totalWrites = 0
- noncanonicalSplices = 0
+ noncanonicalSpliceCount = 0
for chrom in chromList:
index = 0
print "chromosome %s" % (chrom)
if withUniqs or withMulti:
hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
- withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
- flagLike=useFlagLike, entryDict=True)
+ doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
+ flagLike=useFlagLike, withMismatch=True)
for read in hitDict[chrom]:
writeBAMEntry(outfile, chrom, read, readlength)
index += 1
if doSplices:
- numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
+ numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
index += numSpliceReadsWritten
- noncanonicalSplices += noncanonical
+ noncanonicalSpliceCount += noncanonical
print index
totalWrites += index
outfile.close()
print "%d total reads written" % totalWrites
- print "%d non-canonical splices" % noncanonicalSplices
+ print "%d non-canonical splices" % noncanonicalSpliceCount
-def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
+def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
index = 0
noncanonicalSplices = 0
- spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
+ spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
+ withMismatch=True)
if chrom not in spliceDict:
pass
else:
for read in spliceDict[chrom]:
if fastaSequenceDict.has_key(chrom):
- read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
+ read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
noncanonicalSplices += noncanonical
writeBAMEntry(outfile, chrom, read, readlength)
def writeBAMEntry(outfile, chrom, outputDict, readlength):
+ """ We need to subtract 1 from the position because rds is 1 based and
+ most of the rest of the entire world is 0 based.
+ """
tagList = []
alignedRead = pysam.AlignedRead()
- alignedRead.qname = outputDict["readID"]
+ try:
+ (readID, pairID) = outputDict["readID"].split("/")
+ paired = True
+ except ValueError:
+ readID = outputDict["readID"]
+ paired = False
+
+ alignedRead.qname = readID
if outputDict["sense"] == "-":
alignedRead.is_reverse = True
alignedRead.rname = outfile.references.index(chrom)
if outputDict.has_key("startL"):
- startL = outputDict["startL"]
- stopL = outputDict["stopL"]
- startR = outputDict["startR"]
- stopR = outputDict["stopR"]
+ startL = outputDict["startL"] - 1
+ stopL = outputDict["stopL"] - 1
+ startR = outputDict["startR"] - 1
+ stopR = outputDict["stopR"] - 1
alignedRead.pos = startL
- alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
- tagList.append(("XS", outputDict["sense"]))
+ alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
+ tagList.append(("XS", str(outputDict["sense"])))
else:
- alignedRead.pos = outputDict["start"]
+ alignedRead.pos = outputDict["start"] - 1
alignedRead.cigar = [(0, readlength)]
- if outputDict.has_key("pairID"):
- pairID = outputDict["pairID"]
+ if paired:
if pairID == "1":
alignedRead.is_read1 = True
alignedRead.is_proper_pair = True
if outputDict.has_key("mismatch"):
mismatchTag = getMismatches(outputDict["mismatch"])
if mismatchTag:
- tagList.append(("MD", mismatchTag))
-
+ tagList.append(("MD", str(mismatchTag)))
+
if tagList:
- alignedRead.tags = tagList
+ alignedRead.tags = tuple(tagList)
outfile.write(alignedRead)
def getMismatches(mismatchString):
- mismatch = ""
+ mismatchList = []
positions = re.findall("\d+", mismatchString)
nucleotides = re.findall("([ACGTN])\d+", mismatchString)
for index in range(0, len(positions)):
- mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
+ mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
+
+ mismatch = string.join(mismatchList, "")
return mismatch
return fastaSeqDict
-def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
+def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
spliceSense = {"GTAG": "+",
"GCAG": "+",
"ATAC": "+",
intronlen = startRight - stopLeft
leftJunctionSig =fastaSequence[intronstart:intronstart+2]
rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
- spliceJunction = leftJunctionSig + rightJunctionSig
+ spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
spliceJunction = spliceJunction.upper()
+ print spliceJunction
if spliceSense.has_key(spliceJunction):
sense = spliceSense[spliceJunction]
else:
except:
pass
-import sys, string, optparse, re
+import sys
+import string
+import optparse
+import re
import pysam
-from commoncode import readDataset, writeLog
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
+import ReadDataset
-verstring = "%prog: version 1.0"
+INSERT_SIZE = 100000
+verstring = "makeRdsFromBam: version 1.0"
def main(argv=None):
usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
\ninput reads must be sorted to properly record multireads"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--append", action="store_false", dest="init",
- help="append to existing rds file [default: create new]")
- parser.add_option("--RNA", action="store_true", dest="rnaDataType",
- help="set data type to RNA [default: DNA]")
- parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
- help="input file is in sam format")
- parser.add_option("--index", action="store_true", dest="doIndex",
- help="index the output rds file")
- parser.add_option("--cache", type="int", dest="cachePages",
- help="number of cache pages to use [default: 100000")
- parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
- help="multi counts over this value are discarded [default: 10]")
- parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
- help="use the raw read names")
- parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
- maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
try:
options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
+def getParser(usage):  # Build the makeRdsFromBam optparse parser and seed its defaults through the commoncode config helpers.
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--append", action="store_false", dest="init",  # store_false: --append clears init
+ help="append to existing rds file [default: create new]")
+ parser.add_option("--RNA", action="store_true", dest="rnaDataType",
+ help="set data type to RNA [default: DNA]")
+ parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
+ help="input file is in sam format")
+ parser.add_option("--index", action="store_true", dest="doIndex",
+ help="index the output rds file")
+ parser.add_option("--cache", type="int", dest="cachePages",
+ help="number of cache pages to use [default: 100000")  # NOTE(review): help text lacks its closing "]"
+ parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
+ help="multi counts over this value are discarded [default: 10]")
+ parser.add_option("--rawreadID", action="store_false", dest="trimReadID",  # store_false: --rawreadID clears trimReadID
+ help="use the raw read names")
+ # Look every default up under `section`; the last argument of each helper is presumably the fallback used when the entry is absent (helpers live in commoncode, not shown here).
+ configParser = getConfigParser()
+ section = "makeRdsFromBam"
+ init = getConfigBoolOption(configParser, section, "init", True)
+ doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+ useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
+ cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+ maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
+ rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
+ trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
+ # Register the looked-up values as optparse defaults; options given on the command line still take precedence.
+ parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
+ maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)
+
+ return parser
+
+
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
- rds = readDataset(outDbName, init, dataType, verbose=True)
+ rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
if not init and doIndex:
try:
if rds.hasIndex():
if len(propertyList) > 0:
rds.insertMetadata(propertyList)
- countReads = {"unmapped": 0,
- "total": 0,
- "unique": 0,
- "multi": 0,
- "multiDiscard": 0,
- "splice": 0
+ totalReadCounts = {"unmapped": 0,
+ "total": 0,
+ "unique": 0,
+ "multi": 0,
+ "multiDiscard": 0,
+ "splice": 0
}
readsize = 0
- insertSize = 100000
uniqueInsertList = []
multiInsertList = []
for read in samFileIterator:
if read.is_unmapped:
- countReads["unmapped"] += 1
+ totalReadCounts["unmapped"] += 1
continue
if readsize == 0:
- take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
+ take = (0, 1) # CIGAR operation (M/match, I/insertion)
readsize = sum([length for op,length in read.cigar if op in take])
if init:
rds.insertMetadata([("readsize", readsize)])
pairReadSuffix = getPairedReadNumberSuffix(read)
readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
if trimReadID:
- rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
+ rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
else:
rdsEntryName = read.qname
else:
uniqueReadDict[readName] = (read, rdsEntryName)
- if countReads["total"] % insertSize == 0:
+ if totalReadCounts["total"] % INSERT_SIZE == 0:
for entry in uniqueReadDict.keys():
(readData, rdsEntryName) = uniqueReadDict[entry]
chrom = samfile.getrname(readData.rname)
uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
- countReads["unique"] += 1
+ totalReadCounts["unique"] += 1
for entry in spliceReadDict.keys():
(readData, rdsEntryName) = spliceReadDict[entry]
chrom = samfile.getrname(readData.rname)
spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
- countReads["splice"] += 1
+ totalReadCounts["splice"] += 1
for entry in multiReadDict.keys():
(readData, count, rdsEntryName) = multiReadDict[entry]
chrom = samfile.getrname(readData.rname)
if count > maxMultiReadCount:
- countReads["multiDiscard"] += 1
+ totalReadCounts["multiDiscard"] += 1
else:
multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
- countReads["multi"] += 1
+ totalReadCounts["multi"] += 1
rds.insertUniqs(uniqueInsertList)
rds.insertMulti(multiInsertList)
sys.stdout.flush()
processedEntryDict = {}
- countReads["total"] += 1
+ totalReadCounts["total"] += 1
if len(uniqueReadDict.keys()) > 0:
for entry in uniqueReadDict.keys():
(readData, rdsEntryName) = uniqueReadDict[entry]
chrom = samfile.getrname(readData.rname)
uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
- countReads["unique"] += 1
+ totalReadCounts["unique"] += 1
rds.insertUniqs(uniqueInsertList)
(readData, count, rdsEntryName) = multiReadDict[entry]
chrom = samfile.getrname(readData.rname)
if count > maxMultiReadCount:
- countReads["multiDiscard"] += 1
+ totalReadCounts["multiDiscard"] += 1
else:
multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
- countReads["multi"] += 1
+ totalReadCounts["multi"] += 1
- countReads["multi"] += len(multiInsertList)
+ totalReadCounts["multi"] += len(multiInsertList)
if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
for entry in spliceReadDict.keys():
(readData, rdsEntryName) = spliceReadDict[entry]
chrom = samfile.getrname(readData.rname)
spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
- countReads["splice"] += 1
+ totalReadCounts["splice"] += 1
rds.insertSplices(spliceInsertList)
- countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
- countString += "\t%d unique reads" % countReads["unique"]
- countString += "\t%d multi reads" % countReads["multi"]
- countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
+ countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
+ countStringList.append("%d unique reads" % totalReadCounts["unique"])
+ countStringList.append("%d multi reads" % totalReadCounts["multi"])
+ countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
if dataType == "RNA":
- countString += "\t%d spliced reads" % countReads["splice"]
+ countStringList.append("%d spliced reads" % totalReadCounts["splice"])
- print countString.replace("\t", "\n")
-
- writeLog("%s.log" % outDbName, verstring, countString)
+ print string.join(countStringList, "\n")
+ outputCountText = string.join(countStringList, "\t")
+ writeLog("%s.log" % outDbName, verstring, outputCountText)
if doIndex:
print "building index...."
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
start = int(alignedRead.pos)
- stop = int(start+readSize)
+ stop = int(start + readSize)
sense = getReadSense(alignedRead.is_reverse)
try:
mismatchTag = alignedRead.opt("MD")
genomicNucleotide = "N"
if sense == "-":
- mismatch = getComplementNucleotide(mismatch)
- genomicNucleotide = getComplementNucleotide(genomicNucleotide)
+ mismatch = getReverseComplement(mismatch)
+ genomicNucleotide = getReverseComplement(genomicNucleotide)
- elandCompatiblePosition = int(position + 1)
- output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
+ erange1BasedElandCompatiblePosition = int(position + 1)
+ output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
position += 1
except IndexError:
if logErrors:
return string.join(output, ",")
-def getComplementNucleotide(nucleotide):
- complement = {"A": "T",
- "T": "A",
- "C": "G",
- "G": "C",
- "N": "N"
- }
-
- return complement[nucleotide]
-
-
def getSpliceBounds(start, readsize, cigarTupleList):
stopR = int(start + readsize)
offset = 0
--- /dev/null
+class Peak(object):
+ """
+ Class describing a peak.
+ """
+ # The constructor stores its six arguments unchanged: topPos and numHits go into underscore-prefixed fields, the rest into public attributes.
+ def __init__(self, topPos, numHits, smoothArray, numPlus, numLeftPlus=0, shift=0):
+ self._topPos = topPos  # exposed through the topPos property below
+ self._numHits = numHits  # exposed through the numHits property below
+ self.smoothArray = smoothArray
+ self.numPlus = numPlus
+ self.numLeftPlus = numLeftPlus
+ self.shift = shift
+
+ # topPos getter: returns the stored value as is.
+ @property
+ def topPos(self):
+ return self._topPos
+
+ # topPos setter: replaces the stored value without validation.
+ @topPos.setter
+ def topPos(self, topPos):
+ self._topPos = topPos
+
+ # numHits getter: returns the stored value as is.
+ @property
+ def numHits(self):
+ return self._numHits
+
+ # numHits setter: replaces the stored value without validation.
+ @numHits.setter
+ def numHits(self, numHits):
+ self._numHits = numHits
+
\ No newline at end of file
-"""
-Created on Jul 1, 2010
-
-@author: sau
-"""
-
import sqlite3 as sqlite
import string
import tempfile
import shutil
import os
-from os import environ
from array import array
-from commoncode import getReverseComplement
-
-if environ.get("CISTEMATIC_TEMP"):
- cisTemp = environ.get("CISTEMATIC_TEMP")
-else:
- cisTemp = "/tmp"
+from commoncode import getReverseComplement, getConfigParser, getConfigOption
-tempfile.tempdir = cisTemp
-currentRDSVersion = "1.1"
+currentRDSVersion = "2.0"
class ReadDatasetError(Exception):
def cacheDB(self, filename):
""" copy geneinfoDB to a local cache.
"""
+ configParser = getConfigParser()
+ cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
+ tempfile.tempdir = cisTemp
self.cachedDBFile = "%s.db" % tempfile.mktemp()
shutil.copyfile(filename, self.cachedDBFile)
and which can be restricted by chromosome or custom-flag.
Returns unique reads by default, but can return multireads
with doMulti set to True.
+
+ Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
"""
whereClause = []
resultsDict = {}
--- /dev/null
+import string
+
+class Region(object):
+ """
+ Region description
+ """
+
+ # Only start and stop are required; every other field has a default and is stored unchanged.
+ def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
+ self.label = label
+ self.index = index
+ self.chrom = chrom
+ self.start = start
+ self.stop = stop
+ self.numReads = numReads
+ self.foldRatio = foldRatio
+ self.multiP = multiP
+ self.peakDescription = peakDescription
+ self.shift = shift
+ self.length = abs(self.stop - self.start)  # computed once here; not refreshed if start or stop is reassigned later
+ self.peakPos = peakPos
+ self.peakHeight = peakHeight
+
+ # Despite its name, printRegion() prints nothing: it returns the eight fields below joined by `delimiter`.
+ def printRegion(self, delimiter="\t"):
+ fields = ["%s%d" % (self.label, self.index),  # label and index are fused into one column
+ "%s" % self.chrom,
+ "%d" % self.start,
+ "%d" % self.stop,
+ "%.1f" % self.numReads,  # numReads, foldRatio and multiP are written with one decimal place
+ "%.1f" % self.foldRatio,
+ "%.1f" % self.multiP,
+ "%s" % self.peakDescription
+ ]
+
+ return string.join(fields, delimiter)
+
+ # Returns the printRegion() text with the shift appended as one more `delimiter`-separated column.
+ def printRegionWithShift(self, delimiter="\t"):
+ fields = [self.printRegion(delimiter)]  # dispatches through self, so a subclass override of printRegion() is used
+ fields.append("%d" % self.shift)
+
+ return string.join(fields, delimiter)
+
+
+class DirectionalRegion(Region):
+ """
+ Region with percentage of plus reads.
+ """
+ # peakPos and peakHeight are not accepted or forwarded here, so instances keep the defaults of Region.__init__ (0) for both.
+ def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., plusP=0., leftP=0., peakDescription="", shift=0):
+ Region.__init__(self, start, stop, label, index, chrom, numReads, foldRatio, multiP, peakDescription, shift)
+ self.plusP = plusP
+ self.leftP = leftP
+
+ # Overrides Region.printRegion(): same columns, with plusP and leftP inserted just before peakDescription.
+ def printRegion(self, delimiter="\t"):
+ fields = ["%s%d" % (self.label, self.index),
+ "%s" % self.chrom,
+ "%d" % self.start,
+ "%d" % self.stop,
+ "%.1f" % self.numReads,
+ "%.1f" % self.foldRatio,
+ "%.1f" % self.multiP,
+ "%.1f" % self.plusP,
+ "%.1f" % self.leftP,
+ "%s" % self.peakDescription
+ ]
+
+ return string.join(fields, delimiter)
\ No newline at end of file
except:
print 'psyco not running'
-print 'version 3.6'
+print "altSpliceCounts: version 3.7"
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption
def main(argv=None):
if not argv:
usage = "usage: python %s rdsfile outfilename [--cache pages]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--cache", type="int", dest="numCachePages",
- help="number of cache pages to use [default: 100000]")
- parser.set_defaults(numCachePages=None)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
altSpliceCounts(hitfile, outfilename, doCache, cachePages)
+def makeParser(usage=""):  # Build the altSpliceCounts optparse parser; the --cache default is looked up via the commoncode config helpers.
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--cache", type="int", dest="numCachePages",
+ help="number of cache pages to use [default: 100000]")
+ # Look the default up under `section`; the last argument is presumably the fallback used when the entry is absent (helper lives in commoncode, not shown here).
+ configParser = getConfigParser()
+ section = "altSpliceCounts"
+ numCachePages = getConfigOption(configParser, section, "numCachePages", None)  # NOTE(review): fallback is None although the help text advertises 100000
+ # Register the looked-up value as the optparse default; --cache on the command line still takes precedence.
+ parser.set_defaults(numCachePages=numCachePages)
+
+ return parser
+
+
def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
startDict = {}
stopDict = {}
resultDict = {}
- hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
index = 0
for chrom in hitDict:
- for (tagStart, lstop, rstart, tagStop) in hitDict[chrom]:
+ for read in hitDict[chrom]:
+ tagStart = read["startL"]
+ tagStop = read["stopR"]
index += 1
length = tagStop - tagStart
if length < readlen + 5:
except:
print "psyco not running"
-import sys, optparse
+import sys
+import optparse
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption
from cistematic.cisstat.analyzego import calculateGOStats
-from cistematic.core.geneinfo import geneinfoDB
-print "version 2.1"
+print "analyzego: version 2.2"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome infilename prefix [--geneName] [--field fieldID]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--geneName", action="store_true", dest="translateGene",
- help="translate gene")
- parser.add_option("--field", type="int", dest="fieldID",
- help="column containing gene ID/Name")
- parser.set_defaults(translateGene=False, fieldID=None)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--geneName", action="store_true", dest="translateGene",
+ help="translate gene")
+ parser.add_option("--field", type="int", dest="fieldID",
+ help="column containing gene ID/Name")
+
+ configParser = getConfigParser()
+ section = "analyzego"
+ translateGene = getConfigOption(configParser, section, "translateGene", False)
+ fieldID = getConfigOption(configParser, section, "fieldID", None)
+
+ parser.set_defaults(translateGene=translateGene, fieldID=fieldID)
+
+ return parser
+
+
def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
    """ Open infilename and hand the open file to analyzeGO().

        translateGene and fieldID are forwarded as received; the previous
        call hard-coded translateGene=False and fieldID=1, so the values
        chosen by the caller were silently ignored.
    """
    infile = open(infilename)
    analyzeGO(genome, infile, prefix, translateGene=translateGene, fieldID=fieldID)
def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
if translateGene:
- idb = geneinfoDB(cache=True)
- geneinfoDict = idb.getallGeneInfo(genome)
- symbolToGidDict = {}
- for gid in geneinfoDict:
- symbol = geneinfoDict[gid][0][0].strip()
- symbolToGidDict[symbol] = gid
+ symbolToGidDict = getSymbolDict(genome)
locusList = []
for line in geneInfoList:
if len(locusList) > 0:
calculateGOStats(locusList, prefix)
+
+def getSymbolDict(genome):  # Return a dict mapping each stripped gene symbol to its gene id, built from getGeneInfoDict(genome, cache=True).
+ geneinfoDict = getGeneInfoDict(genome, cache=True)
+ symbolToGidDict = {}
+ for gid in geneinfoDict:
+ symbol = geneinfoDict[gid][0][0].strip()  # presumably the first field of the first record for this gid; confirm against getGeneInfoDict
+ symbolToGidDict[symbol] = gid  # a symbol shared by several gids keeps whichever gid is visited last
+
+ return symbolToGidDict
+
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file
-import sys, string
+import sys
+import string
def main(argv=None):
if not argv:
import sys
-print 'version 1.0'
+print "binstocdf: version 1.1"
def main(argv=None):
if not argv:
#
# Created by Ali Mortazavi on 3/6/09.
#
-import sys, string, optparse
-from commoncode import writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption
-versionString = "%prog: version 1.3"
+versionString = "buildMatrix: version 1.5"
print versionString
usage = "usage: python %prog matrix.step.N-1 data.part matrix.step.N [--rescale] [--truncate maxRPKM] [--log altlogfile]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--rescale", action="store_true", dest="rescale")
- parser.add_option("--truncate", type="int", dest="maxRPKM")
- parser.add_option("--log", dest="logfilename")
- parser.set_defaults(rescale=False, maxRPKM=None, logfilename="buildMatrix.log")
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.rescale, options.logfilename)
+def makeParser(usage=""):  # Build the buildMatrix optparse parser and seed its defaults through the commoncode config helpers.
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--rescale", action="store_true", dest="rescale")
+ parser.add_option("--truncate", type="int", dest="maxRPKM")
+ parser.add_option("--log", dest="logfilename")
+ # Look every default up under `section`; the last argument of each helper is presumably the fallback used when the entry is absent (helpers live in commoncode, not shown here).
+ configParser = getConfigParser()
+ section = "buildMatrix"
+ rescale = getConfigBoolOption(configParser, section, "rescale", False)
+ maxRPKM = getConfigOption(configParser, section, "maxRPKM", None)  # read with the plain helper even though --truncate is declared type="int"
+ logfilename = getConfigOption(configParser, section, "logfilename", "buildMatrix.log")
+ # Register the looked-up values as optparse defaults; options given on the command line still take precedence.
+ parser.set_defaults(rescale=rescale, maxRPKM=maxRPKM, logfilename=logfilename)
+
+ return parser
+
+
def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
maxRPKM=100000000, rescale=False, logfilename="buildMatrix.log"):
if not argv:
argv = sys.argv
- print "version 2.0"
+ print "buildrmaskdb: version 2.1"
if len(argv) < 3:
print "usage: python %s rmaskdir rmaskdbfile" % argv[0]
exit(1)
if not argv:
argv = sys.argv
- print "version 2.0"
+ print "buildsnpdb: version 2.1"
if len(argv) < 3:
print "usage: python %s snpfile snpdbname" % argv[0]
sys.exit(1)
except:
pass
-import sqlite3 as sqlite
-import sys, string, optparse
+import sys
+import string
+import optparse
import os.path
-from commoncode import writeLog
+import sqlite3 as sqlite
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption
-versionString = "%prog: version 3.5"
+versionString = "checkrmask: version 3.6"
print versionString
usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--startField", type="int", dest="startField")
- parser.add_option("--log", dest="logfilename")
- parser.set_defaults(cachePages=500000, startField=0, logfilename=None)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
+def makeParser(usage=""):  # Build the checkrmask optparse parser and seed its defaults through the commoncode config helpers.
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--startField", type="int", dest="startField")
+ parser.add_option("--log", dest="logfilename")
+ # Look every default up under `section`; the last argument of each helper is presumably the fallback used when the entry is absent (helpers live in commoncode, not shown here).
+ configParser = getConfigParser()
+ section = "checkrmask"
+ cachePages = getConfigIntOption(configParser, section, "cachePages", 500000)
+ startField = getConfigIntOption(configParser, section, "startField", 0)
+ logfilename = getConfigOption(configParser, section, "logfilename", None)
+ # Register the looked-up values as optparse defaults; options given on the command line still take precedence.
+ parser.set_defaults(cachePages=cachePages, startField=startField, logfilename=logfilename)
+
+ return parser
+
+
def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
outfile = open(outFileName, "w")
except:
pass
-import sqlite3 as sqlite
import sys
-import tempfile, shutil, os, optparse
-from os import environ
+import tempfile
+import shutil
+import os
+import optparse
+import sqlite3 as sqlite
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
-if environ.get("CISTEMATIC_TEMP"):
- cisTemp = environ.get("CISTEMATIC_TEMP")
-else:
- cisTemp = "/tmp"
+configParser = getConfigParser()
+cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
tempfile.tempdir = cisTemp
-print "version 3.3: %prog"
+print "chkSNPrmask: version 3.4"
def main(argv=None):
usage = "usage: python %s dbfile snpsfile nr_snps_outfile [--cache numPages] [--repeats]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--repeats", action="store_true", dest="repeats")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.set_defaults(repeats=False, cachePages=None)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
chkSNPrmask(dbfile, filename, outfile, options.repeats, options.cachePages)
+def makeParser(usage=""):  # Build the chkSNPrmask optparse parser and seed its defaults through the commoncode config helpers.
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--repeats", action="store_true", dest="repeats")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ # Look every default up under `section`; the last argument of each helper is presumably the fallback used when the entry is absent (helpers live in commoncode, not shown here).
+ configParser = getConfigParser()
+ section = "checkSNPrmask"  # NOTE(review): spelled "check..." while the script reports itself as "chkSNPrmask"; confirm it matches the config file
+ repeats = getConfigBoolOption(configParser, section, "repeats", False)
+ cachePages = getConfigOption(configParser, section, "cachePages", None)  # read with the plain helper even though --cache is declared type="int"
+ # Register the looked-up values as optparse defaults; options given on the command line still take precedence.
+ parser.set_defaults(repeats=repeats, cachePages=cachePages)
+
+ return parser
+
+
def chkSNPrmask(dbfile, filename, outfile, repeats=False, cachePages=None):
print dbfile
import os
import string
import sqlite3 as sqlite
+from commoncode import getConfigParser, getConfigOption
-print "version 3.6: %s" % sys.argv[0]
+print "chksnp: version 3.7"
def main(argv=None):
def annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=None):
- if os.environ.get("CISTEMATIC_TEMP"):
- cisTemp = os.environ.get("CISTEMATIC_TEMP")
- else:
- cisTemp = "/tmp"
+ configParser = getConfigParser()
+ cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
tempfile.tempdir = cisTemp
for dbFileName in dbList:
if not argv:
argv = sys.argv
- print "version 1.2"
+ print "colsum: version 1.3"
if len(argv) < 3:
print "usage: python %s field filename" % argv[0]
print "\n\tfields are counted starting at zero.\n"
# ENRAGE
#
-print 'version 1.0'
+print "combineRPKMs: version 1.1"
try:
import psyco
psyco.full()
except:
pass
-import sys, optparse
+import sys
+import optparse
+import string
+from commoncode import getConfigParser, getConfigBoolOption
def main(argv=None):
argv = sys.argv
usage = "usage: python %prog firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
- parser.set_defaults(doFraction=False)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
combineRPKMs(firstfile, expandedfile, finalfile, outfile, options.doFraction)
+def makeParser(usage=""):
+    """ build the combineRPKMs command line parser.  The default for
+        --withmultifraction (dest doFraction) is looked up in the
+        "combineRPKMs" section of the erange configuration and falls
+        back to False when no entry is present.
+    """
+    fractionDefault = getConfigBoolOption(getConfigParser(), "combineRPKMs", "doFraction", False)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
+    parser.set_defaults(doFraction=fractionDefault)
+
+    return parser
+
+
def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
- firstfile = open(firstfileName)
- expandedfile = open(expandedfileName)
- finalfile = open(finalfileName)
- outfile = open(outfileName, "w")
firstDict = {}
- gidDict = {}
- expandedDict = {}
-
+ firstfile = open(firstfileName)
for line in firstfile:
fields = line.strip().split()
firstDict[fields[1]] = fields[-1]
firstfile.close()
+ expandedDict = {}
+ gidDict = {}
+ expandedfile = open(expandedfileName)
for line in expandedfile:
fields = line.strip().split()
expandedDict[fields[1]] = fields[-1]
else:
header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\n"
+ outfile = open(outfileName, "w")
outfile.write(header)
+ finalfile = open(finalfileName)
for line in finalfile:
fields = line.strip().split()
gene = fields[0]
rnakb = fields[1]
finalRPKM = fields[2]
firstRPKM = firstDict.get(gene, "")
- outline = "%s\t%s\t%s\t%s\t%s\t%s" % (gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM)
+ outputFields = [gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM]
if doFraction:
fraction = fields[3]
- outline += "\t%s" % fraction
-
- outfile.write(outline + '\n')
+ outputFields.append(fraction)
+
+ outline = "%s\n" % string.join(outputFields, "\t")
+ outfile.write(outline)
finalfile.close()
outfile.close()
pass
import sys
-from commoncode import readDataset
+import ReadDataset
-print '%s: version 1.1' % sys.argv[0]
+print "combinerds: version 1.2"
def main(argv=None):
print "destination RDS: %s" % datafile
if '--initrna' in argv:
- rds = readDataset(datafile, initialize=True, datasetType='RNA')
+ rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA')
elif '--init' in argv:
- rds = readDataset(datafile, initialize=True)
+ rds = ReadDataset.ReadDataset(datafile, initialize=True)
withFlag = ''
if '--flag' in argv:
withFlag = argv[sys.argv.index('-flag') + 1]
print "restrict to flag = %s" % withFlag
- rds = readDataset(datafile, verbose=True, cache=doCache)
+ rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache)
if cachePages > rds.getDefaultCacheSize():
rds.setDBcache(cachePages)
# ENRAGE
#
-import tempfile
-import shutil
+import ConfigParser
import os
-from os import environ
import string
-import sqlite3 as sqlite
from time import strftime
from array import array
from collections import defaultdict
+import Peak
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+import Region
-commoncodeVersion = 5.5
-currentRDSversion = 1.1
-
-if environ.get("CISTEMATIC_TEMP"):
- cisTemp = environ.get("CISTEMATIC_TEMP")
-else:
- cisTemp = "/tmp"
-
-tempfile.tempdir = cisTemp
+commoncodeVersion = 5.6
+currentRDSversion = 2.0
def getReverseComplement(base):
logfile.close()
+def getGeneInfoDict(genome, cache=False):
+    """ return the result of geneinfoDB.getallGeneInfo() for genome.
+        For "dmelanogaster" the lookup is made with infoKey="locus";
+        every other genome uses the default key.  cache is passed
+        through to the geneinfoDB constructor.
+    """
+    infoDB = geneinfoDB(cache=cache)
+    if genome != "dmelanogaster":
+        return infoDB.getallGeneInfo(genome)
+
+    return infoDB.getallGeneInfo(genome, infoKey="locus")
+
+
+def getGeneAnnotDict(genome, inRAM=False):
+    """ return the annotation dictionary for genome without any
+        extension; delegates to getExtendedGeneAnnotDict() with an
+        empty extendGenome.
+    """
+    noExtension = ""
+    return getExtendedGeneAnnotDict(genome, noExtension, inRAM=inRAM)
+
+
+def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
+    """ return Genome.allAnnotInfo() for genome.  When extendGenome is
+        not the empty string the genome features are first extended
+        with it via extendFeatures(), with replaceModels passed as the
+        replace argument.  inRAM is passed to the Genome constructor.
+    """
+    genomeObject = Genome(genome, inRAM=inRAM)
+    if extendGenome != "":
+        genomeObject.extendFeatures(extendGenome, replace=replaceModels)
+
+    return genomeObject.allAnnotInfo()
+
+
+def getConfigParser(fileList=None):
+    """ return a SafeConfigParser loaded from "erange.config" in the
+        current directory, "~/.erange.config", and then any additional
+        files named in fileList, in that order.  All of the names are
+        handed to a single ConfigParser.read() call.
+
+        fileList defaults to None rather than a shared mutable list;
+        passing nothing, None or an empty list reads only the two
+        standard locations.
+    """
+    configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
+    if fileList:
+        configFiles.extend(fileList)
+
+    config = ConfigParser.SafeConfigParser()
+    config.read(configFiles)
+
+    return config
+
+
+def getConfigOption(parser, section, option, default=None):
+    """ return parser.get(section, option), or default when either the
+        section or the option does not exist.
+    """
+    try:
+        return parser.get(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
+        return default
+
+
+def getConfigIntOption(parser, section, option, default=None):
+    """ return parser.getint(section, option), or default when either
+        the section or the option does not exist.  Only the missing
+        section/option errors are handled here; anything else raised
+        by getint() propagates to the caller.
+    """
+    try:
+        return parser.getint(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
+        return default
+
+
+def getConfigFloatOption(parser, section, option, default=None):
+    """ return parser.getfloat(section, option), or default when either
+        the section or the option does not exist.  Only the missing
+        section/option errors are handled here; anything else raised
+        by getfloat() propagates to the caller.
+    """
+    try:
+        return parser.getfloat(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
+        return default
+
+
+def getConfigBoolOption(parser, section, option, default=None):
+    """ return parser.getboolean(section, option), or default when the
+        section or option does not exist or when getboolean() raises
+        ValueError.
+    """
+    try:
+        return parser.getboolean(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
+        return default
+
+
+def getAllConfigSectionOptions(parser, section):
+    """ return parser.items(section), or an empty list when the section
+        does not exist.
+    """
+    try:
+        return parser.items(section)
+    except ConfigParser.NoSectionError:
+        return []
+
+
def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
- fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
+ fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
doMerge=True, keepPeak=False, returnTop=0):
- """ returns a list of merged overlapping regions;
+ """ returns a dictionary containing a list of merged overlapping regions by chromosome;
can optionally filter regions that have a scoreField fewer than minHits.
Can also optionally return the label of each region, as well as the
peak, if supplied (peakPos and peakHeight should be the last 2 fields).
def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
doMerge=True, keepPeak=False, returnTop=0):
- """ returns a list of merged overlapping regions;
+ """ returns a dictionary containing a list of merged overlapping regions by chromosome;
can optionally filter regions that have a scoreField fewer than minHits.
Can also optionally return the label of each region, as well as the
peak, if supplied (peakPos and peakHeight should be the last 2 fields).
if not fullChrom:
chrom = chrom[3:]
- length = abs(stop - start)
if keepPeak:
peakPos = int(fields[-2 - hasPvalue - hasShift])
peakHeight = float(fields[-1 - hasPvalue - hasShift])
if doMerge and len(regions[chrom]) > 0:
for index in range(len(regions[chrom])):
- if keepLabel and keepPeak:
- (rlabel, rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
- elif keepLabel:
- (rlabel, rstart, rstop, rlen) = regions[chrom][index]
- elif keepPeak:
- (rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
- else:
- (rstart, rstop, rlen) = regions[chrom][index]
-
+ region = regions[chrom][index]
+ rstart = region.start
+ rstop = region.stop
if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
if start < rstart:
rstart = start
if rstop < stop:
rstop = stop
- rlen = abs(rstop - rstart)
if keepPeak:
+ rpeakPos = region.peakPos
+ rpeakHeight = region.peakHeight
if peakHeight > rpeakHeight:
rpeakHeight = peakHeight
rpeakPos = peakPos
- if keepLabel and keepPeak:
- regions[chrom][index] = (label, rstart, rstop, rlen, rpeakPos, rpeakHeight)
- elif keepLabel:
- regions[chrom][index] = (label, rstart, rstop, rlen)
- elif keepPeak:
- regions[chrom][index] = (rstart, rstop, rlen, rpeakPos, rpeakHeight)
- else:
- regions[chrom][index] = (rstart, rstop, rlen)
+ regions[chrom][index].start = rstart
+ regions[chrom][index].stop = rstop
+ regions[chrom][index].length = abs(rstop - rstart)
+ if keepLabel:
+ regions[chrom][index].label = label
+
+ if keepPeak:
+ regions[chrom][index].peakPos = rpeakPos
+ regions[chrom][index].peakHeight = rpeakHeight
+
mergeCount += 1
merged = True
break
if not merged:
- if keepLabel and keepPeak:
- regions[chrom].append((label, start, stop, length, peakPos, peakHeight))
- elif keepLabel:
- regions[chrom].append((label, start, stop, length))
- elif keepPeak:
- regions[chrom].append((start, stop, length, peakPos, peakHeight))
- else:
- regions[chrom].append((start, stop, length))
+ region = Region.Region(start, stop)
+ if keepLabel:
+ region.label = label
+ if keepPeak:
+ region.peakPos = peakPos
+ region.peakHeight = peakHeight
+
+ regions[chrom].append(region)
count += 1
if verbose and (count % 100000 == 0):
regionCount = 0
for chrom in regions:
regionCount += len(regions[chrom])
- if keepLabel:
- regions[chrom].sort(cmp=lambda x,y:cmp(x[1], y[1]))
- else:
- regions[chrom].sort()
+ regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
if verbose:
print "merged %d times" % mergeCount
def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
- shift=0, returnShift=False, maxshift=75):
+ shift=0, maxshift=75):
""" find the peak in a list of reads (hitlist) in a region
of a given length and absolute start point. returns a
list of peaks, the number of hits, a triangular-smoothed
the peak, taken to be the first TopPos position.
"""
- seqArray = array("f", [0.] * length)
- smoothArray = array("f", [0.] * length)
- numHits = 0.
- numPlus = 0.
- regionArray = []
if shift == "auto":
shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
- # once we have the best shift, compute seqArray
- for read in hitList:
- currentpos = read[0] - start
- if read[1] == "+":
- currentpos += shift
- else:
- currentpos -= shift
-
- if (currentpos < 1 - readlen) or (currentpos >= length):
- continue
-
- hitIndex = 0
- if doWeight:
- weight = read[2]
- else:
- weight = 1.0
-
- numHits += weight
- if leftPlus:
- regionArray.append(read)
-
- while currentpos < 0:
- hitIndex += 1
- currentpos += 1
-
- while hitIndex < readlen and currentpos < length:
- seqArray[currentpos] += weight
- hitIndex += 1
- currentpos += 1
-
- if read[1] == "+":
- numPlus += weight
+ seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
# implementing a triangular smooth
+ smoothArray = array("f", [0.] * length)
for pos in range(2,length -2):
smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
- topNucleotide = 0
- topPos = []
- for currentpos in xrange(length):
- if topNucleotide < smoothArray[currentpos]:
- topNucleotide = smoothArray[currentpos]
- topPos = [currentpos]
- elif topNucleotide == smoothArray[currentpos]:
- topPos.append(currentpos)
+ topPos = getPeakPositionList(smoothArray, length)
+ peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
if leftPlus:
numLeftPlus = 0
maxPos = topPos[0]
for read in regionArray:
if doWeight:
- weight = read[2]
+ weight = read["weight"]
else:
weight = 1.0
- currentPos = read[0] - start
- if currentPos <= maxPos and read[1] == "+":
+ currentPos = read["start"] - start
+ if currentPos <= maxPos and read["sense"] == "+":
numLeftPlus += weight
- if returnShift:
- return (topPos, numHits, smoothArray, numPlus, numLeftPlus, shift)
- else:
- return (topPos, numHits, smoothArray, numPlus, numLeftPlus)
- else:
- if returnShift:
- return (topPos, numHits, smoothArray, numPlus, shift)
- else:
- return (topPos, numHits, smoothArray, numPlus)
+ peak.numLeftPlus = numLeftPlus
+
+ return peak
def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
for testShift in xrange(maxShift + 1):
shiftArray = array("f", [0.] * length)
for read in hitList:
- currentpos = read[0] - start
- if read[1] == "+":
+ currentpos = read["start"] - start
+ if read["sense"] == "+":
currentpos += testShift
else:
currentpos -= testShift
continue
if doWeight:
- weight = read[2]
+ weight = read["weight"]
else:
weight = 1.0
- if read[1] == "+":
+ if read["sense"] == "+":
shiftArray[currentpos] += weight
else:
shiftArray[currentpos] -= weight
return bestShift
+def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
+    """ build the per-position coverage array for a region of the given
+        length beginning at absolute position start.
+
+        Each read in hitList is indexed by "start" and "sense" (and by
+        "weight" when doWeight is set; otherwise every read counts as
+        1.0).  "+" reads are moved right by shift and all other reads
+        left by shift before being placed.  Reads that would not
+        overlap the region are ignored; reads that overlap an edge
+        contribute only their in-region positions.
+
+        returns (seqArray, regionArray, numHits, numPlus) where
+        regionArray holds the counted reads only if leftPlus is set,
+        numHits is the summed weight of counted reads and numPlus the
+        summed weight of the counted "+" reads.
+    """
+    seqArray = array("f", [0.] * length)
+    regionArray = []
+    numHits = 0.
+    numPlus = 0.
+    for read in hitList:
+        isPlus = read["sense"] == "+"
+        position = read["start"] - start
+        if isPlus:
+            position += shift
+        else:
+            position -= shift
+
+        # skip reads lying entirely before or after the region
+        if position < 1 - readlen or position >= length:
+            continue
+
+        weight = 1.0
+        if doWeight:
+            weight = read["weight"]
+
+        numHits += weight
+        if leftPlus:
+            regionArray.append(read)
+
+        # clip the read to the region: bases left of position 0 and at
+        # or beyond length are dropped
+        firstIndex = max(position, 0)
+        lastIndex = min(position + readlen, length)
+        for index in xrange(firstIndex, lastIndex):
+            seqArray[index] += weight
+
+        if isPlus:
+            numPlus += weight
+
+    return seqArray, regionArray, numHits, numPlus
+
+
+def getPeakPositionList(smoothArray, length):
+    """ return the list of positions in the first length entries of
+        smoothArray that hold the largest value seen.  The running
+        maximum starts at 0, so positions holding exactly 0 are
+        collected until a larger value is found, and every position
+        tied with the final maximum is reported in ascending order.
+    """
+    bestValue = 0
+    positions = []
+    for position in xrange(length):
+        value = smoothArray[position]
+        if value > bestValue:
+            # new maximum: discard the positions gathered so far
+            bestValue = value
+            positions = [position]
+        elif value == bestValue:
+            positions.append(position)
+
+    return positions
+
+
def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
restrictList=[], regionComplement=False, maxStop=250000000):
""" return a dictionary of cistematic gene features. Requires
if len(additionalRegionsDict) > 0:
sortList = []
for chrom in additionalRegionsDict:
- for (label, start, stop, length) in additionalRegionsDict[chrom]:
+ for region in additionalRegionsDict[chrom]:
+ label = region.label
if label not in sortList:
sortList.append(label)
else:
sense = featuresDict[label][0][-1]
- featuresDict[label].append(("custom", chrom, start, stop, sense))
+ featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
for gid in sortList:
featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
if len(additionalRegionsDict) > 0:
sortList = []
for chrom in additionalRegionsDict:
- for (label, start, stop, length) in additionalRegionsDict[chrom]:
+ for region in additionalRegionsDict[chrom]:
+ label = region.label
if label not in sortList:
sortList.append(label)
else:
sense = featuresDict[label][0][-1]
- featuresDict[label].append(("custom", chrom, start, stop, sense))
+ featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
for gid in sortList:
featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
print "%s\n" % chrom
startRegion = 0
- for (tagStart, sense, weight) in hitDict[chrom]:
+ for read in hitDict[chrom]:
+ tagStart = read["start"]
+ weight = read["weight"]
index += 1
if index % 100000 == 0:
print "read %d " % index,
stopPoint = stop
return (regionsBins, regionsLen)
-
-
-# TODO: The readDataset class is going to be replaced by Erange.ReadDataset but this will
-# require going through all the code to make the changes needed. Major project for another
-# day, but it really needs to be done
-class readDataset:
- """ Class for storing reads from experiments. Assumes that custom scripts
- will translate incoming data into a format that can be inserted into the
- class using the insert* methods. Default class subtype ('DNA') includes
- tables for unique and multireads, whereas 'RNA' subtype also includes a
- splices table.
- """
-
- def __init__(self, datafile, initialize=False, datasetType='', verbose=False,
- cache=False, reportCount=True):
- """ creates an rds datafile if initialize is set to true, otherwise
- will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
- """
- self.dbcon = ""
- self.memcon = ""
- self.dataType = ""
- self.rdsVersion = "1.1"
- self.memBacked = False
- self.memChrom = ""
- self.memCursor = ""
- self.cachedDBFile = ""
-
- if cache:
- if verbose:
- print "caching ...."
-
- self.cacheDB(datafile)
- dbfile = self.cachedDBFile
- else:
- dbfile = datafile
-
- self.dbcon = sqlite.connect(dbfile)
- self.dbcon.row_factory = sqlite.Row
- self.dbcon.execute("PRAGMA temp_store = MEMORY")
- if initialize:
- if datasetType == "":
- self.dataType = "DNA"
- else:
- self.dataType = datasetType
-
- self.initializeTables(self.dbcon)
- else:
- metadata = self.getMetadata("dataType")
- self.dataType = metadata["dataType"]
-
- try:
- metadata = self.getMetadata("rdsVersion")
- self.rdsVersion = metadata["rdsVersion"]
- except:
- try:
- self.insertMetadata([("rdsVersion", currentRDSversion)])
- except:
- print "could not add rdsVersion - read-only ?"
- self.rdsVersion = "pre-1.0"
-
- if verbose:
- if initialize:
- print "INITIALIZED dataset %s" % datafile
- else:
- print "dataset %s" % datafile
-
- metadata = self.getMetadata()
- print "metadata:"
- pnameList = metadata.keys()
- pnameList.sort()
- for pname in pnameList:
- print "\t" + pname + "\t" + metadata[pname]
-
- if reportCount:
- ucount = self.getUniqsCount()
- mcount = self.getMultiCount()
- if self.dataType == "DNA" and not initialize:
- try:
- print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
- except:
- print "\n%s unique reads and %s multireads" % (ucount, mcount)
- elif self.dataType == 'RNA' and not initialize:
- scount = self.getSplicesCount()
- try:
- print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
- except:
- print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
-
- print "default cache size is %d pages" % self.getDefaultCacheSize()
- if self.hasIndex():
- print "found index"
- else:
- print "not indexed"
-
-
- def __len__(self):
- """ return the number of usable reads in the dataset.
- """
- try:
- total = self.getUniqsCount()
- except:
- total = 0
-
- try:
- total += self.getMultiCount()
- except:
- pass
-
- if self.dataType == "RNA":
- try:
- total += self.getSplicesCount()
- except:
- pass
-
- try:
- total = int(total)
- except:
- total = 0
-
- return total
-
-
- def __del__(self):
- """ cleanup copy in local cache, if present.
- """
- if self.cachedDBFile != "":
- self.uncacheDB()
-
-
- def cacheDB(self, filename):
- """ copy geneinfoDB to a local cache.
- """
- self.cachedDBFile = tempfile.mktemp() + ".db"
- shutil.copyfile(filename, self.cachedDBFile)
-
-
- def saveCacheDB(self, filename):
- """ copy geneinfoDB to a local cache.
- """
- shutil.copyfile(self.cachedDBFile, filename)
-
-
- def uncacheDB(self):
- """ delete geneinfoDB from local cache.
- """
- global cachedDBFile
- if self.cachedDBFile != "":
- try:
- os.remove(self.cachedDBFile)
- except:
- print "could not delete %s" % self.cachedDBFile
-
- self.cachedDB = ""
-
-
- def attachDB(self, filename, asname):
- """ attach another database file to the readDataset.
- """
- stmt = "attach '%s' as %s" % (filename, asname)
- self.execute(stmt)
-
-
- def detachDB(self, asname):
- """ detach a database file to the readDataset.
- """
- stmt = "detach %s" % (asname)
- self.execute(stmt)
-
-
- def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
- """ import into current RDS the table (with columns destcolumns,
- with default all columns) from the database file asname,
- using the column specification of ascolumns (default all).
- """
- stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
- if flagged != "":
- stmt += " where flag = '%s' " % flagged
-
- self.execute(stmt, forceCommit=True)
-
-
- def getTables(self, asname=""):
- """ get a list of table names in a particular database file.
- """
- resultList = []
-
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- if asname != "":
- asname += "."
-
- stmt = "select name from %ssqlite_master where type='table'" % asname
- sql.execute(stmt)
- results = sql.fetchall()
-
- for row in results:
- resultList.append(row["name"])
-
- return resultList
-
-
- def hasIndex(self):
- """ check whether the RDS file has at least one index.
- """
- stmt = "select count(*) from sqlite_master where type='index'"
- count = int(self.execute(stmt, returnResults=True)[0][0])
- if count > 0:
- return True
-
- return False
-
-
- def initializeTables(self, acon, cache=100000):
- """ creates table schema in database connection acon, which is
- typically a database file or an in-memory database.
- """
- acon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
- acon.execute("create table metadata (name varchar, value varchar)")
- acon.execute("insert into metadata values('dataType','%s')" % self.dataType)
- acon.execute("create table uniqs (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
- acon.execute("create table multi (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
- if self.dataType == "RNA":
- acon.execute("create table splices (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, startL int, stopL int, startR int, stopR int, sense varchar, weight real, flag varchar, mismatch varchar)")
-
- acon.commit()
-
-
- def getFileCursor(self):
- """ returns a cursor to file database for low-level (SQL)
- access to the data.
- """
- return self.dbcon.cursor()
-
-
- def getMemCursor(self):
- """ returns a cursor to memory database for low-level (SQL)
- access to the data.
- """
- return self.memcon.cursor()
-
-
- def getMetadata(self, valueName=""):
- """ returns a dictionary of metadata.
- """
- whereClause = ""
- resultsDict = {}
-
- if valueName != "":
- whereClause = " where name = '%s' " % valueName
-
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- sql.execute("select name, value from metadata" + whereClause)
- results = sql.fetchall()
-
- for row in results:
- pname = row["name"]
- pvalue = row["value"]
- if pname not in resultsDict:
- resultsDict[pname] = pvalue
- else:
- trying = True
- index = 2
- while trying:
- newName = pname + ":" + str(index)
- if newName not in resultsDict:
- resultsDict[newName] = pvalue
- trying = False
-
- index += 1
-
- return resultsDict
-
-
- def getReadSize(self):
- """ returns readsize if defined in metadata.
- """
- metadata = self.getMetadata()
- if "readsize" not in metadata:
- print "no readsize parameter defined - returning 0"
- return 0
- else:
- mysize = metadata["readsize"]
- if "import" in mysize:
- mysize = mysize.split()[0]
-
- return int(mysize)
-
-
- def getDefaultCacheSize(self):
- """ returns the default cache size.
- """
- return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
-
-
- def getChromosomes(self, table="uniqs", fullChrom=True):
- """ returns a list of distinct chromosomes in table.
- """
- statement = "select distinct chrom from %s" % table
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- sql.execute(statement)
- results = []
- for row in sql:
- if fullChrom:
- if row["chrom"] not in results:
- results.append(row["chrom"])
- else:
- if len(row["chrom"][3:].strip()) < 1:
- continue
-
- if row["chrom"][3:] not in results:
- results.append(row["chrom"][3:])
-
- results.sort()
-
- return results
-
-
- def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
- doMulti=False, doSplices=False):
- """ returns the maximum coordinate for reads on a given chromosome.
- """
- maxCoord = 0
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- if doUniqs:
- try:
- sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
- maxCoord = int(sql.fetchall()[0][0])
- except:
- print "couldn't retrieve coordMax for chromosome %s" % chrom
-
- if doSplices:
- sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
- try:
- spliceMax = int(sql.fetchall()[0][0])
- if spliceMax > maxCoord:
- maxCoord = spliceMax
- except:
- pass
-
- if doMulti:
- sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
- try:
- multiMax = int(sql.fetchall()[0][0])
- if multiMax > maxCoord:
- maxCoord = multiMax
- except:
- pass
-
- if verbose:
- print "%s maxCoord: %d" % (chrom, maxCoord)
-
- return maxCoord
-
-
- def getReadsDict(self, verbose=False, bothEnds=False, noSense=False, fullChrom=False, chrom="",
- flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
- withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
- readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
- flagLike=False, strand="", entryDict=False, combine5p=False):
- """ returns a dictionary of reads in a variety of formats
- and which can be restricted by chromosome or custom-flag.
- Returns unique reads by default, but can return multireads
- with doMulti set to True.
- """
- whereClause = []
- resultsDict = {}
-
- if chrom != "" and chrom != self.memChrom:
- whereClause.append("chrom = '%s'" % chrom)
-
- if flag != "":
- if flagLike:
- flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
- whereClause.append(flagLikeClause)
- else:
- whereClause.append("flag = '%s'" % flag)
-
- if start > -1:
- whereClause.append("start > %d" % start)
-
- if stop > -1:
- whereClause.append("stop < %d" % stop)
-
- if len(readLike) > 0:
- readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
- whereClause.append(readIDClause)
-
- if hasMismatch:
- whereClause.append("mismatch != ''")
-
- if strand in ["+", "-"]:
- whereClause.append("sense = '%s'" % strand)
-
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
-
- groupBy = []
- if findallOptimize:
- selectClause = ["select start, sense, sum(weight)"]
- groupBy = ["GROUP BY start, sense"]
- else:
- selectClause = ["select ID, chrom, start, readID"]
- if bothEnds:
- selectClause.append("stop")
-
- if not noSense:
- selectClause.append("sense")
-
- if withWeight:
- selectClause.append("weight")
-
- if withFlag:
- selectClause.append("flag")
-
- if withMismatch:
- selectClause.append("mismatch")
-
- if limit > 0 and not combine5p:
- groupBy.append("LIMIT %d" % limit)
-
- selectQuery = string.join(selectClause, ",")
- groupQuery = string.join(groupBy)
- if doUniqs:
- stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
- if doMulti:
- stmt.append("UNION ALL")
- stmt.append(selectQuery)
- stmt.append("from multi")
- stmt.append(whereQuery)
- stmt.append(groupQuery)
- else:
- stmt = [selectQuery, "from multi", whereQuery]
-
- if combine5p:
- if findallOptimize:
- selectQuery = "select start, sense, weight, chrom"
-
- if doUniqs:
- subSelect = [selectQuery, "from uniqs", whereQuery]
- if doMulti:
- subSelect.append("union all")
- subSelect.append(selectQuery)
- subSelect.append("from multi")
- subSelect.append(whereQuery)
- else:
- subSelect = [selectQuery, "from multi", whereQuery]
-
- sqlStmt = string.join(subSelect)
- if findallOptimize:
- selectQuery = "select start, sense, sum(weight)"
-
- stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
- selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
-
- if findallOptimize:
- if self.memBacked:
- self.memcon.row_factory = None
- sql = self.memcon.cursor()
- else:
- self.dbcon.row_factory = None
- sql = self.dbcon.cursor()
-
- stmt.append("order by start")
- elif readIDDict:
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- stmt.append("order by readID, start")
- else:
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- stmt.append("order by chrom, start")
-
- sqlQuery = string.join(stmt)
- sql.execute(sqlQuery)
-
- if findallOptimize:
- resultsDict[chrom] = [[int(row[0]), row[1], float(row[2])] for row in sql]
- if self.memBacked:
- self.memcon.row_factory = sqlite.Row
- else:
- self.dbcon.row_factory = sqlite.Row
- else:
- currentChrom = ""
- currentReadID = ""
- pairID = 0
- for row in sql:
- readID = row["readID"]
- if fullChrom:
- chrom = row["chrom"]
- else:
- chrom = row["chrom"][3:]
-
- if not readIDDict and chrom != currentChrom:
- resultsDict[chrom] = []
- currentChrom = chrom
- dictKey = chrom
- elif readIDDict:
- theReadID = readID
- if "::" in readID:
- (theReadID, multiplicity) = readID.split("::")
-
- if "/" in theReadID and withPairID:
- (theReadID, pairID) = readID.split("/")
-
- if theReadID != currentReadID:
- resultsDict[theReadID] = []
- currentReadID = theReadID
- dictKey = theReadID
-
- if entryDict:
- newrow = {"start": int(row["start"])}
- if bothEnds:
- newrow["stop"] = int(row["stop"])
-
- if not noSense:
- newrow["sense"] = row["sense"]
-
- if withWeight:
- newrow["weight"] = float(row["weight"])
-
- if withFlag:
- newrow["flag"] = row["flag"]
-
- if withMismatch:
- newrow["mismatch"] = row["mismatch"]
-
- if withID:
- newrow["readID"] = readID
-
- if withChrom:
- newrow["chrom"] = chrom
-
- if withPairID:
- newrow["pairID"] = pairID
- else:
- newrow = [int(row["start"])]
- if bothEnds:
- newrow.append(int(row["stop"]))
-
- if not noSense:
- newrow.append(row["sense"])
-
- if withWeight:
- newrow.append(float(row["weight"]))
-
- if withFlag:
- newrow.append(row["flag"])
-
- if withMismatch:
- newrow.append(row["mismatch"])
-
- if withID:
- newrow.append(readID)
-
- if withChrom:
- newrow.append(chrom)
-
- if withPairID:
- newrow.append(pairID)
-
- resultsDict[dictKey].append(newrow)
-
- return resultsDict
-
-
- def getSplicesDict(self, verbose=False, noSense=False, fullChrom=False, chrom="",
- flag="", withWeight=False, withFlag=False, withMismatch=False,
- withID=False, withChrom=False, withPairID=False, readIDDict=False,
- splitRead=False, hasMismatch=False, flagLike=False, start=-1,
- stop=-1, strand="", entryDict=False):
- """ returns a dictionary of spliced reads in a variety of
- formats and which can be restricted by chromosome or custom-flag.
- Returns unique spliced reads for now.
- """
- whereClause = []
- resultsDict = {}
-
- if chrom != "" and chrom != self.memChrom:
- whereClause = ["chrom = '%s'" % chrom]
-
- if flag != "":
- if flagLike:
- flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
- whereClause.append(flagLikeClause)
- else:
- whereClause.append("flag = '%s'" % flag)
-
- if hasMismatch:
- whereClause.append("mismatch != ''")
-
- if strand != "":
- whereClause.append("sense = '%s'" % strand)
-
- if start > -1:
- whereClause.append("startL > %d" % start)
-
- if stop > -1:
- whereClause.append("stopR < %d" % stop)
-
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
-
- selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
- if not noSense:
- selectClause.append("sense")
-
- if withWeight:
- selectClause.append("weight")
-
- if withFlag:
- selectClause.append("flag")
-
- if withMismatch:
- selectClause.append("mismatch")
-
- selectQuery = string.join(selectClause, " ,")
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- if chrom == "" and not readIDDict:
- stmt = "select distinct chrom from splices %s" % whereQuery
- sql.execute(stmt)
- for row in sql:
- if fullChrom:
- chrom = row["chrom"]
- else:
- chrom = row["chrom"][3:]
-
- resultsDict[chrom] = []
- elif chrom != "" and not readIDDict:
- resultsDict[chrom] = []
-
- stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
- sql.execute(stmt)
- currentReadID = ""
- for row in sql:
- pairID = 0
- readID = row["readID"]
- if fullChrom:
- chrom = row["chrom"]
- else:
- chrom = row["chrom"][3:]
-
- if readIDDict:
- if "/" in readID:
- (theReadID, pairID) = readID.split("/")
- else:
- theReadID = readID
-
- if theReadID != currentReadID:
- resultsDict[theReadID] = []
- currentReadID = theReadID
- dictKey = theReadID
- else:
- dictKey = chrom
-
- if entryDict:
- newrow = {"startL": int(row["startL"])}
- newrow["stopL"] = int(row["stopL"])
- newrow["startR"] = int(row["startR"])
- newrow["stopR"] = int(row["stopR"])
- if not noSense:
- newrow["sense"] = row["sense"]
-
- if withWeight:
- newrow["weight"] = float(row["weight"])
-
- if withFlag:
- newrow["flag"] = row["flag"]
-
- if withMismatch:
- newrow["mismatch"] = row["mismatch"]
-
- if withID:
- newrow["readID"] = readID
-
- if withChrom:
- newrow["chrom"] = chrom
-
- if withPairID:
- newrow["pairID"] = pairID
-
- if splitRead:
- leftDict = newrow
- del leftDict["startR"]
- del leftDict["stopR"]
- rightDict = newrow
- del rightDict["start"]
- del rightDict["stopL"]
- resultsDict[dictKey].append(leftDict)
- resultsDict[dictKey].append(rightDict)
- else:
- resultsDict[dictKey].append(newrow)
- else:
- newrow = [int(row["startL"])]
- newrow.append(int(row["stopL"]))
- newrow.append(int(row["startR"]))
- newrow.append(int(row["stopR"]))
- if not noSense:
- newrow.append(row["sense"])
-
- if withWeight:
- newrow.append(float(row["weight"]))
-
- if withFlag:
- newrow.append(row["flag"])
-
- if withMismatch:
- newrow.append(row["mismatch"])
-
- if withID:
- newrow.append(readID)
-
- if withChrom:
- newrow.append(chrom)
-
- if withPairID:
- newrow.append(pairID)
-
- if splitRead:
- resultsDict[dictKey].append(newrow[:2] + newrow[4:])
- resultsDict[dictKey].append(newrow[2:])
- else:
- resultsDict[dictKey].append(newrow)
-
- return resultsDict
-
-
- def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
- splices=False, reportCombined=True, sense="both"):
- """ return read counts for a given region.
- """
- ucount = 0
- mcount = 0
- scount = 0
- restrict = ""
- if sense in ["+", "-"]:
- restrict = " sense ='%s' " % sense
-
- if uniqs:
- try:
- ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
- except:
- ucount = 0
-
- if multi:
- try:
- mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
- except:
- mcount = 0
-
- if splices:
- try:
- scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
- except:
- scount = 0
-
- if reportCombined:
- total = ucount + mcount + scount
- return total
- else:
- return (ucount, mcount, scount)
-
-
- def getTotalCounts(self, chrom="", rmin="", rmax=""):
- return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
-
-
- def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
- """ returns the number of row in the uniqs table.
- """
- whereClause = []
- count = 0
-
- if chrom !="" and chrom != self.memChrom:
- whereClause = ["chrom='%s'" % chrom]
-
- if rmin != "":
- whereClause.append("%s >= %s" % (startField, str(rmin)))
-
- if rmax != "":
- whereClause.append("%s <= %s" % (startField, str(rmax)))
-
- if restrict != "":
- whereClause.append(restrict)
-
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
-
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- if distinct:
- sql.execute("select count(distinct chrom+start+sense) from %s %s" % (table, whereQuery))
- else:
- sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
-
- result = sql.fetchone()
-
- try:
- count = int(result[0])
- except:
- count = 0
-
- return count
-
-
- def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the number of row in the splices table.
- """
- return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
-
-
- def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the number of distinct readIDs in the uniqs table.
- """
- return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
-
-
- def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the total weight of readIDs in the multi table.
- """
- return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
-
-
- def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
- """ get readID's.
- """
- stmt = []
- limitPart = ""
- if limit > 0:
- limitPart = "LIMIT %d" % limit
-
- if uniqs:
- stmt.append("select readID from uniqs")
-
- if multi:
- stmt.append("select readID from multi")
-
- if splices:
- stmt.append("select readID from splices")
-
- if len(stmt) > 0:
- selectPart = string.join(stmt, " union ")
- else:
- selectPart = ""
-
- sqlQuery = "%s group by readID %s" (selectPart, limitPart)
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- sql.execute(sqlQuery)
- result = sql.fetchall()
-
- if paired:
- return [x.split("/")[0][0] for x in result]
- else:
- return [x[0] for x in result]
-
-
- def getMismatches(self, mischrom = None, verbose=False, useSplices=True):
- """ returns the uniq and spliced mismatches in a dictionary.
- """
- revcomp = {"A": "T",
- "T": "A",
- "G": "C",
- "C": "G",
- "N": "N"
- }
-
- readlen = self.getReadSize()
- if mischrom:
- hitChromList = [mischrom]
- else:
- hitChromList = self.getChromosomes()
- hitChromList.sort()
-
- snpDict = {}
- for achrom in hitChromList:
- if verbose:
- print "getting mismatches from chromosome %s" % (achrom)
-
- snpDict[achrom] = []
- hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, findallOptimize=False, hasMismatch=True)
- if useSplices and self.dataType == "RNA":
- spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
- spliceIDList = spliceDict.keys()
- for k in spliceIDList:
- (startpos, lefthalf, rightstart, endspos, sense, mismatches) = spliceDict[k][0]
- spMismatchList = mismatches.split(",")
- for mismatch in spMismatchList:
- if "N" in mismatch:
- continue
-
- change_len = len(mismatch)
- if sense == "+":
- change_from = mismatch[0]
- change_base = mismatch[change_len-1]
- change_pos = int(mismatch[1:change_len-1])
- elif sense == "-":
- change_from = revcomp[mismatch[0]]
- change_base = revcomp[mismatch[change_len-1]]
- change_pos = readlen - int(mismatch[1:change_len-1]) + 1
-
- firsthalf = int(lefthalf)-int(startpos)+1
- secondhalf = 0
- if int(change_pos) <= int(firsthalf):
- change_at = startpos + change_pos - 1
- else:
- secondhalf = change_pos - firsthalf
- change_at = rightstart + secondhalf
-
- snpDict[achrom].append([startpos, change_at, change_base, change_from])
-
- if achrom not in hitDict:
- continue
-
- for (start, sense, mismatches) in hitDict[achrom]:
- mismatchList = mismatches.split(",")
- for mismatch in mismatchList:
- if "N" in mismatch:
- continue
-
- change_len = len(mismatch)
- if sense == "+":
- change_from = mismatch[0]
- change_base = mismatch[change_len-1]
- change_pos = int(mismatch[1:change_len-1])
- elif sense == "-":
- change_from = revcomp[mismatch[0]]
- change_base = revcomp[mismatch[change_len-1]]
- change_pos = readlen - int(mismatch[1:change_len-1]) + 1
-
- change_at = start + change_pos - 1
- snpDict[achrom].append([start, change_at, change_base, change_from])
-
- return snpDict
-
-
- def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
- useSplices=False, normalizationFactor = 1.0, trackStrand=False,
- keepStrand="both", shiftValue=0):
- """return a profile of the chromosome as an array of per-base read coverage....
- keepStrand = 'both', 'plusOnly', or 'minusOnly'.
- Will also shift position of unique and multireads (but not splices) if shift is a natural number
- """
- metadata = self.getMetadata()
- readlen = int(metadata["readsize"])
- dataType = metadata["dataType"]
- scale = 1. / normalizationFactor
- shift = {}
- shift["+"] = int(shiftValue)
- shift["-"] = -1 * int(shiftValue)
-
- if cstop > 0:
- lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
- else:
- lastNT = cstop - cstart + readlen + shift["+"]
-
- chromModel = array("f", [0.] * lastNT)
- hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
- if cstart < 0:
- cstart = 0
-
- for (hstart, sense, weight) in hitDict[chromosome]:
- hstart = hstart - cstart + shift[sense]
- for currentpos in range(hstart,hstart+readlen):
- try:
- if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
- chromModel[currentpos] += scale * weight
- elif sense == '-' and keepStrand != "plusOnly":
- chromModel[currentpos] -= scale * weight
- except:
- continue
-
- del hitDict
- if useSplices and dataType == "RNA":
- if cstop > 0:
- spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
- else:
- spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
-
- if chromosome in spliceDict:
- for (Lstart, Lstop, Rstart, Rstop, rsense, readName) in spliceDict[chromosome]:
- if (Rstop - cstart) < lastNT:
- for index in range(abs(Lstop - Lstart)):
- currentpos = Lstart - cstart + index
- # we only track unique splices
- if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
- chromModel[currentpos] += scale
- elif rsense == "-" and keepStrand != "plusOnly":
- chromModel[currentpos] -= scale
-
- for index in range(abs(Rstop - Rstart)):
- currentpos = Rstart - cstart + index
- # we only track unique splices
- if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
- chromModel[currentpos] += scale
- elif rsense == "-" and keepStrand != "plusOnly":
- chromModel[currentpos] -= scale
-
- del spliceDict
-
- return chromModel
-
-
- def insertMetadata(self, valuesList):
- """ inserts a list of (pname, pvalue) into the metadata
- table.
- """
- self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
- self.dbcon.commit()
-
-
- def updateMetadata(self, pname, newValue, originalValue=""):
- """ update a metadata field given the original value and the new value.
- """
- stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
- if originalValue != "":
- stmt += " and value='%s' " % str(originalValue)
-
- self.dbcon.execute(stmt)
- self.dbcon.commit()
-
-
- def insertUniqs(self, valuesList):
- """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
- into the uniqs table.
- """
- self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertMulti(self, valuesList):
- """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
- into the multi table.
- """
- self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertSplices(self, valuesList):
- """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
- into the splices table.
- """
- self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
- """ update reads on file database in a list region of regions for a chromosome to have a new flag.
- regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
- sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
- """
- restrict = ""
- if sense != "both":
- restrict = " and sense = ? "
-
- if uniqs:
- self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
-
- if multi:
- self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
-
- if self.dataType == "RNA" and splices:
- self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
- self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
-
- self.dbcon.commit()
-
-
- def setFlags(self, flag, uniqs=True, multi=True, splices=True):
- """ set the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
- """
- if uniqs:
- self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
-
- if multi:
- self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
-
- if self.dataType == 'RNA' and splices:
- self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
-
- self.dbcon.commit()
-
-
- def resetFlags(self, uniqs=True, multi=True, splices=True):
- """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
- """
- if uniqs:
- self.dbcon.execute("UPDATE uniqs SET flag = ''")
-
- if multi:
- self.dbcon.execute("UPDATE multi SET flag = ''")
-
- if self.dataType == "RNA" and splices:
- self.dbcon.execute("UPDATE splices SET flag = ''")
-
- self.dbcon.commit()
-
-
- def reweighMultireads(self, readList):
- self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
-
-
- def setSynchronousPragma(self, value="ON"):
- try:
- self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
- except:
- print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
-
-
- def setDBcache(self, cache, default=False):
- self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
- if default:
- self.dbcon.execute('PRAGMA DEFAULT_CACHE_SIZE = %d' % cache)
-
-
- def execute(self, statement, returnResults=False, forceCommit=False):
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
- sql.execute(statement)
- if returnResults:
- result = sql.fetchall()
- return result
-
- if forceCommit:
- if self.memBacked:
- self.memcon.commit()
- else:
- self.dbcon.commit()
-
-
- def buildIndex(self, cache=100000):
- """ Builds the file indeces for the main tables.
- Cache is the number of 1.5 kb pages to keep in memory.
- 100000 pages translates into 150MB of RAM, which is our default.
- """
- if cache > self.getDefaultCacheSize():
- self.setDBcache(cache)
- self.setSynchronousPragma("OFF")
- self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
- print "built uPosIndex"
- self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
- print "built uChromIndex"
- self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
- print "built mPosIndex"
- self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
- print "built mChromIndex"
-
- if self.dataType == "RNA":
- self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
- print "built sPosIndex"
- self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
- print "built sPosIndex2"
- self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
- print "built sChromIndex"
-
- self.dbcon.commit()
- self.setSynchronousPragma("ON")
-
-
- def dropIndex(self):
- """ drops the file indices for the main tables.
- """
- try:
- self.setSynchronousPragma("OFF")
- self.dbcon.execute("DROP INDEX uPosIndex")
- self.dbcon.execute("DROP INDEX uChromIndex")
- self.dbcon.execute("DROP INDEX mPosIndex")
- self.dbcon.execute("DROP INDEX mChromIndex")
-
- if self.dataType == "RNA":
- self.dbcon.execute("DROP INDEX sPosIndex")
- try:
- self.dbcon.execute("DROP INDEX sPosIndex2")
- except:
- pass
-
- self.dbcon.execute("DROP INDEX sChromIndex")
-
- self.dbcon.commit()
- except:
- print "problem dropping index"
-
- self.setSynchronousPragma("ON")
-
-
- def memSync(self, chrom="", index=False):
- """ makes a copy of the dataset into memory for faster access.
- Can be restricted to a "full" chromosome. Can also build the
- memory indices.
- """
- self.memcon = ""
- self.memcon = sqlite.connect(":memory:")
- self.initializeTables(self.memcon)
- cursor = self.dbcon.cursor()
- whereclause = ""
- if chrom != "":
- print "memSync %s" % chrom
- whereclause = " where chrom = '%s' " % chrom
- self.memChrom = chrom
- else:
- self.memChrom = ""
-
- self.memcon.execute("PRAGMA temp_store = MEMORY")
- self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
- # copy metadata to memory
- self.memcon.execute("delete from metadata")
- results = cursor.execute("select name, value from metadata")
- results2 = []
- for row in results:
- results2.append((row["name"], row["value"]))
-
- self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
- # copy uniqs to memory
- results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from uniqs" + whereclause)
- results2 = []
- for row in results:
- results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
-
- self.memcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
- # copy multi to memory
- results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from multi" + whereclause)
- results2 = []
- for row in results:
- results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
-
- self.memcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
- # copy splices to memory
- if self.dataType == "RNA":
- results = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices" + whereclause)
- results2 = []
- for row in results:
- results2.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
-
- self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, weight, sense, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", results2)
- if index:
- if chrom != "":
- self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
- self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
- if self.dataType == "RNA":
- self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
- self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
- else:
- self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
- self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
- if self.dataType == "RNA":
- self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
- self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
-
- self.memBacked = True
- self.memcon.row_factory = sqlite.Row
- self.memcon.commit()
if not argv:
argv = sys.argv
- print "version 1.1"
+ print "crossmatch: version 1.2"
if len(argv) < 7:
print "usage: python %s prefix directory genome1 genefile1 genome2 genefile2 [genome3 genefile3 .....]" % argv[0]
sys.exit(1)
except:
pass
-from commoncode import readDataset
-import sys, time, optparse
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
def main(argv=None):
if not argv:
argv = sys.argv
- print "%prog: version 3.3"
+ print "distalPairs: version 3.4"
print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
+def makeParser(usage=""):
+    """ build the command line parser for distalPairs.
+        Registers the --sameChrom, --splices, --verbose, --maxDist and --cache
+        options, then sets each option's default from the "distalPairs" section
+        of the parser returned by getConfigParser().  The last argument of every
+        getConfig*Option call below is the fallback handed to that helper.
+        Returns the configured optparse.OptionParser.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "distalPairs"
+    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    maxDist = getConfigIntOption(configParser, section, "maxDist", 1000000000)
+    # --cache is declared type="int", so read its configured default with the
+    # integer helper as well; a default taken from the config file then has the
+    # same type as a value given on the command line.
+    # NOTE(review): assumes getConfigIntOption hands back the supplied default
+    # (None) when the option is absent, as it is used for maxDist - confirm
+    # against commoncode.
+    cachePages = getConfigIntOption(configParser, section, "cachePages", None)
+
+    parser.set_defaults(sameChromOnly=sameChromOnly, doSplices=doSplices, doVerbose=doVerbose, maxDist=maxDist, cachePages=cachePages)
+
+    return parser
+
+
def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
if cachePages is not None:
doCache = True
doCache = False
cachePages = -1
- RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
if not RDS.hasIndex():
print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
sys.exit(1)
readList = uniqDict[readID]
if len(readList) == 2:
total += 1
- (start1, sense1, chrom1, pair1) = readList[0]
- (start2, sense2, chrom2, pair2) = readList[1]
+ start1 = readList[0]["start"]
+ sense1 = readList[0]["sense"]
+ chrom1 = readList[0]["chrom"]
+ start2 = readList[1]["start"]
+ sense2 = readList[1]["sense"]
+ chrom2 = readList[1]["chrom"]
if chrom1 != chrom2:
diffChrom += 1
The makerdsfromeland2.py script is used to import the reads
into RDS:
-python makerdsfromeland2.py label infilename outrdsfile [-append] [-RNA ucscGeneModels]
-[propertyName::propertyValue] [-index] [-paired 1 or 2] [-extended] [-verbose]
-[-olddelimiter] [-maxlines num] [-cache numPages]
+python makerdsfromeland2.py label infilename outrdsfile [--append] [--RNA ucscGeneModels]
+[propertyName::propertyValue] [--index] [--paired 1 or 2] [--extended] [--verbose]
+[--olddelimiter] [--maxlines num] [--cache numPages]
The first 3 arguments are required:
- label is any label that you wish (a combination flowcell+lane#
- outdbname is the name of the rds file, e.g. test.rds
If the reads are from paired-end runs, enter each eland_multi
-(or extended) file separately with the "-paired 1" or "-paired 2"
+(or extended) file separately with the "--paired 1" or "--paired 2"
flag, as appropriate.
-If entering more than one lane, use -append for all subsequent
-lanes. Upon entering the last lane, use -index to build a read
+If entering more than one lane, use --append for all subsequent
+lanes. Upon entering the last lane, use --index to build a read
index. Refer to MANIPULATING RDS METADATA AND CACHING for
information on the optional property::value pairs and caching.
For RNA-seq, you must in addition specify the path to knownGene.txt
-using the -RNA flag, e.g.
+using the --RNA flag, e.g.
-python $ERANGEPATH/makerdsfromeland2.py myRNAlabel myRNA.eland_multi.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+python $ERANGEPATH/makerdsfromeland2.py myRNAlabel myRNA.eland_multi.txt rnatest.rds --RNA ../mm9/knownGene.txt [more options]
6. MAPPING READS WITH BOWTIE
The options for the script are:
python makerdsfrombowtie.py label infilename outrdsfile
-[-RNA ucscGeneModels] [-append] [-index] [propertyName::propertyValue]
-[-rawreadID] [-verbose] [-cache numPages]
+[--RNA ucscGeneModels] [--append] [--index] [propertyName::propertyValue]
+[--rawreadID] [--verbose] [--cache numPages]
Refer to "MAPPING READS WITH ELAND" for a description of label,
-infilename, outdbname, '-append', '-index', and '-cache'.
+infilename, outdbname, '--append', '--index', and '--cache'.
-****REMEMBER TO USE -index WHEN LOADING THE LAST LANE OF YOUR
+****REMEMBER TO USE --index WHEN LOADING THE LAST LANE OF YOUR
DATASET.****
The script assumes that the read ID are from Illumina, i.e. that
throw_away:uniqueid/1 and throw_away:uniqueid/2 for paired-ends.
For RNA-seq, you must in addition specify the path to knownGene.txt
-using the -RNA flag, e.g.
+using the --RNA flag, e.g.
-python $ERANGEPATH/makerdsfrombowtie.py myRNAlabel myRNA.bowtie.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+python $ERANGEPATH/makerdsfrombowtie.py myRNAlabel myRNA.bowtie.txt rnatest.rds --RNA ../mm9/knownGene.txt [more options]
7. MAPPING READS WITH BLAT
script is used to import the mapped reads (in the example
above s3_1.hg18.blatbetter) into RDS:
-python makerdsfromblat.py label infilename outrdsfile [-append] [-index] [propertyName::propertyValue]
-[-rawreadID] [-forceRNA] [-flag] [-strict minSpliceLen] [-spliceonly] [-verbose] [-cache numPages]
+python makerdsfromblat.py label infilename outrdsfile [--append] [--index] [propertyName::propertyValue]
+[--rawreadID] [--forceRNA] [--flag] [--strict minSpliceLen] [--spliceonly] [--verbose] [--cache numPages]
If you are using BLAT for RNA-seq, please be sure to use
--forceRNA in order to import spliced reads and consider
-using -strict to require a minimum length of bases on
+--forceRNA in order to import spliced reads and consider
+using --strict to require a minimum length of bases on
each side of the splice.
You can combine BOWTIE and BLAT by mapping reads with BOWTIE
first, and then using BLAT to map the unmapped reads. In
that case, you may want to only load the spliced reads
-using the -spliceonly flag. To track those reads in the RDS
-file, use -flag ; you can then retrieve those reads using
-the options "-flag blat -flagLike" with the makebedfromrds.py
+using the --spliceonly flag. To track those reads in the RDS
+file, use --flag ; you can then retrieve those reads using
+the options "--flag blat --flagLike" with the makebedfromrds.py
script.
The command line options are similar to those for other
scripts described in part 5-7:
-python makerdsfrombed.py label bedfile outrdsfile [-append] [-index] [propertyName::propertyValue] [-cache numPages]
+python makerdsfrombed.py label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]
9. COMBINING RDS FILES
The combinerds.py command options are:
-python combinerds.py destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [-init] [-initrna] [-index] [-cache pages]
+python combinerds.py destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]
10. MANIPULATING RDS METADATA AND CACHING
RELEASE HISTORY
+version 3.3 November 2010 - updated command line options
version 3.2 October 2009 - added combinerds.py
version 3.01 February 2009 - bug fixes
version 3.0 January 2009 - added logging to buildrdsfrom*
(a) is the default in the current release of ERANGE.
Simply proceed to RUNNING THE PEAK FINDER for (a) and
-(a). You can ignore multireads (b) by using the -nomulti
+(a). You can ignore multireads (b) by using the --nomulti
flag with findall.py. For (c), use weighMultireads.py
to weigh multireads based on a unique reads in the
respective radius of each potential location. Once run,
To run the peak finder without read shifting, use the
following command:
-python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -listPeak -revbackground
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt --control control.rds --listPeak --revbackground
which will run the peak finder on chip.rds / control.rds ,
store the enriched region coordinates in chip.regions.txt,
You will *NEED* to change some of the default parameters
if working in smaller genomes (e.g. use smaller -spacing),
if working with certain types of IPs such as histones and
-polymerases (test with and without -notrim and
--nodirectionality), if working with rather weak IPs
-(e.g. -minimum and -ratio), or if working with larger
+polymerases (test with and without --notrim and
+--nodirectionality), if working with rather weak IPs
+(e.g. --minimum and --ratio), or if working with larger
fragment sizes (see the paragraph below discussing read
shifting).
findall.py returns a per-peak p-value. By default, this
is calculated using a Poisson distribution of peak RPMs
-(or counts, if using -raw) for each chromosome in the IP.
+(or counts, if using --raw) for each chromosome in the IP.
P-value calculations can be turned off using
-'-pvalue none '. Alternatively, the p-value can be
+'--pvalue none '. Alternatively, the p-value can be
calculated from the background using the option
-'-pvalue back ', which must be combined with the option
--revbackground.
+'--pvalue back ', which must be combined with the option
+--revbackground.
By default, findall.py does not try to adjust the location
of the reads based on half the size of the expected fragment
length (the "shift"). If you believe that you need to shift
your peaks, findall.py can try to pick the best shift based
on the best shift for strong sites using the parameter
-'-shift learn '. You can also either manually specify a
-shift value using '-shift #bp ' or ou can calculate a
-"best shift" for each region using '-autoshift'. If you
+'--shift learn '. You can also either manually specify a
+shift value using '--shift #bp ' or you can calculate a
+"best shift" for each region using '--autoshift'. If you
need to using the shift options, the recommended usage is:
-(i) first run findall.py with '-shift learn ', which will
+(i) first run findall.py with '--shift learn ', which will
peak a shift if there are at least 30 regions that meet
its training criteria.
(ii) if (i) couldn't pick a shift, run findall.py with
--autoshift and -reportshift
+--autoshift and --reportshift
(iii) look at the mode (most common #) for the shift
-(iv) rerun findall.py with -shift #bp where #bp is the mode
+(iv) rerun findall.py with --shift #bp where #bp is the mode
If you are storing the RDS files on an network-mounted
-directory, make sure to use '-cache XXXXX' to enable
+directory, make sure to use '--cache XXXXX' to enable
local caching, where is as large as appropriate as
described in section 9 of README.build-rds .
RELEASE HISTORY
+version 3.2 November 2010 - updated command line options
version 3.1 February 2009 - support for read shifting
version 3.0 February 2009 - support for UCSC narrowPeak format in regiontobed.py
version 3.0rc1 December 2008 - added parameter to control peak-trimming
and get back a version number and all possible command line options:
version 1.0
-usage: python $ERANGEPATH/combineRPKMs.py firstRPKM expandedRPKM finalRPKM combinedOutfile [-withmultifraction]
+usage: python $ERANGEPATH/combineRPKMs.py firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]
where fields in brackets are optional.
model to identify candidate regions:
# Alternative 1: find new regions outside of gene models with reads piled up
-python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
# use "none" if you don't have a repeatmask database
-python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good --log rna.log --startField 1 --cache 1
In alternative 2, we pool multiple RNA-seq datasets into a single
RDS database, run it through the two scripts of alternative 1 above,
$ERANGEPATH/gfftocis.py infile.gff outfile.cis
-NOTE THAT YOU WILL MOST LIKELY HAVE TO EDIT THIS FILE TO
-ACCOMODATE YOUR SPECIFIC GFF FORMAT TO THE CISTEMATIC
-FORMAT, WHICH IS
+NOTE THAT THIS FILE IS PROVIDED AS AN EXAMPLE ONLY. YOU WILL MOST
+LIKELY HAVE TO EDIT THIS FILE TO ACCOMMODATE YOUR SPECIFIC GFF
+FORMAT TO THE CISTEMATIC FORMAT, WHICH IS
geneID<tab>uniqRef<tab>chrom<tab>start<tab>stop<tab>sense<tab>type<return>
RELEASE HISTORY
+version 3.3 November 2010 - updated command line options
version 3.2 December 2009 - support for custom genome annotations with Cistematic 3.0
version 3.1 April 2009 - modified normalizeFinalExonic.py to remove genome
version 3.0 January 2009 - added logging to shell pipelines
specifying a distance to distalPairs.py that is greater than the length of the
largest existing genomic contig. For example:
-python ../commoncode/distalPairs.py 20000 rna_on_genomic.rds rna_on_genomic.crosspairs -splices -cache 20000000
+python ../commoncode/distalPairs.py 20000 rna_on_genomic.rds rna_on_genomic.crosspairs --splices --cache 20000000
4. RUNNING RNAPATH.py
-You can now run RNAPATH.py. I suggest optionallly using the included script processvelvet.py to rename the contigs, before running blat and generating the crosspair data.
+You can now run RNAPATH.py. I suggest optionally using the included script processvelvet.py to rename the contigs, before running blat and generating the crosspair data.
Example: $ERANGEPATH/rnapath/RNAPATH.py genomic_contigs.fa rna_on_genomic.crosspairs RNAPATH.log genome.RNAPATH.fa
-version 3.2 May 2010 - first release
+version 3.3 November 2010 - updated command line options
+version 3.2 May 2010 - first release
# export CISTEMATIC_ROOT=/my/path/to/cistematic_genomes
#
# preliminary: set ERANGEPATH, e.g.
-# export ERANGEPATH=/proj/genome/experiments/commoncode
+# export ERANGEPATH=/my/path/to/erange
#
# preliminary: set CISTEMATIC_TEMP to a local directory with ample space (default is /tmp), e.g.
# export CISTEMATIC_TEMP=/any/local/dir
# create rds file with one lane's worth of data (add -index if using only one lane)
# The example below sets the default cache to 1000000
# The name::value pairs are optional documentart metadata, and can be set to any desired name or value
-python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX
# can change a database cache size using rdsmetadata.py to speed up indexing and index-based lookups
# rule of thumb for RNA-seq: set the cache size to half of the RAM on the computer
-#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds -defaultcache 2000000 -nocount
+#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds --defaultcache 2000000 --nocount
# append more data (only add -index when adding last lane)
-python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 -append -index
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 --append --index
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
-python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count -markGID -cache 1
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count --markGID --cache 1
# count splice reads
-python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count -splices -noUniqs -cache 1
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count --splices --noUniqs --cache 1
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
-python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm --cache
# recount the unique reads with weights calculated during the first pass
-python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount -uniq -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount --uniq --cache 1
# There is a choice of either identifying new regions from the data alone
# (Alternative 1), or using a pre-computed list of new regions (presumably
# file (Alternative 2)
# Alternative 1: find new regions outside of gene models with reads piled up
-python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
# use "none" if you don't have a repeatmask database
-python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good --log rna.log --startField 1 --cache 1
# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
#python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good
# map all candidate regions that are within a 20kb radius of a gene in bp
# take out -cache if running locally
-python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 -radius 20001 -trackfar -cache
+python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 --radius 20001 --trackfar --cache
# calculate expanded exonic read density
-python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm --cache
# create bed file of accepted candidate regions
python2.5 $ERANGEPATH/regiontobed.py RNAFAR LHCN10213.accepted.rpkm RNAFAR.bed 255,0,0
# weigh multi-reads
-python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count -accept LHCN10213.accepted.rpkm -multi -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count --accept LHCN10213.accepted.rpkm --multi --cache 1
# calculate final exonic read density
-python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm -multifraction -withGID -cache
+python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm --multifraction --withGID --cache
#!/bin/bash
-echo 'buildMatrix.sh: version 1.1'
+echo 'buildMatrix.sh: version 1.2'
indexPrev=0
indexCur=0
truncateRPKM=""
if [ $# -eq 3 ]; then
- truncateRPKM="-truncate "$3
+ truncateRPKM="--truncate "$3
fi
if [ $# -eq 4 ]; then
- truncateRPKM="-rescale -truncate "$3
+ truncateRPKM="--rescale --truncate "$3
fi
if [ $# -lt 2 ]; then
- echo 'usage: buildMatrix.sh name datalist.file [truncateRPKM] [-rescale]'
+ echo 'usage: buildMatrix.sh name datalist.file [truncateRPKM] [--rescale]'
echo
echo 'where the datalist file is a comma-delimited list of prefix and rds-files'
echo
FILELIST=$FILELIST$line
let N=N+1
done < $3
- python $ERANGEPATH/partition.py $N.way $FILELIST $PARTNAME$N.part -minFeature $MINSIZE -nomerge -locid -norandom
+ python $ERANGEPATH/partition.py $N.way $FILELIST $PARTNAME$N.part --minFeature $MINSIZE --nomerge --locid --norandom
fi
cachepages=""
if [ $# -eq 3 ]; then
- cachepages="-cache "$3
+ cachepages="--cache "$3
fi
if [ $# -lt 2 ]; then
prefix=`echo $line | cut -f 1 -d ','`
rds=`echo $line | cut -f 2 -d ','`
if [ -e $rds ]; then
- python $ERANGEPATH/regionCounts.py $1 $rds $prefix.partcount -force -nomerge -rpkm $cachepages
+ python $ERANGEPATH/regionCounts.py $1 $rds $prefix.partcount --force --nomerge --rpkm $cachepages
else
echo "could not find $rds - skipping"
python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "could not find $rds - skipping"
if [ -z "$ERANGEPATH" ]
then
- ERANGEPATH='../commoncode'
+ ERANGEPATH='../erange'
fi
-echo 'runRNAPairedAnalysis.sh: version 3.7'
+echo 'runRNAPairedAnalysis.sh: version 3.8'
models=""
if [ $# -eq 5 ]; then
- models=" -models "$5
+ models=" --models "$5
fi
replacemodels=""
if [ $# -eq 6 ]; then
- replacemodels=" -models $5 -replacemodels "
+ replacemodels=" --models $5 --replacemodels "
fi
if [ -z "$1" ]
then
echo
- echo 'usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [-replacemodels]'
+ echo 'usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]'
echo
echo 'where rdsprefix is the name of the rds file without the .rds extension'
echo 'use "none" for the repeatmaskdb if you do not have one'
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
-echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --markGID --cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --markGID --cache 1 $models $replacemodels
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache $models $replacemodels
# recount the unique reads with weights calculated during the first pass
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --uniq --cache 1 $models $replacemodels
# count splice reads
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -markGID -cache 1 $models $replacemodels
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --splices --noUniqs --markGID --cache 1 $models $replacemodels
# find new regions outside of gene models with reads piled up
-python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
# filter out new regions that overlap repeats more than a certain fraction
-python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.checked -startField 1 -log rna.log -cache 1
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.checked --startField 1 --log rna.log --cache 1
# calculate the read densities
-python $ERANGEPATH/regionCounts.py $2.newregions.checked $2.rds $2.newregions.good -markRDS -cache -log rna.log
+python $ERANGEPATH/regionCounts.py $2.newregions.checked $2.rds $2.newregions.good --markRDS --cache --log rna.log
# map all candidate regions that have paired ends overlapping with known genes
-python $ERANGEPATH/rnafarPairs.py $1 $2.newregions.good $2.rds $2.candidates.txt -cache $models $replacemodels
+python $ERANGEPATH/rnafarPairs.py $1 $2.newregions.good $2.rds $2.candidates.txt --cache $models $replacemodels
# calculate expanded exonic read density
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache $models $replacemodels
# weigh multi-reads
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --multi --cache 1 $models $replacemodels
# calculate final exonic read density
-python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
fi
if [ -z "$ERANGEPATH" ]
then
- ERANGEPATH='../commoncode'
+ ERANGEPATH='../erange'
fi
-echo 'runSNPAnalysis.sh: version 3.1'
+echo 'runSNPAnalysis.sh: version 3.2'
cachepages=""
if [ $# -eq 9 ]; then
- cachepages="-cache "$9
+ cachepages="--cache "$9
fi
nosplices=""
if [ $# -eq 10 ]; then
- nosplices=" -nosplices "
+ nosplices=" --nosplices "
fi
if [ $# -lt 8 ]; then
python $ERANGEPATH/recordLog.py snp.log runSNPAnalysis.sh "with parameters: $arguments"
# get all SNPs by extracting it from the RDS
-python $ERANGEPATH/getSNPs.py $2 $6 $7 $3.snps.txt -enforceChr $cachepages $nosplices
+python $ERANGEPATH/getSNPs.py $2 $6 $7 $3.snps.txt --enforceChr $cachepages $nosplices
# get SNPs in non-repeat regions only
python $ERANGEPATH/chkSNPrmask.py $4 $3.snps.txt $3.nr_snps.txt $cachepages
if [ -z "$ERANGEPATH" ]
then
- ERANGEPATH='../commoncode'
+ ERANGEPATH='../erange'
fi
-echo 'runStandardAnalysis.sh: version 4.2'
+echo 'runStandardAnalysis.sh: version 4.3'
models=""
if [ $# -eq 5 ]; then
- models=" -models "$5
+ models=" --models "$5
fi
replacemodels=""
if [ $# -eq 6 ]; then
- replacemodels=" -models $5 -replacemodels "
+ replacemodels=" --models $5 --replacemodels "
fi
if [ -z "$1" ]
then
echo
- echo 'usage:runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [-replacemodels]'
+ echo 'usage:runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]'
echo
echo 'where rdsprefix is the name of the rds file without the .rds extension'
echo 'use "none" for the repeatmaskdb if you do not have one'
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
-echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --markGID --cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --markGID --cache 1 $models $replacemodels
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
-echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels"
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache $models $replacemodels
# recount the unique reads with weights calculated during the first pass
-echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels"
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --uniq --cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --uniq --cache 1 $models $replacemodels
# count splice reads
-echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1 $models $replacemodels"
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1 $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --splices --noUniqs --cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --splices --noUniqs --cache 1 $models $replacemodels
# Alternative 1: find new regions outside of gene models with reads piled up
-echo "python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1"
-python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+echo "python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1"
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
-echo "python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -cache 1"
-python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -log rna.log -startField 1 -cache 1
+echo "python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good --log rna.log --startField 1 --cache 1"
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good --log rna.log --startField 1 --cache 1
# map all candidate regions that are within a given radius of a gene in bp
-echo "python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache $models $replacemodels"
-python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache $models $replacemodels
+echo "python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt --radius $4 --trackfar --cache $models $replacemodels"
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt --radius $4 --trackfar --cache $models $replacemodels
# make sure candidates.txt file exists
echo "touch $2.candidates.txt"
touch $2.candidates.txt
# calculate expanded exonic read density
-echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels"
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache $models $replacemodels
# weigh multi-reads
-echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels"
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --multi --cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --multi --cache 1 $models $replacemodels
# calculate final exonic read density
-echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache"
-python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache"
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
fi
\ No newline at end of file
if [ -z "$ERANGEPATH" ]
then
- ERANGEPATH='../commoncode'
+ ERANGEPATH='../erange'
fi
-echo 'runStrandedAnalysis.sh: version 4.1'
+echo 'runStrandedAnalysis.sh: version 4.2'
if [ -z "$1" ]
then
# count the unique reads falling on the gene models ; the nomatch files are
# mappable reads that fell outside of the Cistematic gene models and not the
# unmappable of Eland (i.e, the "NM" reads)
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -stranded -markGID -cache 1
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --stranded --markGID --cache 1
# calculate a first-pass RPKM to re-weigh the unique reads,
# using 'none' for the splice count
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache
# recount the unique reads with weights calculated during the first pass
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -stranded -uniq -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --stranded --uniq --cache 1
# count splice reads
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -stranded -splices -noUniqs -cache 1
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --stranded --splices --noUniqs --cache 1
# find new regions outside of gene models with reads piled up
-python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter plus -log rna.log -cache 1
-python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter minus -log rna.log -cache 1 -append
+python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --strandfilter plus --log rna.log --cache 1
+python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --strandfilter minus --log rna.log --cache 1 --append
# filter out new regions that overlap repeats more than a certain fraction
-python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -log rna.log -cache 1
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good --startField 1 --log rna.log --cache 1
# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
#python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
#python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
# map all candidate regions that are within a given radius of a gene in bp
-python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -stranded -cache
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt --radius $4 --trackfar --stranded --cache
# calculate expanded exonic read density
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache
# weigh multi-reads
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -stranded -multi -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --stranded --multi --cache 1
# calculate final exonic read density
-python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
fi
except:
pass
-import sys, time
+import sys
+import time
import optparse
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
-print "%prog: version 1.3"
+print "farPairs: version 1.4"
def main(argv=None):
usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--verbose", action="store_true", dest="doVerbose")
- parser.add_option("--minDist", type="int", dest="minDist")
- parser.add_option("--maxDist", type="int", dest="maxDist")
- parser.add_option("--minCount", type="int", dest="minCount")
- parser.add_option("--label", dest="label")
- parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None,
- minDist=1000, maxDist=500000, minCount=2, label=None)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.label)
+def getParser(usage):
+    """Build the optparse parser for farPairs.
+
+    usage is handed to optparse.OptionParser as the usage string.
+    Option defaults are looked up in the "farPairs" section through the
+    commoncode getConfig*Option helpers; the literal passed as the last
+    argument of each helper call is the built-in default value.
+
+    Returns the configured optparse.OptionParser.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--minDist", type="int", dest="minDist")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--minCount", type="int", dest="minCount")
+    parser.add_option("--label", dest="label")
+
+    # Call getConfigParser so the helpers below receive its return value
+    # rather than the function object itself.
+    configParser = getConfigParser()
+    section = "farPairs"
+    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    # NOTE(review): --cache is declared type="int" but its default is read
+    # with getConfigOption; confirm whether getConfigIntOption is intended.
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    minDist = getConfigIntOption(configParser, section, "minDist", 1000)
+    maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
+    minCount = getConfigIntOption(configParser, section, "minCount", 2)
+    label = getConfigOption(configParser, section, "label", None)
+
+    parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+                        minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
+
+    return parser
+
+
def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
if label is None:
label = rdsfile
- RDS = readDataset(rdsfile, verbose=True, cache=doCache)
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache)
rdsChromList = RDS.getChromosomes()
if doVerbose:
continue
print chromosome
- uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+ uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
if doVerbose:
print len(uniqDict), time.ctime()
readList = uniqDict[readID]
if len(readList) == 2:
total += 1
- (start1, flag1, pair1) = readList[0]
- (start2, flag2, pair2) = readList[1]
+ start1 = readList[0]["start"]
+ flag1 = readList[0]["flag"]
+ start2 = readList[1]["start"]
+ flag2 = readList[1]["flag"]
if flag1 != flag2:
dist = abs(start1 - start2)
except:
pass
-import sys, optparse
+import sys
+import optparse
from cistematic.core import featuresIntersecting
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption
-print "%prog: version 1.0"
+
+print "featureIntersects: version 1.1"
def main(argv=None):
usage = "usage: python %s tabfile [--cistype type] [--radius radius]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--cistype", action="store_false", dest="cistype")
- parser.add_option("--radius", type="int", dest="radius")
- parser.set_defaults(cistype="TFBSCONSSITES", radius=100)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 1:
featureIntersects(tabfile, options.cistype, options.radius)
+def makeParser(usage=""):
+    """Build the optparse parser for featureIntersects.
+
+    usage is handed to optparse.OptionParser as the usage string.
+    Defaults for cistype and radius are looked up in the
+    "featureIntersects" section through the commoncode getConfig*Option
+    helpers, falling back to "TFBSCONSSITES" and 100.
+
+    Returns the configured optparse.OptionParser.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    # --cistype takes a value (usage: "[--cistype type]") and its default is
+    # a string, so it uses the default "store" action; "store_false" would
+    # overwrite cistype with False and never consume the value.
+    parser.add_option("--cistype", dest="cistype")
+    parser.add_option("--radius", type="int", dest="radius")
+
+    configParser = getConfigParser()
+    section = "featureIntersects"
+    cistype = getConfigOption(configParser, section, "cistype", "TFBSCONSSITES")
+    radius = getConfigIntOption(configParser, section, "radius", 100)
+
+    parser.set_defaults(cistype=cistype, radius=radius)
+
+    return parser
+
+
def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
tabfile = open(tabFileName)
previous = ""
except:
pass
-import sys, os, optparse
+import sys
+import os
+import optparse
from cistematic.experiments.fasta import Fasta
from cistematic.programs.meme import Meme
from cistematic.programs.cisGreedy import CisGreedy
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
#TODO: cisSampler is not supported yet!
#from cistematic.programs.cisSampler import CisSampler
-print "%prog: version 3.4"
+print "findMotifs: version 3.5"
def main(argv=None):
if not argv:
usage = "usage: python %prog explabel regions.fsa [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--meme", action="store_true", dest="doMeme")
- parser.add_option("--cisGreedy", action="store_true", dest="doCisGreedy")
- parser.add_option("--logo", action="store_true", dest="saveLogo")
- parser.add_option("--threshold", type="float", dest="threshold")
- parser.add_option("--prefix", dest="motifPrefix")
- parser.add_option("--numMotifs", dest="numMotifs")
- parser.add_option("--maxWidth", type="int", dest="maxWidth")
- parser.add_option("--maskLower", action="store_true", dest="maskLower")
- parser.set_defaults(doMeme=False, doCisGreedy=False, saveLogo=False,
- threshold=75., numMotifs="10", maxWidth=28, maskLower=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
doCisSampler)
def getParser(usage):
    """Create the optparse parser used by findMotifs.

    usage: usage string passed through to optparse.OptionParser.

    Every default except motifPrefix (which has none) is taken from the
    "findMotifs" section of the parser returned by getConfigParser(), with
    the hard-coded fallbacks given below.

    Returns the configured optparse.OptionParser.
    """
    optionParser = optparse.OptionParser(usage=usage)

    # The leading on/off switches share the same shape; register them in order.
    for switch, destination in (("--meme", "doMeme"),
                                ("--cisGreedy", "doCisGreedy"),
                                ("--logo", "saveLogo")):
        optionParser.add_option(switch, action="store_true", dest=destination)

    optionParser.add_option("--threshold", type="float", dest="threshold")
    optionParser.add_option("--prefix", dest="motifPrefix")
    optionParser.add_option("--numMotifs", dest="numMotifs")
    optionParser.add_option("--maxWidth", type="int", dest="maxWidth")
    optionParser.add_option("--maskLower", action="store_true", dest="maskLower")

    config = getConfigParser()
    sectionName = "findMotifs"
    # numMotifs is kept as a string, matching the "10" fallback.
    defaults = {
        "doMeme": getConfigBoolOption(config, sectionName, "doMeme", False),
        "doCisGreedy": getConfigBoolOption(config, sectionName, "doCisGreedy", False),
        "saveLogo": getConfigBoolOption(config, sectionName, "saveLogo", False),
        "threshold": getConfigFloatOption(config, sectionName, "threshold", 75.),
        "numMotifs": getConfigOption(config, sectionName, "numMotifs", "10"),
        "maxWidth": getConfigIntOption(config, sectionName, "maxWidth", 28),
        "maskLower": getConfigBoolOption(config, sectionName, "maskLower", False),
    }
    optionParser.set_defaults(**defaults)

    return optionParser
+
+
def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75.,
numMotifs="10", maxWidth=28, maskLower=False, doCisSampler=False):
import math
import string
import optparse
-from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
+from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
+import ReadDataset
+import Region
-versionString = "%s: version 3.2" % sys.argv[0]
+versionString = "findall: version 3.2"
print versionString
def usage():
if not argv:
argv = sys.argv
+ parser = makeParser()
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ usage()
+ sys.exit(2)
+
+ factor = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
+ options.stringency, options.noshift, options.autoshift, options.reportshift,
+ options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
+ options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
+ options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
+ options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+ options.strandfilter, options.combine5p)
+
+
+def makeParser():
usage = __doc__
parser = optparse.OptionParser(usage=usage)
parser.add_option("--append", action="store_true", dest="doAppend")
parser.add_option("--RNA", action="store_true", dest="rnaSettings")
parser.add_option("--combine5p", action="store_true", dest="combine5p")
- parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
- stringency=4.0, noshift=False, autoshift=False, reportshift=False,
- minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
- normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
- trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
- cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
- strandfilter=None, combine5p=False)
- (options, args) = parser.parse_args(argv[1:])
-
- if len(args) < 3:
- usage()
- sys.exit(2)
-
- factor = args[0]
- hitfile = args[1]
- outfilename = args[2]
-
- findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
- options.stringency, options.noshift, options.autoshift, options.reportshift,
- options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
- options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
- options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
- options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
- options.strandfilter, options.combine5p)
+ configParser = getConfigParser()
+ section = "findall"
+ minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
+ minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
+ maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
+ listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
+ shift = getConfigOption(configParser, section, "shift", None)
+ stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
+ noshift = getConfigBoolOption(configParser, section, "noshift", False)
+ autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
+ reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
+ minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
+ maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
+ leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
+ minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
+ normalize = getConfigBoolOption(configParser, section, "normalize", True)
+ logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
+ withFlag = getConfigOption(configParser, section, "withFlag", "")
+ doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
+ trimValue = getConfigOption(configParser, section, "trimValue", None)
+ doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
+ doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
+ rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
+ cachePages = getConfigOption(configParser, section, "cachePages", None)
+ ptype = getConfigOption(configParser, section, "ptype", None)
+ mockfile = getConfigOption(configParser, section, "mockfile", None)
+ doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
+ noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
+ strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+ combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
+
+ parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
+ stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
+ minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
+ normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
+ trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
+ cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+ strandfilter=strandfilter, combine5p=combine5p)
+
+ return parser
def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
strandfilter=None, combine5p=False):
- shiftValue = 0
- if autoshift:
- shiftValue = "auto"
-
- if shift is not None:
- try:
- shiftValue = int(shift)
- except ValueError:
- if shift == "learn":
- shiftValue = "learn"
- print "Will try to learn shift"
-
- if noshift:
- shiftValue = 0
+ shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
if trimValue is not None:
trimValue = float(trimValue) / 100.
if rnaSettings:
print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
- shiftValue = 0
doTrim = False
doDirectionality = False
writeLog(logfilename, versionString, string.join(sys.argv[1:]))
if doControl:
print "\ncontrol:"
- mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
+ mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
if cachePages > mockRDS.getDefaultCacheSize():
mockRDS.setDBcache(cachePages)
print "\nsample:"
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
readlen = hitRDS.getReadSize()
if rnaSettings:
maxSpacing = readlen
- print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
- print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
- print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
-
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
- hitRDSsize = len(hitRDS) / 1000000.
- if doControl:
- mockRDSsize = len(mockRDS) / 1000000.
-
- if normalize:
- if doControl:
- mockSampleSize = mockRDSsize
-
- hitSampleSize = hitRDSsize
-
if doAppend:
- outfile = open(outfilename, "a")
+ fileMode = "a"
else:
- outfile = open(outfilename, "w")
+ fileMode = "w"
+
+ outfile = open(outfilename, fileMode)
outfile.write("#ERANGE %s\n" % versionString)
if doControl:
- outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
+ mockRDSsize = len(mockRDS) / 1000000.
+ controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
else:
- outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
+ controlSampleString = " none"
+
+ hitRDSsize = len(hitRDS) / 1000000.
+ outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
if withFlag != "":
outfile.write("#restrict to Flag = %s\n" % withFlag)
+ print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
+ print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
+ print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+
outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
if doControl:
mockChromList = mockRDS.getChromosomes()
+ if normalize:
+ if doControl:
+ mockSampleSize = mockRDSsize
+
+ hitSampleSize = hitRDSsize
+
hitChromList.sort()
for chromosome in hitChromList:
continue
print "chromosome %s" % (chromosome)
- hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
+ hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
+ doMulti=useMulti, findallOptimize=True, strand=stranded,
+ combine5p=combine5p)
maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
if shiftValue == "learn":
- shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
- stringency, readlen, minHits, logfilename, outfile, outfilename)
+ shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
+ mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
+ logfilename, outfile, outfilename)
- regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
- normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
- shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
- noMulti, doControl, factor, trimValue, outputRegionList=True)
+ regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
+ chromosome, useMulti, normalize, maxSpacing,
+ doDirectionality, doTrim, minHits, minRatio,
+ readlen, shiftValue, minPeak, minPlusRatio,
+ maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+ doControl, factor, trimValue, outputRegionList=True)
statistics["index"] += regionStats["index"]
statistics["total"] += regionStats["total"]
#now do background swapping the two samples around
print "calculating background..."
backgroundTrimValue = 1/20.
- backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
- normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
- shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
- noMulti, doControl, factor, backgroundTrimValue)
+ backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
+ chromosome, useMulti, normalize, maxSpacing,
+ doDirectionality, doTrim, minHits, minRatio,
+ readlen, shiftValue, minPeak, minPlusRatio,
+ maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+ doControl, factor, backgroundTrimValue)
statistics["mIndex"] += backgroundRegionStats["index"]
statistics["mTotal"] += backgroundRegionStats["total"]
writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
def determineShiftValue(autoshift, shift, noshift, rnaSettings):
    """Resolve the read-shift setting from the individual option values.

    autoshift: when true, the shift defaults to "auto".
    shift: None, a value int() accepts, or the string "learn".
    noshift: when true, forces a shift of 0.
    rnaSettings: when true, also forces a shift of 0.

    Returns 0, an int, "auto" or "learn".
    """
    # noshift and rnaSettings override every other setting, so settle that
    # first; this also avoids announcing shift learning that is then discarded.
    if noshift or rnaSettings:
        return 0

    shiftValue = 0
    if autoshift:
        shiftValue = "auto"

    if shift is not None:
        try:
            shiftValue = int(shift)
        except ValueError:
            # A non-integer value other than "learn" is ignored and the
            # autoshift-derived value above is kept.
            if shift == "learn":
                shiftValue = "learn"
                print("Will try to learn shift")

    return shiftValue
+
+
def doNotProcessChromosome(chromosome, doControl, mockChromList):
skipChromosome = False
if chromosome == "chrM":
def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
- stringency, readlen, minHits, logfilename, outfile, outfilename):
+ stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
- print "learning shift.... will need at least 30 training sites"
+ print "learning shift.... will need at least %d training sites" % minSites
previousHit = -1 * maxSpacing
hitList = [-1]
- weightList = [0]
+ totalWeight = 0
readList = []
shiftDict = {}
count = 0
numStarts = 0
- for (pos, sense, weight) in hitDict[chrom]:
+ for read in hitDict[chrom]:
+ pos = read["start"]
+ sense = read["sense"]
+ weight = read["weight"]
if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
- sumAll = sum(weightList)
+ sumAll = totalWeight
if normalize:
sumAll /= hitSampleSize
count += 1
hitList = []
- weightList = []
+ totalWeight = 0
readList = []
numStarts = 0
numStarts += 1
hitList.append(pos)
- weightList.append(weight)
- readList.append((pos, sense, weight))
+ totalWeight += weight
+ readList.append({"start": pos, "sense": sense, "weight": weight})
previousHit = pos
bestShift = 0
bestCount = 0
- outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
+ learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
+ stringency * minRatio, stringency * readlen),
+ "#number of training examples: %d" % count]
+ outline = string.join(learningSettings, "\n")
print outline
writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
- if count < 30:
+ if count < minSites:
outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
- print outline
+ print >> outfile, outline
writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
shiftValue = 0
else:
noMulti, doControl, factor, trimValue, outputRegionList=False):
index = 0
- total = 0
+ totalRegionWeight = 0
failedCounter = 0
previousHit = - 1 * maxSpacing
currentHitList = [-1]
- currentWeightList = [0]
+ currentTotalWeight = 0
+ currentUniqReadCount = 0
currentReadList = []
regionWeights = []
outregions = []
numStarts = 0
hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
- for (pos, sense, weight) in hitDict[chrom]:
+ for read in hitDict[chrom]:
+ pos = read["start"]
+ sense = read["sense"]
+ weight = read["weight"]
if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
- sumAll = sum(currentWeightList)
+ sumAll = currentTotalWeight
if normalize:
sumAll /= rdsSampleSize
foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
if foldRatio >= minRatio:
# first pass, with absolute numbers
- if doDirectionality:
- (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
- else:
- (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
-
- bestPos = topPos[0]
- peakScore = smoothArray[bestPos]
+ peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
+
+ bestPos = peak.topPos[0]
+ numHits = peak.numHits
+ peakScore = peak.smoothArray[bestPos]
+ numPlus = peak.numPlus
+ shift = peak.shift
+ numLeft = peak.numLeft
if normalize:
peakScore /= rdsSampleSize
stop = regionStop - regionStart - 1
startFound = False
while not startFound:
- if smoothArray[start] >= minSignalThresh or start == bestPos:
+ if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
startFound = True
else:
start += 1
stopFound = False
while not stopFound:
- if smoothArray[stop] >= minSignalThresh or stop == bestPos:
+ if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
stopFound = True
else:
stop -= 1
regionStop = regionStart + stop
regionStart += start
- try:
- if doDirectionality:
- (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
- else:
- (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
- except:
- continue
+ trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
+ sumAll = trimPeak.numHits
+ numPlus = trimPeak.numPlus
+ numLeft = trimPeak.numLeft
if normalize:
sumAll /= rdsSampleSize
sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
# just in case it changed, use latest data
try:
- bestPos = topPos[0]
- peakScore = smoothArray[bestPos]
+ bestPos = trimPeak.topPos[0]
+ peakScore = trimPeak.smoothArray[bestPos]
except:
continue
peakScore /= rdsSampleSize
elif outputRegionList:
- sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
+ sumMulti = currentTotalWeight - currentUniqReadCount
if outputRegionList:
# normalize to RPM
plusRatio = float(numPlus)/numHits
if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
if outputRegionList:
- peak = ""
+ peakDescription = ""
if listPeak:
- peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
+ peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
if doDirectionality:
if leftPlusRatio < numLeft / numPlus:
plusP = plusRatio * 100.
leftP = 100. * numLeft / numPlus
# we have a region that passes all criteria
- outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
+ region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
+ factor, index, chrom, sumAll,
+ foldRatio, multiP, plusP, leftP,
+ peakDescription, shift)
+ outregions.append(region)
- total += sumAll
+ totalRegionWeight += sumAll
else:
failedCounter += 1
else:
# we have a region, but didn't check for directionality
index += 1
- total += sumAll
+ totalRegionWeight += sumAll
if outputRegionList:
- outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
+ region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
+ sumAll, foldRatio, multiP, peakDescription, shift)
+ outregions.append(region)
currentHitList = []
- currentWeightList = []
+ currentTotalWeight = 0
+ currentUniqReadCount = 0
currentReadList = []
numStarts = 0
numStarts += 1
currentHitList.append(pos)
- currentWeightList.append(weight)
- currentReadList.append((pos, sense, weight))
+ currentTotalWeight += weight
+ if weight == 1.0:
+ currentUniqReadCount += 1
+
+ currentReadList.append({"start": pos, "sense": sense, "weight": weight})
previousHit = pos
statistics = {"index": index,
- "total": total,
+ "total": totalRegionWeight,
"failed": failedCounter
}
bestShift = 0
shiftDict = {}
for region in outregions:
+ # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+ if reportshift:
+ outputList = [region.printRegionWithShift()]
+ if shiftValue == "auto":
+ try:
+ shiftDict[region.shift] += 1
+ except KeyError:
+ shiftDict[region.shift] = 1
+ else:
+ outputList = [region.printRegion()]
+
# iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
if doPvalue:
- sumAll = int(region[5])
+ sumAll = int(region.numReads)
for i in xrange(sumAll):
pValue *= poissonmean
pValue /= i+1
- if shiftValue == "auto" and reportshift:
- try:
- shiftDict[region[-1]] += 1
- except KeyError:
- shiftDict[region[-1]] = 1
-
- try:
- if reportshift:
- outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
- else:
- outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
- except:
- if reportshift:
- outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
- else:
- outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
-
- if doPvalue:
- outputList.append("%1.2g" % pValue)
+ outputList.append("%1.2f" % pValue)
outline = string.join(outputList, "\t")
print outline
print >> outfile, outline
- if shiftValue == "auto" and reportshift:
- bestCount = 0
- for shift in sorted(shiftDict):
- if shiftDict[shift] > bestCount:
- bestShift = shift
- bestCount = shiftDict[shift]
+ bestCount = 0
+ for shift in sorted(shiftDict):
+ if shiftDict[shift] > bestCount:
+ bestShift = shift
+ bestCount = shiftDict[shift]
return bestShift
except:
pass
-from random import random
import sys
+from random import random
+
-print "%s: version 1.0" % sys.argv[0]
+print "fraction: version 1.1"
def main(argv=None):
if not argv:
pass
# originally from version 1.3 of geneDnaDownstreamCounts.py
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
-print "%prog: version 2.0"
+print "geneDownstreamBins: version 2.1"
def main(argv=None):
if not argv:
usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--max", type="int", dest="standardMinDist",
- help="maximum region in bp")
- parser.set_defaults(standardMinDist=3000)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
def makeParser(usage=""):
    """Create the optparse parser used by geneDownstreamBins.

    usage: usage string passed through to optparse.OptionParser.

    The default for --max (stored as standardMinDist) is the "regionSize"
    entry of the "geneDownstreamBins" section of the parser returned by
    getConfigParser(), or 3000 when that entry is absent.

    Returns the configured optparse.OptionParser.
    """
    optionParser = optparse.OptionParser(usage=usage)
    optionParser.add_option("--max", type="int", dest="standardMinDist", help="maximum region in bp")

    # Note the config key is "regionSize" even though the option dest is standardMinDist.
    defaultRegionSize = getConfigIntOption(getConfigParser(), "geneDownstreamBins", "regionSize", 3000)
    optionParser.set_defaults(standardMinDist=defaultRegionSize)

    return optionParser
+
+
def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
bins = 10
standardMinThresh = standardMinDist / bins
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
normalizationFactor = 1.0
if normalize:
hitDictSize = len(hitRDS)
normalizationFactor = hitDictSize / 1000000.
hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
+ geneinfoDict = getGeneInfoDict(genome, cache=True)
hg = Genome(genome)
- idb = geneinfoDB(cache=True)
-
- geneinfoDict = idb.getallGeneInfo(genome)
featuresDict = hg.getallGeneFeatures()
outfile = open(outfilename, "w")
continue
binList = [0.] * bins
- for (tagStart, sense, weight) in hitDict[chrom]:
+ for read in hitDict[chrom]:
+ tagStart = read["start"]
+ weight = read["weight"]
tagStart -= gstart
if tagStart >= glen:
break
except:
pass
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
-print '%s: version 2.1' % sys.argv[0]
+print "geneLocusBins: version 2.2"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--bins", type="int", dest="bins",
- help="number of bins to use [default: 10]")
- parser.add_option("--flank", type="int", dest="flankBP",
- help="number of flanking BP on both upstream and downstream [default: 0]")
- parser.add_option("--upstream", type="int", dest="upstreamBP",
- help="number of upstream flanking BP [default: 0]")
- parser.add_option("--downstream", type="int", dest="downstreamBP",
- help="number of downstream flanking BP [default: 0]")
- parser.add_option("--nocds", action="store_false", dest="doCDS",
- help="do not CDS")
- parser.add_option("--raw", action="store_false", dest="normalizeBins",
- help="do not normalize results")
- parser.add_option("--force", action="store_false", dest="limitNeighbor",
- help="limit neighbor region")
- parser.add_option("--regions", dest="acceptfile")
- parser.add_option("--cache", action="store_true", dest="doCache",
- help="use cache")
- parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
-def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None):
def getParser(usage):
    """Build the command-line parser for geneLocusBins.

    usage: usage string handed to optparse.OptionParser.

    Defaults are looked up in the "geneLocusBins" section of the parser
    returned by getConfigParser(). acceptfile (--regions) gets no explicit
    default here.

    Returns the configured optparse.OptionParser.
    """
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--bins", type="int", dest="bins",
                      help="number of bins to use [default: 10]")
    parser.add_option("--flank", type="int", dest="flankBP",
                      help="number of flanking BP on both upstream and downstream [default: 0]")
    parser.add_option("--upstream", type="int", dest="upstreamBP",
                      help="number of upstream flanking BP [default: 0]")
    parser.add_option("--downstream", type="int", dest="downstreamBP",
                      help="number of downstream flanking BP [default: 0]")
    # --nocds clears doCDS, which geneLocusBins passes on as useCDS.
    parser.add_option("--nocds", action="store_false", dest="doCDS",
                      help="do not use CDS")
    parser.add_option("--raw", action="store_false", dest="normalizeBins",
                      help="do not normalize results")
    # --force clears limitNeighbor, so the help text has to describe turning
    # the neighbor limit off rather than on.
    parser.add_option("--force", action="store_false", dest="limitNeighbor",
                      help="do not limit neighbor region")
    parser.add_option("--regions", dest="acceptfile")
    parser.add_option("--cache", action="store_true", dest="doCache",
                      help="use cache")

    configParser = getConfigParser()
    section = "geneLocusBins"
    normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True)
    doCache = getConfigBoolOption(configParser, section, "doCache", False)
    bins = getConfigIntOption(configParser, section, "bins", 10)
    # NOTE(review): the three flank defaults use getConfigOption (fallback
    # None) while the command-line options are type="int"; presumably a
    # configured value arrives as a string - confirm the caller converts it.
    flankBP = getConfigOption(configParser, section, "flankBP", None)
    upstreamBP = getConfigOption(configParser, section, "upstreamBP", None)
    downstreamBP = getConfigOption(configParser, section, "downstreamBP", None)
    doCDS = getConfigBoolOption(configParser, section, "doCDS", True)
    limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True)

    parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP,
                        upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS,
                        limitNeighbor=limitNeighbor)

    return parser
+
+def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
+ normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
+ acceptfile=None):
+
if acceptfile is None:
acceptDict = {}
else:
acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
- hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
readlen = hitRDS.getReadSize()
normalizationFactor = 1.0
if normalizeBins:
hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
hg = Genome(genome)
- idb = geneinfoDB(cache=doCache)
-
- geneinfoDict = idb.getallGeneInfo(genome)
+ geneinfoDict = getGeneInfoDict(genome, cache=doCache)
if doFlank:
locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
else:
gidList = hg.allGIDs()
gidList.sort()
for chrom in acceptDict:
- for (label, start, stop, length) in acceptDict[chrom]:
- if label not in gidList:
- gidList.append(label)
+ for region in acceptDict[chrom]:
+ if region.label not in gidList:
+ gidList.append(region.label)
(gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
except:
pass
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
-print '%s: version 3.0' % sys.argv[0]
+print "geneLocusCounts: version 3.1"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome readDB outfilename [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
- help="do not count unique reads")
- parser.add_option("--multi", action="store_true", dest="doUniqs",
- help="count multi reads")
- parser.add_option("--splices", action="store_true", dest="doUniqs",
- help="count splice reads")
- parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
- parser.add_option("--regions", dest="acceptfile")
- parser.add_option("--noCDS", action="store_false", dest="useCDS")
- parser.add_option("--locusLength", type="int", dest="bplength",
- help="number of bases to report")
- parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
except ValueError:
pass
- geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
+ geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs,
+ options.doMulti, options.doSplices, options.useCDS, options.spanTSS,
+ options.bplength, options.acceptfile)
-def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
- print 'returning only up to %d bp from gene locus' % bplength
- print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
+def getParser(usage):
+    """Return the optparse parser used by the geneLocusCounts script.
+
+    usage is passed through as the parser's usage string.  The default of
+    every option is looked up in the "geneLocusCounts" section of the object
+    returned by getConfigParser(); the last argument of each getConfig*Option
+    call is presumably the fallback used when nothing is configured.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
+                      help="do not count unique reads")
+    # each flag needs its own dest: pointing --multi/--splices at doUniqs
+    # leaves doMulti and doSplices permanently at their defaults
+    parser.add_option("--multi", action="store_true", dest="doMulti",
+                      help="count multi reads")
+    parser.add_option("--splices", action="store_true", dest="doSplices",
+                      help="count splice reads")
+    parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--noCDS", action="store_false", dest="useCDS")
+    parser.add_option("--locusLength", type="int", dest="bplength",
+                      help="number of bases to report")
+
+    configParser = getConfigParser()
+    section = "geneLocusCounts"
+    doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+    doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    useCDS = getConfigBoolOption(configParser, section, "useCDS", True)
+    spanTSS = getConfigBoolOption(configParser, section, "spanTSS", False)
+    bplength = getConfigIntOption(configParser, section, "bplength", 0)
+    acceptfile = getConfigOption(configParser, section, "acceptfile", "")
+
+    parser.set_defaults(doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices,
+                        useCDS=useCDS, spanTSS=spanTSS, bplength=bplength,
+                        acceptfile=acceptfile)
+
+    return parser
+
+
+def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0,
+ doUniqs=True, doMulti=False, doSplices=False, useCDS=True,
+ spanTSS=False, bplength=0, acceptfile=""):
+
+ print "returning only up to %d bp from gene locus" % bplength
+ print "upstream = %d downstream = %d useCDS = %s spanTSS = %s" % (upstream, downstream, useCDS, spanTSS)
if acceptfile:
acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
- hitRDS = readDataset(hitfile, verbose = True)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True)
totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
hg = Genome(genome)
- idb = geneinfoDB(cache=True)
-
- gidCount = {}
- gidList = []
- gidLen = {}
- geneinfoDict = idb.getallGeneInfo(genome)
- locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
-
+ gidDict = {}
+ locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS=spanTSS, lengthCDS=bplength)
locusChroms = locusByChromDict.keys()
chromList = hitRDS.getChromosomes(fullChrom=False)
chromList.sort()
for chrom in chromList:
- if chrom == 'M' or chrom not in locusChroms:
+ if doNotProcessChromosome(chrom, locusChroms):
continue
- print 'chr' + chrom
- fullchrom = 'chr' + chrom
+ fullchrom = "chr%s" % chrom
+ print fullchrom
hitRDS.memSync(fullchrom, index=True)
for (start, stop, gid, length) in locusByChromDict[chrom]:
- if gid not in gidList:
- gidList.append(gid)
- gidCount[gid] = 0
- gidLen[gid] = length
+ if not gidDict.has_key(gid):
+ gidDict[gid] = {"count": 0, "length": length}
- gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+ gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
- outfile = open(outfilename,'w')
+ outfile = open(outfilename, "w")
totalCount /= 1000000.
- outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
+ outfile.write("#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n")
+ gidList = gidDict.keys()
gidList.sort()
+ geneinfoDict = getGeneInfoDict(genome, cache=True)
for gid in gidList:
- if 'FAR' not in gid:
- symbol = 'LOC' + gid
- geneinfo = ''
+ if "FAR" not in gid:
+ symbol = "LOC%s" % gid
+ geneinfo = ""
try:
geneinfo = geneinfoDict[gid]
symbol = geneinfo[0][0]
- except:
+ except (KeyError, IndexError):
pass
else:
symbol = gid
- if gid in gidCount and gid in gidLen:
- rpm = gidCount[gid] / totalCount
- rpkm = 1000. * rpm / gidLen[gid]
- outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
+ gidCount = gidDict[gid]["count"]
+ gidLength = gidDict[gid]["length"]
+ rpm = gidCount / totalCount
+ rpkm = 1000. * rpm / gidLength
+ outfile.write("%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n" % (gid, symbol, gidCount, gidLength, rpm, rpkm))
outfile.close()
+
+def doNotProcessChromosome(chrom, locusChroms):
+    """Return True when chrom is "M" or is absent from locusChroms."""
+    return not (chrom != "M" and chrom in locusChroms)
+
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file
except:
pass
-from commoncode import readDataset, getMergedRegions, findPeak, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getLocusByChromDict
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
-import sys, optparse
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-print "%prog: version 2.0"
+
+print "geneLocusPeaks: version 2.1"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--up", type="int", dest="upstream")
- parser.add_option("--down", type="int", dest="downstream")
- parser.add_option("--regions", dest="acceptfile")
- parser.add_option("--raw", action="store_false", dest="normalize")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.set_defaults(upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
+def makeParser(usage=""):
+    """Return the optparse parser used by the geneLocusPeaks script.
+
+    Option defaults are looked up in the "geneLocusPeaks" section of the
+    object returned by getConfigParser().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--up", type="int", dest="upstream")
+    parser.add_option("--down", type="int", dest="downstream")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+
+    config = getConfigParser()
+    sectionName = "geneLocusPeaks"
+    defaults = {}
+    defaults["upstream"] = getConfigIntOption(config, sectionName, "upstream", 0)
+    defaults["downstream"] = getConfigIntOption(config, sectionName, "downstream", 0)
+    defaults["acceptfile"] = getConfigOption(config, sectionName, "acceptfile", "")
+    defaults["normalize"] = getConfigBoolOption(config, sectionName, "normalize", True)
+    defaults["doCache"] = getConfigBoolOption(config, sectionName, "doCache", False)
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
acceptDict = {}
print "upstream = %d downstream = %d" % (upstream, downstream)
- hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
readlen = hitRDS.getReadSize()
normalizationFactor = 1.0
if normalize:
hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
hg = Genome(genome)
- idb = geneinfoDB(cache=True)
-
gidCount = {}
gidPos = {}
- geneinfoDict = idb.getallGeneInfo(genome)
+ geneinfoDict = getGeneInfoDict(genome, cache=True)
locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
gidList = hg.allGIDs()
gidList.sort()
for chrom in acceptDict:
- for (label, start, stop, length) in acceptDict[chrom]:
- if label not in gidList:
- gidList.append(label)
+ for region in acceptDict[chrom]:
+ if region.label not in gidList:
+ gidList.append(region.label)
for gid in gidList:
gidCount[gid] = 0
print chrom
for (start, stop, gid, glen) in locusByChromDict[chrom]:
gidCount[gid] = 0.
- (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[chrom], start, glen, readlen)
- if len(topPos) > 0:
- gidCount[gid] = smoothArray[topPos[0]]
- gidPos[gid] = (chrom, start + topPos[0])
+ peak = findPeak(hitDict[chrom], start, glen, readlen)
+ if len(peak.topPos) > 0:
+ gidCount[gid] = peak.smoothArray[peak.topPos[0]]
+ gidPos[gid] = (chrom, start + peak.topPos[0])
else:
gidPos[gid] = (chrom, start)
import sys
import optparse
-from commoncode import readDataset, getFeaturesByChromDict
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
from cistematic.genomes import Genome
from cistematic.core.geneinfo import geneinfoDB
-print "%s: version 5.1" % sys.argv[0]
+print "geneMrnaCounts: version 5.2"
def main(argv=None):
usage = "usage: python %prog genome rdsfile outfilename [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--stranded", action="store_true", dest="trackStrand")
- parser.add_option("--splices", action="store_true", dest="doSplices")
- parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
- parser.add_option("--multi", action="store_true", dest="doMulti")
- parser.add_option("--models", dest="extendGenome")
- parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
- parser.add_option("--searchGID", action="store_true", dest="searchGID")
- parser.add_option("--countfeatures", action="store_true", dest="countFeats")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--markGID", action="store_true", dest="markGID")
- parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
- extendGenome="", replaceModels=False, searchGID=False,
- countFeats=False, cachePages=None, markGID=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.searchGID, options.countFeats, options.cachePages, options.markGID)
+def getParser(usage):
+    """Return the optparse parser used by the geneMrnaCounts script.
+
+    Option defaults are looked up in the "geneMrnaCounts" section of the
+    object returned by getConfigParser().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_true", dest="trackStrand")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
+    parser.add_option("--multi", action="store_true", dest="doMulti")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.add_option("--searchGID", action="store_true", dest="searchGID")
+    parser.add_option("--countfeatures", action="store_true", dest="countFeats")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--markGID", action="store_true", dest="markGID")
+
+    config = getConfigParser()
+    sectionName = "geneMrnaCounts"
+    defaults = {}
+    defaults["trackStrand"] = getConfigBoolOption(config, sectionName, "trackStrand", False)
+    defaults["doSplices"] = getConfigBoolOption(config, sectionName, "doSplices", False)
+    defaults["doUniqs"] = getConfigBoolOption(config, sectionName, "doUniqs", True)
+    defaults["doMulti"] = getConfigBoolOption(config, sectionName, "doMulti", False)
+    defaults["extendGenome"] = getConfigOption(config, sectionName, "extendGenome", "")
+    defaults["replaceModels"] = getConfigBoolOption(config, sectionName, "replaceModels", False)
+    defaults["searchGID"] = getConfigBoolOption(config, sectionName, "searchGID", False)
+    defaults["countFeats"] = getConfigBoolOption(config, sectionName, "countFeats", False)
+    # NOTE(review): --cache is type="int" but this default is read with
+    # getConfigOption; presumably a configured value arrives as a string - confirm
+    defaults["cachePages"] = getConfigOption(config, sectionName, "cachePages", None)
+    defaults["markGID"] = getConfigBoolOption(config, sectionName, "markGID", False)
+    parser.set_defaults(**defaults)
+
+    return parser
+
def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
searchGID=False, countFeats=False, cachePages=None, markGID=False):
cachePages = 100000
doCache = False
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
except:
print 'psyco not running'
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-print '%s: version 4.1' % sys.argv[0]
+print "geneMrnaCountsWeighted: version 4.3"
def main(argv=None):
usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--stranded", action="store_false", dest="ignoreSense")
- parser.add_option("--uniq", action="store_true", dest="withUniqs")
- parser.add_option("--multi", action="store_true", dest="withMulti")
- parser.add_option("--record", action="store_true", dest="recording",
- help="ignored with uniq reads")
- parser.add_option("--accept", dest="acceptfile")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--verbose", action="store_true", dest="doVerbose")
- parser.add_option("--models", dest="extendGenome")
- parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
- parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
- acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
- replaceModels=False)
-
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
outfilename = args[3]
geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
- options.withUniqs, options.withMulti, options.recording,
+ options.withUniqs, options.withMulti,
options.acceptfile, options.cachePages, options.doVerbose,
options.extendGenome, options.replaceModels)
+def makeParser(usage=""):
+    """Return the optparse parser used by the geneMrnaCountsWeighted script.
+
+    Option defaults are looked up in the "geneMrnaCountsWeighted" section of
+    the object returned by getConfigParser().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_false", dest="ignoreSense")
+    parser.add_option("--uniq", action="store_true", dest="withUniqs")
+    parser.add_option("--multi", action="store_true", dest="withMulti")
+    parser.add_option("--accept", dest="acceptfile")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+
+    config = getConfigParser()
+    sectionName = "geneMrnaCountsWeighted"
+    defaults = {}
+    defaults["ignoreSense"] = getConfigBoolOption(config, sectionName, "ignoreSense", True)
+    defaults["withUniqs"] = getConfigBoolOption(config, sectionName, "withUniqs", False)
+    defaults["withMulti"] = getConfigBoolOption(config, sectionName, "withMulti", False)
+    defaults["acceptfile"] = getConfigOption(config, sectionName, "acceptfile", None)
+    # NOTE(review): --cache is type="int" but this default is read with
+    # getConfigOption; presumably a configured value arrives as a string - confirm
+    defaults["cachePages"] = getConfigOption(config, sectionName, "cachePages", None)
+    defaults["doVerbose"] = getConfigBoolOption(config, sectionName, "doVerbose", False)
+    defaults["extendGenome"] = getConfigOption(config, sectionName, "extendGenome", "")
+    defaults["replaceModels"] = getConfigBoolOption(config, sectionName, "replaceModels", False)
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
+#TODO: Reported user performance issue. Long run times under these conditions:
+# small number of reads (~40-50M)
+# all features on a single chromosome
+#
+# The user states this has been a long-standing problem.
+
def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
- withUniqs=False, withMulti=False, recording=False, acceptfile=None,
+ withUniqs=False, withMulti=False, acceptfile=None,
cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
if (not withUniqs and not withMulti) or (withUniqs and withMulti):
if cachePages is not None:
cacheGeneDB(genome)
hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
- idb = geneinfoDB(cache=True)
print "%s cached" % genome
doCache = True
else:
doCache = False
cachePages = 0
hg = Genome(genome, inRAM=True)
- idb = geneinfoDB()
-
- if acceptfile is not None:
- acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
- else:
- acceptDict = {}
-
- if recording and withUniqs:
- recording = False
if extendGenome:
if replaceModels:
print "will replace gene models with %s" % extendGenome
else:
print "will extend gene models with %s" % extendGenome
- else:
- replaceModels = False
- if extendGenome != "":
- hg.extendFeatures(extendGenome, replace = replaceModels)
-
- hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ hg.extendFeatures(extendGenome, replace=replaceModels)
+
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
- readlen = hitRDS.getReadSize()
-
- geneinfoDict = idb.getallGeneInfo(genome)
- geneannotDict = hg.allAnnotInfo()
- gidCount = {}
- gidReadDict = {}
-
- featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
- gidList = hg.allGIDs()
+ allGIDs = set(hg.allGIDs())
+ if acceptfile is not None:
+ regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
+ for chrom in regionDict:
+ for region in regionDict[chrom]:
+ allGIDs.add(region.label)
+ else:
+ regionDict = {}
- gidList.sort()
- for chrom in acceptDict:
- for (label, start, stop, length) in acceptDict[chrom]:
- if label not in gidList:
- gidList.append(label)
+ featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
- for gid in gidList:
- gidCount[gid] = 0
- gidReadDict[gid] = []
-
- uniqueCountDict = {}
+ gidReadDict = {}
read2GidDict = {}
-
- uniquecounts = open(countfile)
- for line in uniquecounts:
- fields = line.strip().split()
- # add a pseudo-count here to ease calculations below
- uniqueCountDict[fields[0]] = float(fields[-1]) + 1
-
- uniquecounts.close()
-
- outfile = open(outfilename, "w")
+ for gid in allGIDs:
+ gidReadDict[gid] = []
index = 0
if withMulti and not withUniqs:
else:
chromList = hitRDS.getChromosomes(fullChrom=False)
- for achrom in chromList:
- if achrom not in featuresByChromDict:
+ readlen = hitRDS.getReadSize()
+ for chromosome in chromList:
+ if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
continue
- print "\n" + achrom + " ",
- startFeature = 0
- fullchrom = "chr" + achrom
+ print "\n%s " % chromosome,
+ fullchrom = "chr%s" % chromosome
hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
- featList = featuresByChromDict[achrom]
- if ignoreSense:
- for (tagStart, tagReadID) in hitDict[fullchrom]:
- index += 1
- if index % 100000 == 0:
- print "read %d" % index,
-
- stopPoint = tagStart + readlen
- if startFeature < 0:
- startFeature = 0
-
- for (start, stop, gid, sense, ftype) in featList[startFeature:]:
- if tagStart > stop:
- startFeature += 1
- continue
-
- if start > stopPoint:
- startFeature -= 100
- break
-
- if start <= tagStart <= stop:
- try:
- gidReadDict[gid].append(tagReadID)
- if tagReadID in read2GidDict:
- if gid not in read2GidDict[tagReadID]:
- read2GidDict[tagReadID].append(gid)
- else:
- read2GidDict[tagReadID] = [gid]
-
- gidCount[gid] += 1
- except:
- print "gid %s not in gidReadDict" % gid
-
- stopPoint = stop
- else:
- for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
- index += 1
- if index % 100000 == 0:
- print "read %d" % index,
-
- stopPoint = tagStart + readlen
- if startFeature < 0:
- startFeature = 0
-
- for (start, stop, gid, sense, ftype) in featList[startFeature:]:
- if tagStart > stop:
- startFeature += 1
- continue
-
- if start > stopPoint:
- startFeature -= 100
- break
-
- if sense == "R":
- sense = "-"
- else:
- sense = "+"
-
- if start <= tagStart <= stop and sense == tSense:
- try:
- gidReadDict[gid].append(tagReadID)
- if tagReadID in read2GidDict:
- if gid not in read2GidDict[tagReadID]:
- read2GidDict[tagReadID].append(gid)
- else:
- read2GidDict[tagReadID] = [gid]
-
- gidCount[gid] += 1
- except:
- print "gid %s not in gidReadDict" % gid
-
- stopPoint = stop
-
- for gid in gidList:
- if "FAR" not in gid:
- symbol = "LOC" + gid
- geneinfo = ""
+ featureList = featuresByChromDict[chromosome]
+
+ readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+ index = totalProcessedReads
+ for (tagReadID, gid) in readGidList:
try:
- geneinfo = geneinfoDict[gid]
- if genome == "celegans":
- symbol = geneinfo[0][1]
+ gidReadDict[gid].append(tagReadID)
+ if tagReadID in read2GidDict:
+ read2GidDict[tagReadID].add(gid)
else:
- symbol = geneinfo[0][0]
- except:
- try:
- symbol = geneannotDict[(genome, gid)][0]
- except:
- symbol = "LOC" + gid
- else:
- symbol = gid
+ read2GidDict[tagReadID] = set([gid])
+ except KeyError:
+ print "gid %s not in gidReadDict" % gid
- tagCount = 0.
- for readID in gidReadDict[gid]:
- try:
- tagValue = uniqueCountDict[gid]
- except:
- tagValue = 1
+ writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
+ if doCache:
+ uncacheGeneDB(genome)
- tagDenom = 0.
- for aGid in read2GidDict[readID]:
- try:
- tagDenom += uniqueCountDict[aGid]
- except:
- tagDenom += 1
- try:
- tagCount += tagValue / tagDenom
- except ZeroDivisionError:
- tagCount = 0
-
+def doNotProcessChromosome(chromosome, chromosomeList):
+    """Return True when chromosome is absent from chromosomeList."""
+    if chromosome in chromosomeList:
+        return False
+
+    return True
+
+
+def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+ # Pair the reads of hitDict[fullchrom] with the features of featList they start in.
+ #
+ # Each read is indexed by "start" and "readID", plus "sense" when that key is present.
+ # Each featList entry unpacks as (start, stop, gid, sense, ftype).
+ # index is the running count of reads handled so far; it grows by one per read.
+ # Returns (readGidList, index), where readGidList holds (readID, gid) tuples.
+ #
+ # NOTE(review): the startFeature bookkeeping presumably relies on both the reads and
+ # featList being ordered by position - confirm against the callers.
+
+ startFeature = 0
+ readGidList = []
+ # stays True until a read carrying a "sense" key is seen; it is never switched back
+ ignoreSense = True
+ for read in hitDict[fullchrom]:
+ tagStart = read["start"]
+ tagReadID = read["readID"]
+ if read.has_key("sense"):
+ tagSense = read["sense"]
+ ignoreSense = False
+
+ index += 1
+ # progress output every 100000 reads
+ if index % 100000 == 0:
+ print "read %d" % index,
+
+ stopPoint = tagStart + readlen
+ # startFeature can go negative after the rewind below, so clamp it before slicing
+ if startFeature < 0:
+ startFeature = 0
+
+ for (start, stop, gid, sense, ftype) in featList[startFeature:]:
+ # feature ends before this read starts: begin later in featList from now on
+ if tagStart > stop:
+ startFeature += 1
+ continue
+
+ # feature starts past stopPoint: stop scanning and back the start index up by 100
+ if start > stopPoint:
+ startFeature -= 100
+ break
+
+ # feature sense "R" is compared as "-", anything else as "+"
+ if not ignoreSense:
+ if sense == "R":
+ sense = "-"
+ else:
+ sense = "+"
+
+ # a read is assigned when its start lies within [start, stop] and, if sense is tracked, the senses agree
+ if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
+ readGidList.append((tagReadID, gid))
+ stopPoint = stop
+
+ return readGidList, index
+
+
+def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
+
+ uniqueCountDict = {}
+ uniquecounts = open(countFilename)
+ for line in uniquecounts:
+ fields = line.strip().split()
+ # add a pseudo-count here to ease calculations below
+ #TODO: figure out why this was done in prior implementation...
+ uniqueCountDict[fields[0]] = float(fields[-1]) + 1
+
+ uniquecounts.close()
+
+ genomeName = genome.genome
+ geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
+ geneannotDict = genome.allAnnotInfo()
+ outfile = open(outFilename, "w")
+ for gid in allGIDs:
+ symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
+ tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
if doVerbose:
print "%s %s %f" % (gid, symbol, tagCount)
outfile.close()
- if doCache:
- uncacheGeneDB(genome)
+
+def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
+    """Return the symbol to report for gid.
+
+    An identifier containing "FAR" is returned unchanged.  Otherwise the
+    symbol is taken from geneinfoDict[gid][0], then from
+    geneannotDict[(genomeName, gid)][0], and finally falls back to "LOC"
+    followed by gid.
+    """
+    if "FAR" in gid:
+        return gid
+
+    # "celegans" reads the second field of the first entry; every other
+    # genome reads the first field
+    fieldIndex = 0
+    if genomeName == "celegans":
+        fieldIndex = 1
+
+    try:
+        return geneinfoDict[gid][0][fieldIndex]
+    except (KeyError, IndexError):
+        pass
+
+    try:
+        return geneannotDict[(genomeName, gid)][0]
+    except (KeyError, IndexError):
+        return "LOC%s" % gid
+
+
+def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
+    """Return the weighted read count for gid.
+
+    Every read in gidReadDict[gid] adds the count of gid divided by the summed
+    counts of all gids listed for that read in read2GidDict.  A gid missing
+    from uniqueCountDict counts as 1.
+    """
+    total = 0.
+    for readID in gidReadDict[gid]:
+        numerator = uniqueCountDict.get(gid, 1)
+
+        denominator = 0.
+        for otherGid in read2GidDict[readID]:
+            denominator += uniqueCountDict.get(otherGid, 1)
+
+        try:
+            total += numerator / denominator
+        except ZeroDivisionError:
+            # a zero denominator leaves the total untouched for this read
+            pass
+
+    return total
if __name__ == "__main__":
except:
pass
-import sys, optparse
-from commoncode import getMergedRegions, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigIntOption, getConfigBoolOption, getConfigOption
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
-print "%prog: version 2.4"
+# plain literal: the string has no conversion specifier, so applying % to it raises TypeError
+print "geneNeighbors: version 2.5"
def main(argv=None):
usage = "usage: python %prog genome outfilename [--regions acceptfile] [--downstream bp] [--upstream bp] [--mindist bp] [--minlocus bp] [--maxlocus bp] [--samesense]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--regions", dest="acceptFile")
- parser.add_option("--downstream", type="int", dest="downMax")
- parser.add_option("--upstream", type="int", dest="upMax")
- parser.add_option("--mindist", type="int", dest="minDist")
- parser.add_option("--minlocus", type="int", dest="minLocus")
- parser.add_option("--maxlocus", type="int", dest="maxLocus")
- parser.add_option("--samesense", action="store_true", dest="checkSense")
- parser.set_defaults(acceptfile="", checkSense=False, downMax=10000000,
- upMax=10000000, minDist=0, minLocus=-1, maxLocus=10000000)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
print "\n%d genes matched" % index
+def getParser(usage):
+    """Return the optparse parser used by the geneNeighbors script.
+
+    Option defaults are looked up in the "geneNeighbors" section of the
+    object returned by getConfigParser().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--regions", dest="acceptFile")
+    parser.add_option("--downstream", type="int", dest="downMax")
+    parser.add_option("--upstream", type="int", dest="upMax")
+    parser.add_option("--mindist", type="int", dest="minDist")
+    parser.add_option("--minlocus", type="int", dest="minLocus")
+    parser.add_option("--maxlocus", type="int", dest="maxLocus")
+    parser.add_option("--samesense", action="store_true", dest="checkSense")
+
+    configParser = getConfigParser()
+    section = "geneNeighbors"
+    acceptfile = getConfigOption(configParser, section, "acceptfile", "")
+    checkSense = getConfigBoolOption(configParser, section, "checkSense", False)
+    downMax = getConfigIntOption(configParser, section, "downMax", 10000000)
+    upMax = getConfigIntOption(configParser, section, "upMax", 10000000)
+    minDist = getConfigIntOption(configParser, section, "minDist", 0)
+    minLocus = getConfigIntOption(configParser, section, "minLocus", -1)
+    maxLocus = getConfigIntOption(configParser, section, "maxLocus", 10000000)
+
+    # --regions stores into "acceptFile", so the configured default has to be
+    # set under that spelling too; the lower-case "acceptfile" default is kept
+    # for any caller that reads that attribute
+    parser.set_defaults(acceptFile=acceptfile, acceptfile=acceptfile, checkSense=checkSense,
+                        downMax=downMax, upMax=upMax, minDist=minDist, minLocus=minLocus,
+                        maxLocus=maxLocus)
+
+    return parser
+
+
def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False,
downMax=10000000, upMax=10000000, minDist=0, minLocus=-1,
maxLocus=10000000):
acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
hg = Genome(genome)
- idb = geneinfoDB(cache=True)
-
- geneinfoDict = idb.getallGeneInfo(genome)
+ geneinfoDict = getGeneInfoDict(genome, cache=True)
locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
gidList = hg.allGIDs()
gidList.sort()
for chrom in acceptDict:
- for (label, start, stop, length) in acceptDict[chrom]:
- if label not in gidList:
- gidList.append(label)
+ for region in acceptDict[chrom]:
+ if region.label not in gidList:
+ gidList.append(region.label)
index = 0
outfile = open(outfilename,"w")
except:
pass
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, computeRegionBins, getLocusByChromDict, getConfigParser, getConfigBoolOption, getConfigIntOption, getConfigOption
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
-print "%prog: version 1.3"
+print "geneStallingBins: version 1.4"
def main(argv=None):
usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--upstream", type="int", dest="upstreamBp")
- parser.add_option("--downstream", type="int", dest="downstreamBp")
- parser.add_option("--regions", dest="acceptfile")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--normalize", action="store_true", dest="normalize")
- parser.add_option("--tagCount", action="store_true", dest="doTagCount")
- parser.add_option("--bins", type="int", dest="bins")
- parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="",
- doCache=False, normalize=False, doTagCount=False, bins=4)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
options.normalize, options.doTagCount, options.bins)
+def getParser(usage):
+    """Return the optparse parser used by the geneStallingBins script.
+
+    Option defaults are looked up in the "geneStallingBins" section of the
+    object returned by getConfigParser().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--upstream", type="int", dest="upstreamBp")
+    parser.add_option("--downstream", type="int", dest="downstreamBp")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--normalize", action="store_true", dest="normalize")
+    parser.add_option("--tagCount", action="store_true", dest="doTagCount")
+    parser.add_option("--bins", type="int", dest="bins")
+
+    config = getConfigParser()
+    sectionName = "geneStallingBins"
+    defaults = {}
+    defaults["upstreamBp"] = getConfigIntOption(config, sectionName, "upstreamBp", 300)
+    defaults["downstreamBp"] = getConfigIntOption(config, sectionName, "downstreamBp", 0)
+    defaults["acceptfile"] = getConfigOption(config, sectionName, "acceptfile", "")
+    defaults["doCache"] = getConfigBoolOption(config, sectionName, "doCache", False)
+    defaults["normalize"] = getConfigBoolOption(config, sectionName, "normalize", False)
+    defaults["doTagCount"] = getConfigBoolOption(config, sectionName, "doTagCount", False)
+    defaults["bins"] = getConfigIntOption(config, sectionName, "bins", 4)
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
downstreamBp=0, acceptfile="", doCache=False, normalize=False,
doTagCount=False, bins=4):
doCDS = True
limitNeighbor = False
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
readlen = hitRDS.getReadSize()
hitNormalizationFactor = 1.0
if normalize:
hitDictSize = len(hitRDS)
hitNormalizationFactor = hitDictSize / 1000000.
- controlRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ controlRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
controlNormalizationFactor = 1.0
if normalize:
controlDictSize = len(hitRDS)
controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
hg = Genome(genome)
- idb = geneinfoDB(cache=doCache)
-
- geneinfoDict = idb.getallGeneInfo(genome)
+ geneinfoDict = getGeneInfoDict(genome, cache=doCache)
locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
gidList = hg.allGIDs()
gidList.sort()
for chrom in acceptDict:
- for (label, start, stop, length) in acceptDict[chrom]:
- if label not in gidList:
- gidList.append(label)
+ for region in acceptDict[chrom]:
+ if region.label not in gidList:
+ gidList.append(region.label)
(gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
(controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
except:
pass
-# originally from version 1.3 of geneDownstreamBins.py
+import sys
from commoncode import *
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
-import sys
-print '%s: version 2.0' % sys.argv[0]
+
+print "geneStartBins: version 2.1"
if len(sys.argv) < 4:
print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
print '\n\twhere regionSize is the optional maximum region in bp\n'
bins = 10
standardMinThresh = standardMinDist / bins
-hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
readlen = hitRDS.getReadSize()
normalizationFactor = 1.0
if normalize:
normalizationFactor = totalCount / 1000000.
hg = Genome(genome)
-idb = geneinfoDB(cache=True)
-
gidDict = {}
-geneinfoDict = idb.getallGeneInfo(genome)
+geneinfoDict = getGeneInfoDict(genome, cache=True)
featuresDict = hg.getallGeneFeatures()
-#infile = open(infilename)
outfile = open(outfilename,'w')
gidList = hg.allGIDs()
symbol = geneinfo[0][0]
except:
print geneinfo
+
newfeatureList = []
if len(featureList) == 0:
continue
+
for (ftype, chrom, start, stop, fsense) in featureList:
if (start, stop) not in newfeatureList:
newfeatureList.append((start, stop))
+
if chrom not in hitDict:
continue
+
newfeatureList.sort()
if len(newfeatureList) < 1:
- #print '%s %s %d' % (gid, symbol, -1)
- #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1))
continue
+
glen = standardMinDist / 2
if fsense == 'F':
nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
if nextGene < glen * 2:
- glen = nextGene / 2
+ glen = nextGene / 2
+
if glen < 1:
- glen = 1
+ glen = 1
+
gstart = newfeatureList[0][0] - glen
if gstart < 0:
- gstart = 0
+ gstart = 0
+
gstop = newfeatureList[0][0] + glen
else:
nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
if nextGene < glen * 2:
glen = nextGene / 2
+
if glen < 1:
glen = 1
+
gstart = newfeatureList[-1][1] - glen
gstop = newfeatureList[-1][1] + glen
tagCount = 0
if glen < standardMinDist / 2:
continue
+
binList = [0] * bins
- for (tagStart, sense, weight) in hitDict[chrom]:
- tagStart -= gstart
+ for read in hitDict[chrom]:
+ tagStart = read["start"] - gstart
+ sense = read["sense"]
+ weight = read["weight"]
if tagStart >= 2 * glen:
break
+
if tagStart > 0:
tagCount += weight
if fsense == 'R':
rdist = 2 * glen - tagStart
binID = rdist / standardMinThresh
binList[binID] += weight
+
if tagCount < 2:
continue
+
print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
for binAmount in binList:
outfile.write('\t%d' % binAmount)
+
outfile.write('\n')
-#infile.close()
+
outfile.close()
except:
pass
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
-print "%prog: version 2.0"
+print "geneUpstreamBins: version 2.1"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--raw", action="store_false", dest="normalize",
- help="maximum region in bp")
- parser.add_option("--max", type="int", dest="standardMinDist")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
+def makeParser(usage=""):
+    """Build the geneUpstreamBins command-line parser.
+
+    Defaults for --max, --raw and --cache are looked up in the
+    "geneUpstreamBins" configuration section through the commoncode
+    getConfig*Option helpers.  The literals passed as each helper's last
+    argument (3000, True, False) are the same values used as keyword
+    defaults by geneUpstreamBins().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    # --raw turns normalization off (store_false on "normalize").  The text
+    # "maximum region in bp" describes --max, so it is attached to that
+    # option rather than to --raw.
+    parser.add_option("--raw", action="store_false", dest="normalize",
+                      help="do not normalize")
+    parser.add_option("--max", type="int", dest="standardMinDist",
+                      help="maximum region in bp")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+
+    configParser = getConfigParser()
+    section = "geneUpstreamBins"
+    # Note: the configuration key for standardMinDist is "regionSize".
+    standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+
+    parser.set_defaults(standardMinDist=standardMinDist, normalize=normalize, doCache=doCache)
+
+    return parser
+
+
def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
bins = 10
standardMinThresh = standardMinDist / bins
- hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
normalizationFactor = 1.0
if normalize:
totalCount = len(hitRDS)
hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
hg = Genome(genome)
- idb = geneinfoDB(cache=True)
-
- geneinfoDict = idb.getallGeneInfo(genome)
+ geneinfoDict = getGeneInfoDict(genome, cache=True)
featuresDict = hg.getallGeneFeatures()
outfile = open(outfilename,"w")
continue
binList = [0] * bins
- for (tagStart, sense, weight) in hitDict[chrom]:
+ for read in hitDict[chrom]:
+ tagStart = read["start"]
+ weight = read["weight"]
tagStart -= gstart
if tagStart >= glen:
break
-import sys, optparse
+import sys
+import optparse
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigBoolOption
-print "%prog: version 3.1"
+print "getGOgenes: version 3.2"
def main(argv=None):
if not argv:
usage = "usage: python %s genome GOID1 [GOID2 ....] [--outfile outfilename] [--append] [--restrict genefile]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--outfile", dest="outfilename")
- parser.add_option("--append", action="store_true", dest="append")
- parser.add_option("--restrict", dest="restrictfilename")
- parser.set_defaults(outfilename=None, restrictfilename=None, append=False)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
getGOgenes(genome, GOIDlist, options.outfilename, options.restrictfilename, options.append)
+def makeParser(usage=""):
+    """Return the optparse parser used by getGOgenes.
+
+    The defaults for --outfile, --restrict and --append are taken from the
+    "getGOgenes" configuration section via the commoncode getConfig*Option
+    helpers, which are handed None, None and False as their last argument.
+    """
+    cliParser = optparse.OptionParser(usage=usage)
+    cliParser.add_option("--outfile", dest="outfilename")
+    cliParser.add_option("--append", action="store_true", dest="append")
+    cliParser.add_option("--restrict", dest="restrictfilename")
+
+    config = getConfigParser()
+    sectionName = "getGOgenes"
+    defaults = {}
+    for name in ("outfilename", "restrictfilename"):
+        defaults[name] = getConfigOption(config, sectionName, name, None)
+
+    defaults["append"] = getConfigBoolOption(config, sectionName, "append", False)
+    cliParser.set_defaults(**defaults)
+
+    return cliParser
+
+
def getGOgenes(genome, GOIDlist, outfilename=None, restrictfilename=None, append=False):
writeOut = False
if outfilename is not None:
restrict = True
hg = Genome(genome)
- idb = geneinfoDB()
print sys.argv
print GOIDlist
geneList = geneDict.keys()
print len(geneList)
- geneInfoList = idb.getallGeneInfo(genome)
+ geneInfoList = getGeneInfoDict(genome)
if writeOut:
if append:
from cistematic.genomes import Genome
from commoncode import writeLog
-print "%prog: version 1.5"
+print "getNovelSNPs: version 1.6"
try:
import psyco
import optparse
import string
from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
-print "%prog: version 4.5"
+print "getSNPGeneInfo: version 4.6"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--cache", action="store_true", dest="cachePages")
- parser.add_option("--withoutsense", action="store_false", dest="withSense")
- parser.add_option("--flank", type="int", dest="flankBP")
- parser.set_defaults(doCache=False, withSense=True, flankBP=0)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
+def makeParser(usage=""):
+    """Build the getSNPGeneInfo command-line parser.
+
+    Defaults for doCache, withSense and flankBP are looked up in the
+    "getSNPGeneInfo" configuration section through the commoncode
+    getConfig*Option helpers (last helper argument: False, True, 0).
+    """
+    parser = optparse.OptionParser(usage=usage)
+    # --cache has to store into "doCache": that is the attribute that
+    # receives a default below and the one main() passes on
+    # (options.doCache).  Storing into "cachePages" left the flag without
+    # any effect.
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--withoutsense", action="store_false", dest="withSense")
+    parser.add_option("--flank", type="int", dest="flankBP")
+
+    configParser = getConfigParser()
+    section = "getSNPGeneInfo"
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    withSense = getConfigBoolOption(configParser, section, "withSense", True)
+    flankBP = getConfigIntOption(configParser, section, "flankBP", 0)
+
+    parser.set_defaults(doCache=doCache, withSense=withSense, flankBP=flankBP)
+
+    return parser
+
+
def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
if doCache:
cacheGeneDB(genome)
- idb = geneinfoDB(cache=True)
print "cached %s" % genome
- else:
- idb = geneinfoDB()
- geneinfoDict = idb.getallGeneInfo(genome)
+ geneinfoDict = getGeneInfoDict(genome, cache=doCache)
geneDict = {}
if flankBP > 0:
totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
"""
-import sys, optparse
-from commoncode import readDataset, writeLog
+import sys
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption
+import ReadDataset
-print "%prog: version 3.5"
+print "getSNPs: version 3.6"
try:
import psyco
usage = __doc__
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--nosplices", action="store_false", dest="doSplices")
- parser.add_option("--enforceChr", action="store_true", dest="forceChr")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
+def makeParser(usage=""):
+    """Build the getSNPs command-line parser.
+
+    Defaults for doSplices, forceChr and cachePages are looked up in the
+    "getSNPs" configuration section through the commoncode
+    getConfig*Option helpers (last helper argument: True, False, 0).
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nosplices", action="store_false", dest="doSplices")
+    parser.add_option("--enforceChr", action="store_true", dest="forceChr")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "getSNPs"
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", True)
+    forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
+
+    # Hand the configured values to the parser.  Passing the literals
+    # True/False/0 here discarded everything read from the configuration.
+    parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
+
+    return parser
+
+
def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
if cachePages > 20000:
hitRDS.setDBcache(cachePages)
except:
readDict[chrom] = []
- for (start, stop) in readDict[chrom]:
+ for read in readDict[chrom]:
+ start = read["start"]
+ stop = read["stop"]
if finalDict.has_key(start):
finalDict[start].append(stop)
else:
except:
spliceDict[chrom] = []
- for (start, stop) in spliceDict[chrom]:
+ for read in spliceDict[chrom]:
+ try:
+ start = read["startL"]
+ stop = read["stopL"]
+ except KeyError:
+ start = read["startR"]
+ stop = read["stopR"]
+
if finalDict.has_key(start):
finalDict[start].append(stop)
else:
from cistematic.core import complement
from cistematic.core.motif import Motif
from cistematic.genomes import Genome
-from commoncode import readDataset, getMergedRegions, findPeak
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption, getConfigFloatOption
+import ReadDataset
from pylab import *
import matplotlib
-print '%s: version 3.4' % sys.argv[0]
+print 'getallNRSE: version 3.5'
def main(argv=None):
if not argv:
usage = "usage: python %s genome regionfile siteOutfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--dataset", dest="chipfilename")
- parser.add_option("--min", type="float", dest="minHeight")
- parser.add_option("--minfraction", type="float", dest="minFraction")
- parser.add_option("--plot", dest="plotname")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--raw", action="store_false", dest="normalize")
- parser.add_option("--verbose", action="store_true", dest="doVerbose")
- parser.add_option("--markov1", action="store_true", dest="doMarkov1")
- parser.add_option("--peakdist", type="int", dest="maxpeakdist")
- parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
- parser.add_option("--motifdir", dest="motifDir")
- parser.set_defaults(chipfilename="", minHeight=-2., minFraction=-2., plotname="",
- doCache=False, normalize=True, doVerbose=False, doMarkov1=False,
- maxpeakdist=None, fullOnly=False, motifDir="./")
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.motifDir)
+def getParser(usage):
+    """Build the getallNRSE command-line parser.
+
+    Defaults for every option are looked up in the "getallNRSE"
+    configuration section through the commoncode getConfig*Option helpers;
+    the literal passed as each helper's last argument is the same value
+    getallNRSE() uses as its keyword default.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--dataset", dest="chipfilename")
+    parser.add_option("--min", type="float", dest="minHeight")
+    parser.add_option("--minfraction", type="float", dest="minFraction")
+    parser.add_option("--plot", dest="plotname")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--markov1", action="store_true", dest="doMarkov1")
+    parser.add_option("--peakdist", type="int", dest="maxpeakdist")
+    parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
+    parser.add_option("--motifdir", dest="motifDir")
+
+    configParser = getConfigParser()
+    section = "getallNRSE"
+    chipfilename = getConfigOption(configParser, section, "chipfilename", "")
+    minHeight = getConfigFloatOption(configParser, section, "minHeight", -2.)
+    minFraction = getConfigFloatOption(configParser, section, "minFraction", -2.)
+    plotname = getConfigOption(configParser, section, "plotname", "")
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    doMarkov1 = getConfigBoolOption(configParser, section, "doMarkov1", False)
+    maxpeakdist = getConfigOption(configParser, section, "maxpeakdist", None)
+    if maxpeakdist is not None:
+        # --peakdist is parsed as an int on the command line, but this
+        # default is read with the untyped getConfigOption (presumably a
+        # raw string -- confirm in commoncode).  Coerce it so both sources
+        # yield the same type; an unparsable value falls back to None.
+        try:
+            maxpeakdist = int(maxpeakdist)
+        except (TypeError, ValueError):
+            maxpeakdist = None
+
+    fullOnly = getConfigBoolOption(configParser, section, "fullOnly", False)
+    motifDir = getConfigOption(configParser, section, "motifDir", "./")
+
+    parser.set_defaults(chipfilename=chipfilename, minHeight=minHeight, minFraction=minFraction, plotname=plotname,
+                        doCache=doCache, normalize=normalize, doVerbose=doVerbose, doMarkov1=doMarkov1,
+                        maxpeakdist=maxpeakdist, fullOnly=fullOnly, motifDir=motifDir)
+
+    return parser
+
+
def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
minFraction=-2., plotname="", doCache=False, normalize=True,
doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
doDataset = False
normalizeBy = 1
if chipfilename:
- hitRDS = readDataset(chipfilename, verbose=doVerbose, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(chipfilename, verbose=doVerbose, cache=doCache)
doDataset = True
if normalize:
normalizeBy = len(hitRDS) / 1000000.
if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
continue
- for (start, stop, length) in regions[rchrom]:
- regionList.append((rchrom, start, length))
+ for region in regions[rchrom]:
+ regionList.append((rchrom, region.start, region.length))
notFoundIndex = 0
currentChrom = ""
hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
currentChrom = rchrom
- (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True)
+ peak = findPeak(hitDict[rchrom], start, length, doWeight=True)
+ topPos = peak.topPos
+ numHits = peak.numHits
if len(topPos) == 0:
print "topPos error"
peakpos = topPos[0]
- peakscore = smoothArray[peakpos]
+ peakscore = peak.smoothArray[peakpos]
if peakscore == 0.:
peakscore = -1.
except:
print 'psyco not running'
-import sys, optparse
-from cistematic.core import genesIntersecting, featuresIntersecting, cacheGeneDB, uncacheGeneDB
-from cistematic.core.geneinfo import geneinfoDB
-from cistematic.genomes import Genome
+import sys
+import optparse
+from cistematic.core import genesIntersecting, featuresIntersecting
+from commoncode import getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption, getGeneInfoDict, getGeneAnnotDict, getExtendedGeneAnnotDict
-print "%prog: version 5.5"
+print "getallgenes: version 5.6"
def main(argv=None):
usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--radius", type="int", dest="maxRadius")
- parser.add_option("--nomatch", dest="nomatchfilename")
- parser.add_option("--trackfar", action="store_true", dest="trackFar")
- parser.add_option("--stranded", action="store_true", dest="trackStrand")
- parser.add_option("--cache", action="store_true", dest="cachePages")
- parser.add_option("--compact", action="store_true", dest="compact")
- parser.add_option("--step", type="int", dest="step")
- parser.add_option("--startField", type="int", dest="colID")
- parser.add_option("--models", dest="extendGenome")
- parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
- parser.set_defaults(maxRadius=20002, nomatchfilename="", step=None, trackFar=False,
- trackStrand=False, compact=False, colID=1, doCache=False,
- extendGenome="", replaceModels=False)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.doCache, options.extendgenome, options.replaceModels)
+def getParser(usage):
+    """Build the getallgenes command-line parser.
+
+    Defaults for every option are looked up in the "getallgenes"
+    configuration section through the commoncode getConfig*Option helpers;
+    the literal passed as each helper's last argument is the same value
+    getallgenes() uses as its keyword default.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--radius", type="int", dest="maxRadius")
+    parser.add_option("--nomatch", dest="nomatchfilename")
+    parser.add_option("--trackfar", action="store_true", dest="trackFar")
+    parser.add_option("--stranded", action="store_true", dest="trackStrand")
+    # --cache has to store into "doCache": that is the attribute that
+    # receives a default below and the one main() passes on
+    # (options.doCache).  Storing into "cachePages" left the flag without
+    # any effect.
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--compact", action="store_true", dest="compact")
+    parser.add_option("--step", type="int", dest="step")
+    parser.add_option("--startField", type="int", dest="colID")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+
+    configParser = getConfigParser()
+    section = "getallgenes"
+    maxRadius = getConfigIntOption(configParser, section, "maxRadius", 20002)
+    nomatchfilename = getConfigOption(configParser, section, "nomatchfilename", "")
+    step = getConfigOption(configParser, section, "step", None)
+    if step is not None:
+        # getallgenes() uses step as the stride of range(1, maxRadius, step),
+        # so it must be an int.  This default is read with the untyped
+        # getConfigOption (presumably a raw string -- confirm in
+        # commoncode); coerce it, falling back to None when unparsable so
+        # getallgenes() derives the step from maxRadius.
+        try:
+            step = int(step)
+        except (TypeError, ValueError):
+            step = None
+
+    trackFar = getConfigBoolOption(configParser, section, "trackFar", False)
+    trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
+    compact = getConfigBoolOption(configParser, section, "compact", False)
+    colID = getConfigIntOption(configParser, section, "colID", 1)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+
+    parser.set_defaults(maxRadius=maxRadius, nomatchfilename=nomatchfilename, step=step, trackFar=trackFar,
+                        trackStrand=trackStrand, compact=compact, colID=colID, doCache=doCache,
+                        extendGenome=extendGenome, replaceModels=replaceModels)
+
+    return parser
+
+
def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
doCache=False, extendGenome="", replaceModels=False):
- if doCache:
- idb = geneinfoDB(cache=True)
- else:
- idb = geneinfoDB()
-
if not step:
step = maxRadius - 2
infile = open(infilename)
outfile = open(outfilename,"w")
- if genome == "dmelanogaster":
- geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
- else:
- geneinfoDict = idb.getallGeneInfo(genome)
+ geneinfoDict = getGeneInfoDict(genome, cache=doCache)
posList = []
altPosDict = {}
if maxRadius < step:
step = maxRadius - 2
- hg = Genome(genome, inRAM=True)
if extendGenome != "":
- hg.extendFeatures(extendGenome, replace = replaceModels)
-
- geneannotDict = hg.allAnnotInfo()
+ geneannotDict = getExtendedGeneAnnotDict(genome, extendGenome, replace=replaceModels, inRAM=True)
+ else:
+ geneannotDict = getGeneAnnotDict(genome, inRAM=True)
for radius in range(1, maxRadius, step):
print "radius %d" % radius
from cistematic.core.motif import Motif, hasMotifExtension
from cistematic.core import complement
from cistematic.genomes import Genome
-from commoncode import readDataset, getMergedRegions, findPeak
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
-print "%prog: version 2.4"
+print "getallsites: version 2.5"
def main(argv=None):
usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--dataset", dest="chipfilename")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--best", action="store_true", dest="bestOnly",
- help="only report the best position for each region")
- parser.add_option("--usepeak", action="store_true", dest="usePeak",
- help="use peak position and height from regions file")
- parser.add_option("--printseq", action="store_true", dest="printSeq")
- parser.add_option("--nomerge", action="store_true", dest="noMerge")
- parser.add_option("--markov1", action="store_true", dest="doMarkov1")
- parser.add_option("--rank", type="int", dest="useRank",
- help="return region ranking based on peak height ranking [requires --usepeak]")
- parser.set_defaults(chipfilename="", doCache=False, bestOnly=False, usePeak=False,
- printSeq=False, doMarkov1=False, useRank=False, noMerge=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 5:
options.useRank, options.noMerge)
+def getParser(usage):
+    """Return the optparse parser for the getallsites command line.
+
+    Option defaults are obtained from the "getallsites" configuration
+    section through the commoncode getConfig*Option helpers: an empty
+    string is passed for chipfilename and False for every other setting.
+    """
+    cliParser = optparse.OptionParser(usage=usage)
+    cliParser.add_option("--dataset", dest="chipfilename")
+    cliParser.add_option("--cache", action="store_true", dest="doCache")
+    cliParser.add_option("--best", action="store_true", dest="bestOnly",
+                         help="only report the best position for each region")
+    cliParser.add_option("--usepeak", action="store_true", dest="usePeak",
+                         help="use peak position and height from regions file")
+    for flag, destination in (("--printseq", "printSeq"),
+                              ("--nomerge", "noMerge"),
+                              ("--markov1", "doMarkov1")):
+        cliParser.add_option(flag, action="store_true", dest=destination)
+
+    cliParser.add_option("--rank", type="int", dest="useRank",
+                         help="return region ranking based on peak height ranking [requires --usepeak]")
+
+    config = getConfigParser()
+    sectionName = "getallsites"
+    defaults = {"chipfilename": getConfigOption(config, sectionName, "chipfilename", "")}
+    # NOTE(review): useRank is read as a boolean here although --rank parses
+    # an int on the command line -- confirm which type callers expect.
+    for name in ("doCache", "bestOnly", "usePeak", "printSeq", "doMarkov1", "useRank", "noMerge"):
+        defaults[name] = getConfigBoolOption(config, sectionName, name, False)
+
+    cliParser.set_defaults(**defaults)
+
+    return cliParser
+
+
def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
useRank=False, noMerge=False):
doRDS = True
if doRDS:
- hitRDS = readDataset(chipfilename, verbose = True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(chipfilename, verbose = True, cache=doCache)
outfile = open(outfilename, "w")
continue
if usePeak:
- for (start, stop, length, peakPos, peakHeight) in regions[chrom]:
- regionList.append((peakHeight, chrom, start, length, peakPos))
+ for region in regions[chrom]:
+ regionList.append((region.peakHeight, chrom, region.start, region.length, region.peakPos))
else:
- for (start, stop, length) in regions[chrom]:
- regionList.append((chrom, start, length))
+ for region in regions[chrom]:
+ regionList.append((chrom, region.start, region.length))
if usePeak:
regionList.sort()
except:
pass
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, findPeak
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
from cistematic.genomes import Genome
-print "%s: version 3.4" % sys.argv[0]
+print "getfasta: version 3.5"
def main(argv=None):
usage = "usage: python %s genome regionfile outfilename [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--seqradius", type="int", dest="seqsize")
- parser.add_option("--minreads", type="int", dest="minHitThresh")
- parser.add_option("--returnTop", type="int", dest="topRegions")
- parser.add_option("--maxsize", type="int", dest="maxsize")
- parser.add_option("--usepeak", action="store_true", dest="usePeaks")
- parser.add_option("--dataset", dest="hitfile")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--compact", action="store_true", dest="doCompact")
- parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
- usePeaks=False, hitfile=None, doCache=False, doCompact=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.doCache, options.doCompact)
+def getParser(usage):
+    """Return the optparse parser for the getfasta command line.
+
+    Option defaults are obtained from the "getfasta" configuration section
+    through the commoncode getConfig*Option helpers; the value passed as
+    each helper's last argument equals the matching keyword default of
+    getfasta().
+    """
+    cliParser = optparse.OptionParser(usage=usage)
+    for flag, destination in (("--seqradius", "seqsize"),
+                              ("--minreads", "minHitThresh"),
+                              ("--returnTop", "topRegions"),
+                              ("--maxsize", "maxsize")):
+        cliParser.add_option(flag, type="int", dest=destination)
+
+    cliParser.add_option("--usepeak", action="store_true", dest="usePeaks")
+    cliParser.add_option("--dataset", dest="hitfile")
+    cliParser.add_option("--cache", action="store_true", dest="doCache")
+    cliParser.add_option("--compact", action="store_true", dest="doCompact")
+
+    config = getConfigParser()
+    sectionName = "getfasta"
+    defaults = {}
+    for name, fallback in (("seqsize", 50), ("minHitThresh", -1), ("topRegions", 0), ("maxsize", 300000000)):
+        defaults[name] = getConfigIntOption(config, sectionName, name, fallback)
+
+    defaults["usePeaks"] = getConfigBoolOption(config, sectionName, "usePeaks", False)
+    # The configuration key is spelled "hitFile" while the option dest is "hitfile".
+    defaults["hitfile"] = getConfigOption(config, sectionName, "hitFile", None)
+    for name in ("doCache", "doCompact"):
+        defaults[name] = getConfigBoolOption(config, sectionName, name, False)
+
+    cliParser.set_defaults(**defaults)
+
+    return cliParser
+
def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
doDataset = False
if usePeaks:
ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
elif doDataset:
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
else:
ncregions = getDefaultRegion(mergedRegions, maxsize)
def getDefaultRegion(regionDict, maxsize):
ncregions = {}
- for chrom in regionDict:
- ncregions[chrom] = []
+ for chromosome in regionDict:
+ ncregions[chromosome] = []
- for achrom in regionDict:
- print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
- for region in regionDict[achrom]:
- (rstart, rstop, rlen) = region
+ for chromosome in regionDict:
+ print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+ for region in regionDict[chromosome]:
+ start = region.start
+ length = region.length
- if rlen > maxsize:
- print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ if length > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
continue
- resultDict = {"start": rstart,
- "length": rlen,
+ resultDict = {"start": start,
+ "length": length,
"topPos": [-1]
}
- ncregions[achrom].append(resultDict)
+ ncregions[chromosome].append(resultDict)
return ncregions
def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
+ # Build {"start", "length", "topPos"} records per chromosome from regions
+ # that carry peak information (.peakPos and .peakHeight attributes).
+ # Regions longer than maxsize are reported and skipped; a region is kept
+ # only when its peakHeight is strictly greater than minHitThresh.
+ # Returns the dict mapping each chromosome key to its list of records.
ncregions = {}
- for chrom in regionDict:
- ncregions[chrom] = []
+ # Pre-create an empty result list for every chromosome key.
+ for chromosome in regionDict:
+ ncregions[chromosome] = []
- for achrom in regionDict:
- print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
- for region in regionDict[achrom]:
- (rstart, rstop, rlen, peakPos, peakHeight) = region
+ for chromosome in regionDict:
+ print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+ for region in regionDict[chromosome]:
+ start = region.start
+ length = region.length
- if rlen > maxsize:
- print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ if length > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
continue
- topPos = peakPos - rstart
- if peakHeight > minHitThresh:
- resultDict = {"start": rstart,
- "length": rlen,
+ # topPos is the peak position expressed as an offset from the region start.
+ topPos = region.peakPos - start
+ if region.peakHeight > minHitThresh:
+ resultDict = {"start": start,
+ "length": length,
"topPos": [topPos]
}
- ncregions[achrom].append(resultDict)
+ ncregions[chromosome].append(resultDict)
return ncregions
readlen = hitRDS.getReadSize()
ncregions = {}
- for chrom in regionDict:
- ncregions[chrom] = []
-
- for achrom in regionDict:
- print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
- for region in regionDict[achrom]:
- (rstart, rstop, rlen) = region
-
- if rlen > maxsize:
- print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ for chromosome in regionDict:
+ ncregions[chromosome] = []
+
+ for chromosome in regionDict:
+ print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+ for region in regionDict[chromosome]:
+ start = region.start
+ stop = region.stop
+ length = region.length
+
+ if length > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize)
continue
- thechrom = "chr%s" % achrom
+ thechrom = "chr%s" % chromosome
print "."
- hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
+ hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop)
print "hitDict length: %d", len(hitDict[thechrom])
- (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
- if numHits > minHitThresh:
- resultDict = {"start": rstart,
- "length": rlen,
- "topPos": topPos
+ peak = findPeak(hitDict[thechrom], start, length, readlen)
+ if peak.numHits > minHitThresh:
+ resultDict = {"start": start,
+ "length": length,
+ "topPos": peak.topPos
}
- ncregions[achrom].append(resultDict)
+ ncregions[chromosome].append(resultDict)
return ncregions
import optparse
import matplotlib
from pylab import *
+from commoncode import getConfigParser, getConfigIntOption
-print "%prog: version 2.1"
+
+print "getgosig: version 2.2"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome outimage gofileroot1 title1 cohortsize1 [gofileroot2 title2 cohortsize2 ...] [--fontsize pts] [--length in] [--width in]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--fontsize", type="int", dest="fontSize")
- parser.add_option("--length", type="int", dest="length")
- parser.add_option("--width", type="int", dest="width")
- parser.set_defaults(fontSize=5, length=10, width=7)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 5:
getgosig(genome, imagename, fileroots, titles, options.fontSize, options.length, options.width)
+def makeParser(usage=""):
+    """Build the getgosig option parser.
+
+    Registers --fontsize, --length and --width as integer options, then
+    sets their defaults from the "getgosig" config section, passing 5, 10
+    and 7 as the fallback values to getConfigIntOption.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    for flag, dest in (("--fontsize", "fontSize"), ("--length", "length"), ("--width", "width")):
+        parser.add_option(flag, type="int", dest=dest)
+
+    configParser = getConfigParser()
+    section = "getgosig"
+    defaults = {
+        "fontSize": getConfigIntOption(configParser, section, "fontSize", 5),
+        "length": getConfigIntOption(configParser, section, "length", 10),
+        "width": getConfigIntOption(configParser, section, "width", 7),
+    }
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
def getgosig(genome, imagename, fileroots=[], titles=[], fontSize=5, length=10, width=7):
hg = Genome(genome)
allgodesc = hg.allGOterms()
if not argv:
argv = sys.argv
- print '%s: version 1.1' % argv[0]
+ print "getmers: version 1.2"
if len(sys.argv) < 5:
print 'usage: python %s genome merlen chrAny:start-stop outfile' % argv[0]
psyco.full()
except:
print "psyco not running"
-from cistematic.core import complement
+
from cistematic.genomes import Genome
+from commoncode import getConfigParser, getConfigIntOption
def main(argv=None):
if not argv:
argv = sys.argv
- verstring = "%prog: version 1.0"
+ verstring = "getsplicefa: version 1.1"
print verstring
delimiter = "|"
\n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
\n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--verbose", action="store_true", dest="doVerbose",
- help="show verbose messages [default: False]")
- parser.add_option("--spacer", type="int", dest="spacer",
- help="number of spacer NTs to use [default: 2")
+ parser = makeParser(usage)
parser.set_defaults(doVerbose=False, spacer=2)
(options, args) = parser.parse_args(argv[1:])
getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)
+def makeParser(usage=""):
+    """Build the getsplicefa option parser.
+
+    Registers --verbose (flag) and --spacer (int), then sets their defaults
+    from the "getsplicefa" config section, passing False and 2 as the
+    fallback values.
+    """
+    # Imported locally: the module-level commoncode import in this script
+    # only brings in getConfigParser and getConfigIntOption.
+    from commoncode import getConfigBoolOption
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--verbose", action="store_true", dest="doVerbose",
+                      help="show verbose messages [default: False]")
+    parser.add_option("--spacer", type="int", dest="spacer",
+                      help="number of spacer NTs to use [default: 2]")
+
+    configParser = getConfigParser()
+    section = "getsplicefa"
+    # doVerbose backs a store_true flag, so read it with the boolean reader
+    # rather than the integer one.
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    spacer = getConfigIntOption(configParser, section, "spacer", 2)
+
+    parser.set_defaults(doVerbose=doVerbose, spacer=spacer)
+
+    return parser
+
+
def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
spacerseq = "N" * spacer
exonStopDict[name] = exonStops
exonLengths = []
for index in range(spliceCount + 1):
- exonLengths.append(exonStops[index] - exonStarts[index])
+ exonLengths.append(exonStops[index] - exonStarts[index] + 1)
exonLengthDict[name] = exonLengths
import sys
-print "%s: version 1.0" % sys.argv[0]
+print "gointersects: version 1.1"
def main(argv=None):
if not argv:
# ENRAGE
#
-import sys, optparse
+import sys
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
-print 'version 2.0'
+print "intersects: version 2.1"
def main(argv=None):
if not argv:
usage = "usage: python %prog infile1 infile2 outfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("-d", dest="delimiter")
- parser.add_option("--file3", dest="infile3")
- parser.add_option("-1", type="int", dest="matchfield1")
- parser.add_option("-2", type="int", dest="matchfield2")
- parser.add_option("-3", type="int", dest="matchfield3")
- parser.add_option("-reject1", dest="reject1file")
- parser.add_option("-trackGID", action="store_true", dest="trackGID")
- parser.set_defaults(delimiter="\t", infile3=None, matchField1=0, matchField2=0,
- matchField3=0, rejectFileName="", trackGID=False)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.rejectFileName, options.trackGID)
+def getParser(usage):
+    """Build the intersects option parser.
+
+    Registers the command-line options, then sets their defaults from the
+    "intersects" config section, passing the script's built-in values as
+    fallbacks.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("-d", dest="delimiter")
+    parser.add_option("--file3", dest="infile3")
+    # NOTE(review): -1/-2/-3 store into matchfield1..3 (lower-case f) while
+    # the defaults below are registered as matchField1..3 - confirm which
+    # attribute names main() reads before renaming either side.
+    parser.add_option("-1", type="int", dest="matchfield1")
+    parser.add_option("-2", type="int", dest="matchfield2")
+    parser.add_option("-3", type="int", dest="matchfield3")
+    # optparse rejects multi-character option strings that do not start
+    # with "--" (it raises OptionError when the option is added), so these
+    # two must use the long form. The reject option stores into
+    # rejectFileName, the attribute main() passes on to intersects().
+    parser.add_option("--reject1", dest="rejectFileName")
+    parser.add_option("--trackGID", action="store_true", dest="trackGID")
+
+    configParser = getConfigParser()
+    section = "intersects"
+    delimiter = getConfigOption(configParser, section, "delimiter", "\t")
+    infile3 = getConfigOption(configParser, section, "infile3", None)
+    matchField1 = getConfigIntOption(configParser, section, "matchField1", 0)
+    matchField2 = getConfigIntOption(configParser, section, "matchField2", 0)
+    matchField3 = getConfigIntOption(configParser, section, "matchField3", 0)
+    # Empty string is the "no reject file" default used by intersects().
+    rejectFileName = getConfigOption(configParser, section, "rejectFileName", "")
+    trackGID = getConfigBoolOption(configParser, section, "trackGID", False)
+
+    parser.set_defaults(delimiter=delimiter, infile3=infile3, matchField1=matchField1, matchField2=matchField2,
+                        matchField3=matchField3, rejectFileName=rejectFileName, trackGID=trackGID)
+
+    return parser
+
+
def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None,
matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False):
from cistematic.genomes import Genome
from commoncode import getMergedRegions, getFeaturesByChromDict
-print "%s: version 1.1" % sys.argv[0]
+print "listGeneFeatures: version 1.2"
def main(argv=None):
-import sys, os
+import sys
+import os
def getEdges(nodeList, shorten=False):
if not argv:
argv = sys.argv
- print "%s: version 1.2" % argv[0]
+ print "makeSNPtrack: version 1.3"
if len(argv) < 4:
print "usage: python %s snpfile trackname trackoutfile" % argv[0]
except:
pass
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
PLUS_COLOR = "0,0,255"
MINUS_COLOR = "255,0,0"
if not argv:
argv = sys.argv
- verstring = "%prog: version 3.1"
+ verstring = "makebedfromrds: version 3.2"
print verstring
doPairs = False
usage = "usage: %prog trackLabel rdsFile bamFile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--nouniq", action="store_false", dest="withUniqs")
- parser.add_option("--nomulti", action="store_false", dest="withMulti")
- parser.add_option("--splices", action="store_true", dest="doSplices")
- parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
- parser.add_option("--flag", dest="withFlag")
- parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
- parser.add_option("--pairs", type="int", dest="pairDist")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
- parser.add_option("--chrom", action="append", dest="chromList")
- parser.add_option("--strand", dest="strand")
- parser.add_option("-r", "--region", dest="region", type="string",
- help="samtools region string")
- parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, doSpliceColor=False,
- pairDist=None, withFlag="", useFlagLike=False, enforceChr=False,
- senseStrand="", allChrom=True, doCache=False, cachePages=100000,
- chromList=[])
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
try:
options.allChrom, options.doCache, options.cachePages, options.chromList)
+def getParser(usage):
+    """Build the makebedfromrds option parser.
+
+    Registers the command-line options, then sets their defaults from the
+    "makebedfromrds" config section, passing the script's built-in values
+    as fallbacks.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", type="int", dest="pairDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    # NOTE(review): --strand stores into options.strand while the default
+    # below is registered as senseStrand - confirm which one main() reads.
+    parser.add_option("--strand", dest="strand")
+
+    configParser = getConfigParser()
+    section = "makebedfromrds"
+    withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
+    # Multireads are on by default and --nomulti turns them off; this
+    # matches the withMulti=True default of outputBedFromRds.
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doSpliceColor = getConfigBoolOption(configParser, section, "doSpliceColor", False)
+    pairDist = getConfigOption(configParser, section, "pairDist", None)
+    if pairDist is not None:
+        # --pairs is declared type="int"; presumably a configured value
+        # arrives as text, so coerce it to match the command-line type.
+        pairDist = int(pairDist)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
+    enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+    senseStrand = getConfigOption(configParser, section, "senseStrand", "")
+    allChrom = getConfigBoolOption(configParser, section, "allChrom", True)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    # cachePages is compared numerically against the dataset's default
+    # cache size, so read it with the integer reader.
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+
+    parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices, doSpliceColor=doSpliceColor,
+                        pairDist=pairDist, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
+                        senseStrand=senseStrand, allChrom=allChrom, doCache=doCache, cachePages=cachePages,
+                        chromList=[])
+
+    return parser
+
+
def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=True,
doSplices=False, doSpliceColor=False, doPairs=False, pairDist=1000000,
withFlag="", useFlagLike=False, enforceChr=False, senseStrand="",
sys.exit(1)
print "\nsample:"
- RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
#check that this is better than the dataset's default cache size
if cachePages > RDS.getDefaultCacheSize():
listLen = len(localList) - 1
localIndex = 0
while localIndex <= listLen:
+ read = localList[localIndex]
try:
- (leftpos, leftsense, leftweight, lPairID) = localList[localIndex]
+ # Single-block read: has "start" and "weight" keys.
+ leftpos = read["start"]
+ leftsense = read["sense"]
+ leftweight = read["weight"]
+ lPairID = read["pairID"]
leftstop = leftpos + readlength - 1
lpart = 1
startList = [leftpos]
stopList = [leftstop]
- except:
- (leftpos, LLstop, LRstart, leftstop, leftsense, lPairID) = localList[localIndex]
+ except KeyError:
+ # Two-block read keyed by startL/stopL/startR/stopR. The overall stop is
+ # the right block's stop (4th field of the old tuple), i.e. stopR.
+ leftpos = read["startL"]
+ LLstop = read["stopL"]
+ LRstart = read["startR"]
+ leftstop = read["stopR"]
+ leftsense = read["sense"]
+ lPairID = read["pairID"]
leftweight = 1.0
lpart = 2
startList = [leftpos, LRstart]
stopList = [LLstop, leftstop]
if localIndex < listLen:
+ read = localList[localIndex + 1]
try:
- (rightpos, rightsense, rightweight, rPairID) = localList[localIndex + 1]
+ rightpos = read["start"]
+ rightsense = read["sense"]
+ rightweight = read["weight"]
+ rPairID= read["pairID"]
rightstop = rightpos + readlength - 1
rpart = 1
rstartList = [rightpos]
rstopList = [rightstop]
- except:
- (rightpos, RLstop, RRstart, rightstop, rightsense, rPairID) = localList[localIndex + 1]
+ except KeyError:
+ rightpos = read["startL"]
+ RLstop = read["stopL"]
+ RRstart = read["startR"]
+ rightstop = read["stopR"]
+ rightsense = read["sense"]
+ rPairID = read["pairID"]
rightweight = 1.0
rpart = 2
rstartList = [rightpos, RRstart]
else:
hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, withWeight=True, withID=True, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, flagLike=useFlagLike)
try:
- for (pos, sense, weight, readID) in hitDict[achrom]:
+ for read in hitDict[achrom]:
+ pos = read["start"]
+ sense = read["sense"]
+ readID = read["readID"]
splitReadWrite(outfile, achrom, 1, [pos], [pos + readlength - 1], sense, readID, PLUS_COLOR, MINUS_COLOR)
index += 1
except:
spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
if achrom not in spliceDict:
continue
- for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+ for read in spliceDict[achrom]:
+ readstart = read["startL"]
+ Lstop = read["stopL"]
+ Rstart = read["startR"]
+ readstop = read["stopR"]
+ rsense = read["sense"]
+ readName = read["readID"]
splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
index += 1
spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
if achrom not in spliceDict:
continue
- for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+ for read in spliceDict[achrom]:
+ readstart = read["startL"]
+ Lstop = read["stopL"]
+ Rstart = read["startR"]
+ readstop = read["stopR"]
+ rsense = read["sense"]
+ readName = read["readID"]
splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
index += 1
except:
pass
-import sys, string, optparse
-from commoncode import readDataset, writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigIntOption, getConfigBoolOption
+import ReadDataset
-verstring = "%prog: version 2.1" % sys.argv[0]
+verstring = "makerdsfrombed: version 2.2"
print verstring
usage = "usage: python %prog label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--append", action="store_false", dest="init")
- parser.add_option("--index", action="store_true", dest="doIndex")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--RNA", action="store_true", dest="rnaDataType")
- parser.set_defaults(init=True, rnaDataType=False, doIndex=False, cachePages=100000)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
makerdsfrombed(label, filename, outdbname, options.init, dataType, options.doIndex, options.cachePages, propertyList)
+def makeParser(usage=""):
+    """Build the makerdsfrombed option parser.
+
+    Registers --append, --index, --cache and --RNA, then sets their
+    defaults from the "makerdsfrombed" config section. The rnaDataType
+    default is read from the config key "RNA".
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", dest="init", action="store_false")
+    parser.add_option("--index", dest="doIndex", action="store_true")
+    parser.add_option("--cache", dest="cachePages", type="int")
+    parser.add_option("--RNA", dest="rnaDataType", action="store_true")
+
+    configParser = getConfigParser()
+    section = "makerdsfrombed"
+    defaults = {
+        "init": getConfigBoolOption(configParser, section, "init", True),
+        "rnaDataType": getConfigBoolOption(configParser, section, "RNA", False),
+        "doIndex": getConfigBoolOption(configParser, section, "doIndex", False),
+        "cachePages": getConfigIntOption(configParser, section, "cachePages", 100000),
+    }
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
def makerdsfrombed(label, filename, outdbname, init=True, dataType="DNA", doIndex=False, cachePages=100000, propertyList=[]):
readsize = 0
padsize = 0
infile = open(filename,"r")
- rds = readDataset(outdbname, init, dataType, verbose=True)
+ rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
if not init:
rds.dropIndex()
except:
pass
-import sys, string, optparse
-from commoncode import readDataset, writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
-verstring = "%prog: version 3.9"
+verstring = "makerdsfromblat: version 3.10"
print verstring
def main(argv=None):
usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--append", action="store_false", dest="init")
- parser.add_option("--index", action="store_true", dest="doIndex")
- parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
- parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
- parser.add_option("--flag", action="store_true", dest="flagReads")
- parser.add_option("--strict", type="int", dest="minSpliceLength",
- help="min required bp on each side of a splice")
- parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
- parser.add_option("--verbose", action="store_true", dest="verbose")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--RNA", dest="geneDataFileName")
- parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="")
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.cachePages, options.geneDataFileName, propertyList)
+def getParser(usage):
+    """Build the makerdsfromblat option parser.
+
+    Registers the command-line options, then sets their defaults from the
+    "makerdsfromblat" config section, passing the script's built-in values
+    as fallbacks.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", dest="init", action="store_false")
+    parser.add_option("--index", dest="doIndex", action="store_true")
+    parser.add_option("--rawreadID", dest="trimReadID", action="store_false")
+    parser.add_option("--forceRNA", dest="forceRNA", action="store_true")
+    parser.add_option("--flag", dest="flagReads", action="store_true")
+    parser.add_option("--strict", dest="minSpliceLength", type="int",
+                      help="min required bp on each side of a splice")
+    parser.add_option("--spliceonly", dest="spliceOnly", action="store_true")
+    parser.add_option("--verbose", dest="verbose", action="store_true")
+    parser.add_option("--cache", dest="cachePages", type="int")
+    parser.add_option("--RNA", dest="geneDataFileName")
+
+    configParser = getConfigParser()
+    section = "makerdsfromblat"
+    defaults = {
+        "init": getConfigBoolOption(configParser, section, "init", True),
+        "doIndex": getConfigBoolOption(configParser, section, "doIndex", False),
+        "trimReadID": getConfigBoolOption(configParser, section, "trimReadID", True),
+        "minSpliceLength": getConfigIntOption(configParser, section, "minSpliceLength", 0),
+        "forceRNA": getConfigBoolOption(configParser, section, "forceRNA", False),
+        "flagReads": getConfigBoolOption(configParser, section, "flagReads", False),
+        "spliceOnly": getConfigBoolOption(configParser, section, "spliceOnly", False),
+        "verbose": getConfigBoolOption(configParser, section, "verbose", False),
+        "cachePages": getConfigIntOption(configParser, section, "cachePages", 100000),
+        "geneDataFileName": getConfigOption(configParser, section, "geneDataFileName", ""),
+    }
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
doIndex=False,trimReadID=True, minSpliceLength=0,
forceRNA=False, theFlag="", spliceOnly=False,
genedatafile.close()
- rds = readDataset(outdbname, init, dataType, verbose=True)
+ rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
#check that our cacheSize is better than the dataset's default cache size
defaultCacheSize = rds.getDefaultCacheSize()
except:
pass
-import sys, string, optparse
-from commoncode import readDataset, writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+import ReadDataset
-verstring = "%prog: version 4.1"
+verstring = "makerdsfrombowtie: version 4.2"
print verstring
def main(argv=None):
usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--RNA", dest="genedatafilename")
- parser.add_option("--append", action="store_false", dest="init")
- parser.add_option("--index", action="store_true", dest="doIndex")
- parser.add_option("--spacer", type="int", dest="spacer")
- parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
- parser.add_option("--forcepair", type="int", dest="forceID")
- parser.add_option("--flip", action="store_true", dest="flip")
- parser.add_option("--verbose", action="store_true", dest="verbose")
- parser.add_option("--strip", action="store_true", dest="stripSpace")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2,
- trimReadID=True, forceID=None, flip=False, verbose=False,
- stripSpace=False, cachePages=100000)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
propertyList)
+def getParser(usage):
+    """Build the makerdsfrombowtie option parser.
+
+    Registers the command-line options, then sets their defaults from the
+    "makerdsfrombowtie" config section, passing the script's built-in
+    values as fallbacks.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--RNA", dest="genedatafilename")
+    parser.add_option("--append", action="store_false", dest="init")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--spacer", type="int", dest="spacer")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
+    parser.add_option("--forcepair", type="int", dest="forceID")
+    parser.add_option("--flip", action="store_true", dest="flip")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--strip", action="store_true", dest="stripSpace")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    # Section is named after the script, with no embedded space.
+    section = "makerdsfrombowtie"
+    genedatafilename = getConfigOption(configParser, section, "genedatafilename", None)
+    init = getConfigBoolOption(configParser, section, "init", True)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    spacer = getConfigIntOption(configParser, section, "spacer", 2)
+    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
+    forceID = getConfigOption(configParser, section, "forceID", None)
+    if forceID is not None:
+        # --forcepair is declared type="int"; presumably a configured value
+        # arrives as text, so coerce it to match the command-line type.
+        forceID = int(forceID)
+    flip = getConfigBoolOption(configParser, section, "flip", False)
+    verbose = getConfigBoolOption(configParser, section, "verbose", False)
+    stripSpace = getConfigBoolOption(configParser, section, "stripSpace", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+
+    parser.set_defaults(genedatafilename=genedatafilename, init=init, doIndex=doIndex, spacer=spacer,
+                        trimReadID=trimReadID, forceID=forceID, flip=flip, verbose=verbose,
+                        stripSpace=stripSpace, cachePages=cachePages)
+
+    return parser
+
+
def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
doIndex=False, spacer=2, trimReadID=True, forceID=None,
flip=False, verbose=False, stripSpace=False, cachePages=100000,
genedatafile.close()
- rds = readDataset(outdbname, init, dataType, verbose=True)
+ rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
#check that our cacheSize is better than the dataset's default cache size
defaultCacheSize = rds.getDefaultCacheSize()
except:
pass
-import sys, string, optparse
-from commoncode import readDataset
+import sys
+import string
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
def main(argv=None):
if not argv:
argv = sys.argv
- verstring = "%prog: version 3.4"
+ verstring = "makerdsfromeland2: version 3.5"
print verstring
usage = "usage: %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
\ninput reads must be sorted to properly record multireads"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--append", action="store_false", dest="init",
- help="append to existing rds file [default: create new]")
- parser.add_option("--RNA", dest="geneDataFileName",
- help="set data type to RNA [default: DNA]")
- parser.add_option("--index", action="store_true", dest="doIndex",
- help="index the output rds file")
- parser.add_option("--cache", type="int", dest="cachePages",
- help="number of cache pages to use [default: 100000")
- parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
- help="use : as the delimiter")
- parser.add_option("--paired", dest="pairID",
- help="pairID value")
- parser.add_option("--extended", action="store_true", dest="extended",
- help="use eland_extended input")
- parser.add_option("--verbose", action="store_true", dest="verbose")
- parser.add_option("--maxlines", type="int", dest="maxLines",
- help="[default: 1000000000")
- parser.set_defaults(init=True, doIndex=False, cachePages=100000, geneDataFileName=None, useOldDelimiter=False, pairID=None, maxLines=1000000000, extended=False, verbose=False)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
if options.geneDataFileName is not None:
dataType = 'RNA'
- makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init, options.pairID, dataType, options.geneDataFileName, options.cachePages, options.maxLines, options.extended, options.verbose)
+ makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init,
+ options.pairID, dataType, options.geneDataFileName, options.cachePages,
+ options.maxLines, options.extended, options.verbose)
+
+
+def getParser(usage):
+    """Build the makerdsfromeland2 option parser.
+
+    Registers the command-line options, then sets their defaults from the
+    "makerdsfromeland2" config section, passing the script's built-in
+    values as fallbacks.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", dest="geneDataFileName",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    # Help strings below now close their "[default: ...]" bracket.
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000]")
+    parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
+                      help="use : as the delimiter")
+    parser.add_option("--paired", dest="pairID",
+                      help="pairID value")
+    parser.add_option("--extended", action="store_true", dest="extended",
+                      help="use eland_extended input")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--maxlines", type="int", dest="maxLines",
+                      help="[default: 1000000000]")
+
+    configParser = getConfigParser()
+    section = "makerdsfromeland2"
+    init = getConfigBoolOption(configParser, section, "init", True)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", None)
+    useOldDelimiter = getConfigBoolOption(configParser, section, "useOldDelimiter", False)
+    pairID = getConfigOption(configParser, section, "pairID", None)
+    maxLines = getConfigIntOption(configParser, section, "maxLines", 1000000000)
+    extended = getConfigBoolOption(configParser, section, "extended", False)
+    verbose = getConfigBoolOption(configParser, section, "verbose", False)
+
+    parser.set_defaults(init=init, doIndex=doIndex, cachePages=cachePages,
+                        geneDataFileName=geneDataFileName, useOldDelimiter=useOldDelimiter,
+                        pairID=pairID, maxLines=maxLines, extended=extended, verbose=verbose)
+
+    return parser
+
+def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False,
+ init=True, pairID="1", dataType="DNA", geneDataFileName=None,
+ cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
-def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False, init=True, pairID="1", dataType="DNA", geneDataFileName=None, cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
maxBorder = 0
index = 0
insertSize = 100000
mapDict[uname] = []
genedatafile.close()
- rds = readDataset(outdbname, init, dataType, verbose=True)
+ rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
if cachePages > rds.getDefaultCacheSize():
if init:
# ENRAGE
#
-import sys, string, optparse
+import sys
+import string
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
-print "%prog: version 2.1"
+print "makesitetrack: version 2.2"
def main(argv=None):
usage = "usage: python %prog sitefile outbedfile [--noheader] [--stype fieldID] [--color xx,yy,zz] [--append] [--exploded]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--noheader", action="store_true", dest="noHeader")
- parser.add_option("--stype", type="int", dest="stypeID")
- parser.add_option("--color", dest="color")
- parser.add_option("--append", action="store_true", dest="append")
- parser.add_option("--exploded", action="store_false", dest="compact")
- parser.set_defaults(stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
makesitetrack(infile, outfileName, options.stypeID, options.color, options.append, options.compact, options.noHeader)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--noheader", action="store_true", dest="noHeader")
+ parser.add_option("--stype", type="int", dest="stypeID")
+ parser.add_option("--color", dest="color")
+ parser.add_option("--append", action="store_true", dest="append")
+ parser.add_option("--exploded", action="store_false", dest="compact")
+
+ configParser = getConfigParser()
+ section = "makesitetrack"
+ stypeID = getConfigOption(configParser, section, "stypeID", None)
+ color = getConfigOption(configParser, section, "color", "0,0,0")
+ append = getConfigBoolOption(configParser, section, "append", False)
+ compact = getConfigBoolOption(configParser, section, "compact", True)
+ noHeader = getConfigBoolOption(configParser, section, "noHeader", False)
+
+ parser.set_defaults(stypeID=stypeID, color=color, append=append, compact=compact, noHeader=noHeader)
+
+ return parser
+
+
def makesitetrack(infileName, outFileName, stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False):
if stypeID is not None:
doStype = True
# makewiggle.py
# ENRAGE
#
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, getConfigFloatOption
-print "%prog: version 6.7"
+print "makewiggle: version 6.8"
try:
import psyco
usage = "usage: python %s name rdsfile outfilename [options]"
+ parser = getParser(usage)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ name = args[0]
+ hitfilename = args[1]
+ outfilename = args[2]
+
+ makewiggle(name, hitfilename, outfilename, options.doNormalize, options.color, options.altColor,
+ options.limitChrom, options.shift, options.doSplit, options.listfilename, options.listPrefix,
+ options.group, options.startPriority, options.skipRandom, options.withMulti,
+ options.withSplices, options.doSingle, options.cachePages, options.enforceChr, options.strand,
+ options.chunk)
+
+
+def getParser(usage):
parser = optparse.OptionParser(usage=usage)
parser.add_option("--raw", action="store_false", dest="doNormalize")
parser.add_option("--color", dest="color")
parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
parser.add_option("--stranded", dest="strand")
parser.add_option("--maxchunk", type="int", dest="chunk")
- parser.set_defaults(doNormalize=True, color=None, altColor="", limitChrom=None,
- shift=0, doSplit=False, listfilename=None, listPrefix="",
- group="", startPriority=0.01, skipRandom=False, withMulti=True,
- withSplices=False, doSingle=False, cachePages=-1, enforceChr=False,
- strand=None, chunk=20)
-
- (options, args) = parser.parse_args(argv[1:])
-
- if len(args) < 3:
- print usage
- sys.exit(1)
-
- name = args[0]
- hitfilename = args[1]
- outfilename = args[2]
- makewiggle(name, hitfilename, outfilename, options.doNormalize, options.color, options.altColor,
- options.limitChrom, options.shift, options.doSplit, options.listfilename, options.listPrefix,
- options.group, options.startPriority, options.skipRandom, options.withMulti,
- options.withSplices, options.doSingle, options.cachePages, options.enforceChr, options.strand,
- options.chunk)
+ configParser = getConfigParser()
+ section = "makewiggle"
+ doNormalize = getConfigBoolOption(configParser, section, "doNormalize", True)
+ color = getConfigOption(configParser, section, "color", None)
+ altColor = getConfigOption(configParser, section, "altColor", "")
+ limitChrom = getConfigOption(configParser, section, "limitChrom", None)
+ shift = getConfigIntOption(configParser, section, "shift", 0)
+ doSplit = getConfigBoolOption(configParser, section, "doSplit", False)
+ listfilename = getConfigOption(configParser, section, "listfilename", None)
+ listPrefix = getConfigOption(configParser, section, "listPrefix", "")
+ group = getConfigOption(configParser, section, "group", "")
+ startPriority = getConfigFloatOption(configParser, section, "startPriority", 0.01)
+ skipRandom = getConfigBoolOption(configParser, section, "skipRandom", False)
+ withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+ withSplices = getConfigBoolOption(configParser, section, "withSplices", False)
+ doSingle = getConfigBoolOption(configParser, section, "doSingle", False)
+ cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
+ enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+ strand = getConfigOption(configParser, section, "strand", None)
+ chunk = getConfigIntOption(configParser, section, "chunk", 20)
+
+ parser.set_defaults(doNormalize=doNormalize, color=color, altColor=altColor, limitChrom=limitChrom,
+ shift=shift, doSplit=doSplit, listfilename=listfilename, listPrefix=listPrefix,
+ group=group, startPriority=startPriority, skipRandom=skipRandom, withMulti=withMulti,
+ withSplices=withSplices, doSingle=doSingle, cachePages=cachePages, enforceChr=enforceChr,
+ strand=strand, chunk=chunk)
+
+ return parser
def makewiggle(name, hitfilename, outfilename, doNormalize=True, color=None, altColor="",
print "Will shift reads by +/- %d bp according to their sense" % shift
name += "shift=%d" % shift
- hitRDS = readDataset(hitfilename, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfilename, verbose=True, cache=doCache)
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
except:
pass
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
from cistematic.genomes import Genome
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
-print "%prog: version 5.6"
+print "normalizeExpandedExonic: version 5.7"
def main(argv=None):
usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--gidField", type="int", dest="fieldID")
- parser.add_option("--maxLength", type="float", dest="maxLength")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--models", dest="extendGenome")
- parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
- parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
- replaceModels=False)
-
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(sys.argv) < 6:
options.replaceModels)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--gidField", type="int", dest="fieldID")
+ parser.add_option("--maxLength", type="float", dest="maxLength")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+
+ configParser = getConfigParser()
+ section = "normalizeExpandedExonic"
+ fieldID = getConfigIntOption(configParser, section, "fieldID", 0)
+ maxLength = getConfigFloatOption(configParser, section, "maxLength", 1000000000.)
+ doCache = getConfigBoolOption(configParser, section, "doCache", False)
+ extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+ replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+
+ parser.set_defaults(fieldID=fieldID, maxLength=maxLength, doCache=doCache, extendGenome=extendGenome,
+ replaceModels=replaceModels)
+
+ return parser
+
+
def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
outfilename, candidateLines=[], acceptedfilename="",
fieldID=0, maxLength=1000000000., doCache=False,
if extendGenome != "":
hg.extendFeatures(extendGenome, replace=replaceModels)
- RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)
+ RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache, reportCount=False)
uniqcount = RDS.getUniqsCount()
print "%d unique reads" % uniqcount
except:
pass
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, getConfigFloatOption
-print "%prog: version 3.5" % sys.argv[0]
+print "normalizeFinalExonic: version 3.6"
def main(argv=None):
if not argv:
usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--multifraction", action="store_true", dest="reportfraction")
- parser.add_option("--multifold", action="store_true", dest="reportFold")
- parser.add_option("--minrpkm", type="float", dest="minThreshold")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--withGID", action="store_true", dest="writeGID")
- parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
- doCache=False, writeGID=False)
-
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
options.doCache, options.writeGID)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--multifraction", action="store_true", dest="reportfraction")
+ parser.add_option("--multifold", action="store_true", dest="reportFold")
+ parser.add_option("--minrpkm", type="float", dest="minThreshold")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--withGID", action="store_true", dest="writeGID")
+
+ configParser = getConfigParser()
+ section = "normalizeFinalExonic"
+ reportFraction = getConfigBoolOption(configParser, section, "multifraction", False)
+ reportFold = getConfigBoolOption(configParser, section, "reportFold", False)
+ minThreshold = getConfigFloatOption(configParser, section, "minThreshold", 0.)
+ doCache = getConfigBoolOption(configParser, section, "doCache", False)
+ writeGID = getConfigBoolOption(configParser, section, "writeGID", False)
+
+ parser.set_defaults(reportFraction=reportFraction, reportFold=reportFold, minThreshold=minThreshold,
+ doCache=doCache, writeGID=writeGID)
+
+ return parser
+
+
def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
writeGID=False):
elif reportFold:
print "reporting fold contribution of multireads"
- RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
+ RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
uniqcount = RDS.getUniqsCount()
splicecount = RDS.getSplicesCount()
multicount = RDS.getMultiCount()
# partition.py
# ENRAGE
#
-""" usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [--minFeature bp] [--padregion bp] [--mergeregion bp] [--nomerge] [--log altlogfile] [--locid] [--ignorerandom] [--chromField fieldNum]
+""" usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]
where the regionfiles must be comma-separated with no white space
-minFeature controls the size of the smallest partition
"""
except:
pass
-import sys, string, optparse
-from commoncode import getMergedRegions, writeLog
+import sys
+import string
+import optparse
+from commoncode import getMergedRegions, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-versionString = '%s: version 2.0' % sys.argv[0]
+versionString = "partition: version 2.1"
print versionString
usage = "usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--minFeature", type="int", dest="minFeature",
- help="size of smallest partition")
- parser.add_option("--chromField", type="int", dest="cField",
- help="num chromosome fields")
- parser.add_option("--padregion", type="int", dest="padregion",
- help="padding on each side of region")
- parser.add_option("--mergeregion", type="int", dest="mergeregion",
- help="bp threshold to merge regions")
- parser.add_option("--nomerge", action="store_false", dest="merging",
- help="do not merge regions")
- parser.add_option("--log", dest="logfilename",
- help="log file")
- parser.add_option("--locID", action="store_true", dest="locID",
- help="use location as region ID")
- parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
- help="ignore 'random' chromosomes")
- parser.set_defaults(minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log")
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
if options.ignoreRandom:
print "ignoring 'random' chromosomes"
- partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField, options.padregion, options.locID, options.ignoreRandom, options.mergeregion, options.merging, options.logfilename)
+ partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField,
+ options.padregion, options.locID, options.ignoreRandom, options.mergeregion,
+ options.merging, options.logfilename)
-def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log"):
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--minFeature", type="int", dest="minFeature",
+ help="size of smallest partition")
+ parser.add_option("--chromField", type="int", dest="cField",
+ help="num chromosome fields")
+ parser.add_option("--padregion", type="int", dest="padregion",
+ help="padding on each side of region")
+ parser.add_option("--mergeregion", type="int", dest="mergeregion",
+ help="bp threshold to merge regions")
+ parser.add_option("--nomerge", action="store_false", dest="merging",
+ help="do not merge regions")
+ parser.add_option("--log", dest="logfilename",
+ help="log file")
+ parser.add_option("--locID", action="store_true", dest="locID",
+ help="use location as region ID")
+ parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
+ help="ignore 'random' chromosomes")
+
+ configParser = getConfigParser()
+ section = "partition"
+ minFeature = getConfigIntOption(configParser, section, "minFeature", 25)
+ cField = getConfigIntOption(configParser, section, "cField", 1)
+ padregion = getConfigIntOption(configParser, section, "padregion", 1)
+ locID = getConfigBoolOption(configParser, section, "locID", False)
+ ignoreRandom = getConfigBoolOption(configParser, section, "ignoreRandom", False)
+ mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
+ merging = getConfigBoolOption(configParser, section, "merging", True)
+ logfilename = getConfigOption(configParser, section, "logfilename", "partition.log")
+
+ parser.set_defaults(minFeature=minFeature, cField=cField, padregion=padregion, locID=locID,
+ ignoreRandom=ignoreRandom, mergeregion=mergeregion, merging=merging,
+ logfilename=logfilename)
+
+ return parser
+
+
+def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0,
+ locID=False, ignoreRandom=False, mergeregion=0, merging=True,
+ logfilename="partition.log"):
writeLog(logfilename, versionString, string.join(sys.argv[1:]))
numRegions = len(regionFileList)
chromList = []
for regionID in range(numRegions):
- allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion, minHits=-1, fullChrom = True, verbose = True, chromField = cField, doMerge=merging, pad=padregion)
+ allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion,
+ minHits=-1, fullChrom=True, verbose=True, chromField=cField,
+ doMerge=merging, pad=padregion)
+
for achrom in allregionsDict[regionID]:
if achrom not in chromList:
chromList.append(achrom)
chromList = sorted(chromList)
for chrom in chromList:
- if ignoreRandom and 'random' in chrom:
+ if ignoreRandom and "random" in chrom:
continue
outregionDict[chrom] = []
pointList = []
for regionID in range(numRegions):
if chrom in allregionsDict[regionID]:
- for (rstart, rstop, rlength) in allregionsDict[regionID][chrom]:
- pointList.append(rstart)
- pointList.append(rstop)
+ for region in allregionsDict[regionID][chrom]:
+ pointList.append(region.start)
+ pointList.append(region.stop)
pointList.sort()
start = 0
outregionDict[chrom].append((start, point - 1, point - 1 - start))
start = point
- outfile = open(outfilename, 'w')
+ outfile = open(outfilename, "w")
if locID:
- outfile.write('#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n')
+ outfile.write("#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n")
else:
- outfile.write('#labelID\tchrom\tstart\tstop\tlength_kb\n')
+ outfile.write("#labelID\tchrom\tstart\tstop\tlength_kb\n")
index = 0
for chrom in outregionDict:
for (start, stop, length) in outregionDict[chrom]:
index += 1
if locID:
- outfile.write("%s:%d-%d\t%s\t%d\t%d\t%.3f\n" % (chrom, start, stop, chrom, start, stop, length/1000.))
+ label = "%s:%d-%d" % (chrom, start, stop)
else:
- outfile.write("%s%d\t%s\t%d\t%d\t%.3f\n" % (mergeID, index, chrom, start, stop, length/1000.))
+ label = "%s%d" % (mergeID, index)
+
+ outfile.write("%s\t%s\t%d\t%d\t%.3f\n" % (label, chrom, start, stop, length/1000.))
message = "%s was partitioned into %d regions" % (mergeID, index)
print message
import sys
-print "%s: version 1.0" % sys.argv[0]
+print "peakstoregion: version 1.1"
def main(argv=None):
if not argv:
peakfile = argv[1]
outfile = argv[2]
- radius = 500
- chromField = 2
- posField = 3
- labelField = 1
- dataField = -1
-
- if len(argv) > 3:
+ try:
radius = int(argv[3])
+ except (IndexError, ValueError):
+ radius = 500
- if len(argv) > 4:
+ try:
chromField = int(argv[4])
+ except (IndexError, ValueError):
+ chromField = 2
- if len(argv) > 5:
+ try:
posField = int(argv[5])
+ except (IndexError, ValueError):
+ posField = 3
- if len(argv) > 6:
+ try:
labelField = int(argv[6])
+ except (IndexError, ValueError):
+ labelField = 1
- if len(argv) > 7:
+ try:
dataField = int(argv[7])
+ except (IndexError, ValueError):
+ dataField = -1
peakstoregion(peakfile, outfile, radius, chromField, posField, labelField, dataField)
import matplotlib
from pylab import *
from math import *
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption
-print "%prog: version 3.2"
+print "plotbardist: version 3.3"
def main(argv=None):
usage = "usage: python %prog infile1 [infile2] [infile3] [options] outfile.png"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--bins", type="int", dest="bins")
- parser.add_option("--field", type="int", dest="binnedField")
- parser.add_option("--binSize", type="float", dest="binLength")
- parser.add_option("--doLog", type="int", dest="logBase")
- parser.add_option("--ymax", type="int", dest="maxY")
- parser.add_option("--xlabel", dest="xLabel")
- parser.add_option("--ylabel", dest="yLabel")
- parser.add_option("--binLabels", dest="binLabels", help="comma separated list")
- parser.add_option("--title", dest="figTitle")
- parser.add_option("--legend", dest="barsLegend", help="comma separated list")
- parser.add_option("--xoffset", type="float", dest="pointOffset")
- parser.add_option("--figsize", dest="figSizes", help="x,y pair")
- parser.set_defaults(bins=10, binnedField=-1, binLength=-1, logBase=None, maxY=0,
- xLabel="bins", yLabel="count", binLabels=None, figTitle="",
- barsLegend=None, pointOffset=0., figSizes=None)
-
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
options.figTitle, options.barsLegend, options.pointOffset, options.figSizes)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--bins", type="int", dest="bins")
+ parser.add_option("--field", type="int", dest="binnedField")
+ parser.add_option("--binSize", type="float", dest="binLength")
+ parser.add_option("--doLog", type="int", dest="logBase")
+ parser.add_option("--ymax", type="int", dest="maxY")
+ parser.add_option("--xlabel", dest="xLabel")
+ parser.add_option("--ylabel", dest="yLabel")
+ parser.add_option("--binLabels", dest="binLabels", help="comma separated list")
+ parser.add_option("--title", dest="figTitle")
+ parser.add_option("--legend", dest="barsLegend", help="comma separated list")
+ parser.add_option("--xoffset", type="float", dest="pointOffset")
+ parser.add_option("--figsize", dest="figSizes", help="x,y pair")
+
+ configParser = getConfigParser()
+ section = "plotbardist"
+ bins = getConfigIntOption(configParser, section, "bins", 10)
+ binnedField = getConfigIntOption(configParser, section, "binnedField", -1)
+ binLength = getConfigIntOption(configParser, section, "binLength", -1)
+ logBase = getConfigOption(configParser, section, "logBase", None)
+ maxY = getConfigIntOption(configParser, section, "maxY", 0)
+ xLabel = getConfigOption(configParser, section, "xLabel", "bins")
+ yLabel = getConfigOption(configParser, section, "yLabel", "count")
+ binLabels = getConfigOption(configParser, section, "binLabels", None)
+ figTitle = getConfigOption(configParser, section, "figTitle", "")
+ barsLegend = getConfigOption(configParser, section, "barsLegend", None)
+ pointOffset = getConfigFloatOption(configParser, section, "pointOffset", 0.)
+ figSizes = getConfigOption(configParser, section, "figSizes", None)
+
+ parser.set_defaults(bins=bins, binnedField=binnedField, binLength=binLength, logBase=logBase, maxY=maxY,
+ xLabel=xLabel, yLabel=yLabel, binLabels=binLabels, figTitle=figTitle,
+ barsLegend=barsLegend, pointOffset=pointOffset, figSizes=figSizes)
+
+ return parser
+
+
def plotbardist(fileList, pngfilename, bins=10, binnedField=-1, binLength=-1, logBase=None,
maxY=0, xLabel="bins", yLabel="count", binLabels=None, figTitle="",
barsLegend=None, pointOffset=0., figSizes=None):
except:
pass
-print "%s: version 1.1" % sys.argv[0]
+print "plotnomogram: version 1.2"
def main(argv=None):
from pylab import *
from math import *
import matplotlib
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-print "%prog: version 2.2"
+print "plotprofile: version 2.3"
def main(argv=None):
if not argv:
usage = "usage: python %s infile outfile.png [--scale] [--max weightMax] [--ymin bottom] [--ymax top] [--subtractEvens]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--scale", action="store_true", dest="doScale")
- parser.add_option("--max", type="float", dest="weightMax")
- parser.add_option("--ymin", type="float", dest="ymin")
- parser.add_option("--ymax", type="float", dest="ymax")
- parser.add_option("--subtractEvens", action="store_true", dest="subtractEvens")
- parser.set_defaults(doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
plotprofile(infile, pngfilename, options.doScale, options.weightMax, options.ymin, options.ymax, options.subtractEvens)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--scale", action="store_true", dest="doScale")
+ parser.add_option("--max", type="float", dest="weightMax")
+ parser.add_option("--ymin", type="float", dest="ymin")
+ parser.add_option("--ymax", type="float", dest="ymax")
+ parser.add_option("--subtractEvens", action="store_true", dest="subtractEvens")
+
+ configParser = getConfigParser()
+ section = "plotprofile"
+ doScale = getConfigBoolOption(configParser, section, "doScale", False)
+ weightMax = getConfigIntOption(configParser, section, "weightMax", -1)
+ ymin = getConfigOption(configParser, section, "ymin", None)
+ ymax = getConfigOption(configParser, section, "ymax", None)
+ subtractEvens = getConfigBoolOption(configParser, section, "subtractEvens", False)
+
+ parser.set_defaults(doScale=doScale, weightMax=weightMax, ymin=ymin, ymax=ymax, subtractEvens=subtractEvens)
+
+ return parser
+
+
def plotprofile(inFileName, pngfilename, doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False):
infile = open(inFileName)
limitYscale = False
if not argv:
argv = sys.argv
- print '%s: version 1.1' % argv[0]
+ print "predictSpliceCount: version 1.2"
if len(argv) < 6:
print 'usage: python %s genome maxBorder uniquecountfile splicecountfile outfile' % argv[0]
except:
pass
-import sys, optparse
-print "%prog: version 2.2"
+import sys
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigFloatOption
+
+print "profilebins: version 2.3"
def main(argv=None):
usage = "usage: python %prog label infile1 [--upstream infile2] [--downstream infile3] [--uplength kb] [--downlength kb] [--gene geneName] [--genes genefile] [--append] outfile"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--upstream", dest="upfilename")
- parser.add_option("--downstream", dest="downfilename")
- parser.add_option("--uplength", type="float", dest="uplength")
- parser.add_option("--downlength", type="int", dest="")
- parser.add_option("--gene", dest="gene")
- parser.add_option("--genes", dest="genefile")
- parser.add_option("--append", action="store_true", dest="doAppend")
- parser.set_defaults(upfilename=None, downfilename=None, uplength=0.0, downlength=0.0,
- gene=None, genefile=None, doAppend=False)
-
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.doAppend)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--upstream", dest="upfilename")
+ parser.add_option("--downstream", dest="downfilename")
+ parser.add_option("--uplength", type="float", dest="uplength")
+ parser.add_option("--downlength", type="int", dest="")
+ parser.add_option("--gene", dest="gene")
+ parser.add_option("--genes", dest="genefile")
+ parser.add_option("--append", action="store_true", dest="doAppend")
+
+ configParser = getConfigParser()
+ section = "profilebins"
+ upfilename = getConfigOption(configParser, section, "upfilename", None)
+ downfilename = getConfigOption(configParser, section, "downfilename", None)
+ uplength = getConfigFloatOption(configParser, section, "uplength", 0.0)
+ downlength = getConfigFloatOption(configParser, section, "downlength", 0.0)
+ gene = getConfigOption(configParser, section, "gene", None)
+ genefile = getConfigOption(configParser, section, "genefile", None)
+ doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
+
+ parser.set_defaults(upfilename=upfilename, downfilename=downfilename, uplength=uplength, downlength=downlength,
+ gene=gene, genefile=genefile, doAppend=doAppend)
+
+ return parser
+
+
def profilebins(label, infilename, outfilename, upfilename=None, downfilename=None,
uplength=0.0, downlength=0.0, gene=None, genefile=None, doAppend=False):
import string
import optparse
import math
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption
-print "%prog: version 2.3"
+print "ratio: version 2.4"
def main(argv=None):
usage = "usage: python %prog denominatorField infile [--only fieldID] [--out outfile]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--only", type="int", dest="onlyField")
- parser.add_option("--out", dest="outFileName")
- parser.set_defaults(outFileName=None, onlyField=-1)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
ratio(field, inFileName, options.outFileName, options.onlyField)
+
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--only", type="int", dest="onlyField")
+ parser.add_option("--out", dest="outFileName")
+
+ configParser = getConfigParser()
+ section = "ratio"
+ outFileName = getConfigOption(configParser, section, "outfile", None)
+ onlyField = getConfigIntOption(configParser, section, "fieldID", -1)
+
+ parser.set_defaults(outFileName=outFileName, onlyField=onlyField)
+
+ return parser
+
+
def ratio(field, inFileName, outFileName=None, onlyField=-1):
if inFileName is not None:
import sys
import optparse
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, getConfigIntOption
-print "%prog: version 2.7"
+print "rdsmetadata: version 2.8"
def main(argv=None):
usage = "usage: python %prog rdsfile [propertyName1::propertyValue1] ... [propertyNameN::propertyValueN] [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--defaultcache", type="int", dest="cacheVal")
- parser.add_option("--index", action="store_true", dest="buildIndex")
- parser.add_option("--dropindex", action="store_true", dest="dropIndex")
- parser.add_option("--nocount", action="store_false", dest="doCount")
- parser.add_option("--complexity", action="store_true", dest="doComplexity")
- parser.add_option("--reset", action="store_true", dest="resetFlags")
- parser.add_option("--initrna", action="store_true", dest="rnaDataType")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.set_defaults(cacheVal=0, buildIndex=False, dropIndex=False, doCount=True,
- doComplexity=False, resetFlags=False, rnaDataType=False,
- cachePages=-1)
-
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 1:
options.resetFlags, options.rnaDataType, options.cachePages)
+def makeParser(usage=""):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--defaultcache", type="int", dest="cacheVal")
+ parser.add_option("--index", action="store_true", dest="buildIndex")
+ parser.add_option("--dropindex", action="store_true", dest="dropIndex")
+ parser.add_option("--nocount", action="store_false", dest="doCount")
+ parser.add_option("--complexity", action="store_true", dest="doComplexity")
+ parser.add_option("--reset", action="store_true", dest="resetFlags")
+ parser.add_option("--initrna", action="store_true", dest="rnaDataType")
+ parser.add_option("--cache", type="int", dest="cachePages")
+
+ configParser = getConfigParser()
+ section = "rdsmetadata"
+ cacheVal = getConfigIntOption(configParser, section, "cacheVal", 0)
+ buildIndex = getConfigBoolOption(configParser, section, "buildIndex", False)
+ dropIndex = getConfigBoolOption(configParser, section, "dropIndex", False)
+ doCount = getConfigBoolOption(configParser, section, "doCount", True)
+ doComplexity = getConfigBoolOption(configParser, section, "doComplexity", False)
+ resetFlags = getConfigBoolOption(configParser, section, "resetFlags", False)
+ rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
+ cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
+
+ parser.set_defaults(cacheVal=cacheVal, buildIndex=buildIndex, dropIndex=dropIndex, doCount=doCount,
+ doComplexity=doComplexity, resetFlags=resetFlags, rnaDataType=rnaDataType,
+ cachePages=cachePages)
+
+ return parser
+
+
def rdsmetadata(datafile, propertyList=[], cacheVal=0, buildIndex=False,
dropIndex=False, doCount=True, doComplexity=False, resetFlags=False,
rnaDataType=False, cachePages=-1):
doCache = True
if rnaDataType:
- rds = readDataset(datafile, initialize=True, datasetType="RNA", verbose=True, cache=doCache)
+ rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType="RNA", verbose=True, cache=doCache)
else:
- rds = readDataset(datafile, verbose=True, reportCount=doCount, cache=doCache)
+ rds = ReadDataset.ReadDataset(datafile, verbose=True, reportCount=doCount, cache=doCache)
if cachePages > rds.getDefaultCacheSize():
rds.setDBcache(cachePages)
print 'psyco not running'
import sys
+import ReadDataset
+from commoncode import getMergedRegions, computeRegionBins
+
print '%s: version 2.0' % sys.argv[0]
if len(sys.argv) < 4:
print 'usage: python %s regionfile rdsfile outfilename [-bins numbins] [-field fieldNum] [-raw] [-padregion bp] [-mergeregion bp] [-cache]' % sys.argv[0]
sys.exit(1)
-from commoncode import *
-
regionfilename = sys.argv[1]
hitfile = sys.argv[2]
outfilename = sys.argv[3]
binfield = sys.argv.index('-bins') + 1
bins = int(sys.argv[binfield])
-hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
readlen = hitRDS.getReadSize()
normalizationFactor = 1.0
if normalize:
chromList = hitRDS.getChromosomes(fullChrom=False)
chromList.sort()
-regionDict = getMergedRegions(regionfilename, maxDist = mergeregion, keepLabel = True, verbose = True, chromField = cField, pad=padregion)
+regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, keepLabel=True, verbose=True, chromField=cField, pad=padregion)
hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
tagCount = 0.
for binAmount in regionsBins[regionID]:
tagCount += binAmount
+
outfile.write('%s\t%s\t%.1f\t%d' % (regionID, regionID, tagCount, Len[gid]))
for binAmount in gidBins[gid]:
if normalizeBins:
except:
print 'psyco not running'
-import sys, string, optparse
-from commoncode import readDataset, getMergedRegions, findPeak, writeLog
+import sys
+import string
+import optparse
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
-versionString = "%prog: version 3.9"
+versionString = "regionCounts: version 3.10"
print versionString
def main(argv=None):
usage = "usage: python %prog regionfile rdsfile outfilename [options]"
+ parser = getParser(usage)
+ (options, args) = parser.parse_args(argv[1:])
+
+ if len(args) < 3:
+ print usage
+ sys.exit(1)
+
+ regionfilename = args[0]
+ hitfile = args[1]
+ outfilename = args[2]
+
+ regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+ options.useFullchrom, options.normalize, options.padregion,
+ options.mergeregion, options.merging, options.doUniqs, options.doMulti,
+ options.doSplices, options.usePeak, options.cachePages, options.logfilename,
+ options.doRPKM, options.doLength, options.forceRegion)
+
+
+def getParser(usage):
parser = optparse.OptionParser(usage=usage)
parser.add_option("--markRDS", action="store_true", dest="flagRDS")
parser.add_option("--chromField", type="int", dest="cField")
parser.add_option("--rpkm", action="store_true", dest="doRPKM")
parser.add_option("--length", action="store_true", dest="doLength")
parser.add_option("--force", action="store_true", dest="forceRegion")
- parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True,
- padregion=0, mergeregion=0, merging=True, doUniqs=True,
- doMulti=True, doSplices=False, usePeak=False, cachePages=-1,
- logfilename="regionCounts.log", doRPKM=False, doLength=False,
- forceRegion=False)
-
- (options, args) = parser.parse_args(argv[1:])
- if len(args) < 3:
- print usage
- sys.exit(1)
-
- regionfilename = args[0]
- hitfile = args[1]
- outfilename = args[2]
-
- regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
- options.useFullchrom, options.normalize, options.padregion,
- options.mergeregion, options.merging, options.doUniqs, options.doMulti,
- options.doSplices, options.usePeak, options.cachePages, options.logfilename,
- options.doRPKM, options.doLength, options.forceRegion)
+ configParser = getConfigParser()
+ section = "regionCounts"
+ flagRDS = getConfigBoolOption(configParser, section, "flagRDS", False)
+ cField = getConfigIntOption(configParser, section, "cField", 1)
+ useFullchrom = getConfigBoolOption(configParser, section, "useFullchrom", False)
+ normalize = getConfigBoolOption(configParser, section, "normalize", True)
+ padregion = getConfigIntOption(configParser, section, "padregion", 0)
+ mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
+ merging = getConfigBoolOption(configParser, section, "merging", True)
+ doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+ doMulti = getConfigBoolOption(configParser, section, "doMulti", True)
+ doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+ usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
+ cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
+ logfilename = getConfigOption(configParser, section, "logfilename", "regionCounts.log")
+ doRPKM = getConfigBoolOption(configParser, section, "doRPKM", False)
+ doLength = getConfigBoolOption(configParser, section, "doLength", False)
+ forceRegion = getConfigBoolOption(configParser, section, "forceRegion", False)
+
+ parser.set_defaults(flagRDS=flagRDS, cField=cField, useFullchrom=useFullchrom, normalize=normalize,
+ padregion=padregion, mergeregion=mergeregion, merging=merging, doUniqs=doUniqs,
+ doMulti=doMulti, doSplices=doSplices, usePeak=usePeak, cachePages=cachePages,
+ logfilename=logfilename, doRPKM=doRPKM, doLength=doLength,
+ forceRegion=forceRegion)
+
+ return parser
def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
labeltoRegionDict = {}
regionCount = {}
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
readlen = hitRDS.getReadSize()
if cachePages > hitRDS.getDefaultCacheSize():
hitRDS.setDBcache(cachePages)
for rchrom in regionDict:
if forceRegion and rchrom not in chromList:
print rchrom
- for (label, start, stop, length) in regionDict[rchrom]:
- regionCount[label] = 0
- labelList.append(label)
- labeltoRegionDict[label] = (rchrom, start, stop)
+ for region in regionDict[rchrom]:
+ regionCount[region.label] = 0
+ labelList.append(region.label)
+ labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
for rchrom in chromList:
regionList = []
rindex = 0
dictLen = len(readDict[fullchrom])
- for (label, start, stop, length) in regionDict[rchrom]:
+ for region in regionDict[rchrom]:
+ label = region.label
+ start = region.start
+ stop = region.stop
regionCount[label] = 0
labelList.append(label)
labeltoRegionDict[label] = (rchrom, start, stop)
-
- if useFullchrom:
- fullchrom = rchrom
- else:
- fullchrom = "chr%s" % rchrom
-
- for (label, rstart, rstop, length) in regionDict[rchrom]:
- regionList.append((label, fullchrom, rstart, rstop))
+ regionList.append((label, fullchrom, start, stop))
if usePeak:
readList = []
for localIndex in xrange(rindex, dictLen):
read = readDict[fullchrom][localIndex]
- if read[0] < rstart:
+ if read["start"] < start:
rindex += 1
- elif rstart <= read[0] <= rstop:
+ elif start <= read["start"] <= stop:
readList.append(read)
else:
break
continue
readList.sort()
- (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True)
+ peak = findPeak(readList, start, stop - start, readlen, doWeight=True)
try:
- topValue = smoothArray[topPos[0]]
+ topValue = peak.smoothArray[peak.topPos[0]]
except:
- print "problem with %s %s" % (str(topPos), str(smoothArray))
+ print "problem with %s %s" % (str(peak.topPos), str(peak.smoothArray))
continue
regionCount[label] += topValue
else:
- regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+ regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
if flagRDS:
hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
except:
pass
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, findPeak
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
-print "%prog: version 3.0"
+print "regionintersects: version 3.1"
def main(argv=None):
if not argv:
usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--reject1", dest="rejectOneName")
- parser.add_option("--reject2", dest="rejectTwoName")
- parser.add_option("--union", action="store_true", dest="trackReject")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--raw", action="store_false", dest="normalize")
- parser.add_option("--verbose", action="store_true", dest="doVerbose")
- parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False,
- doCache=False, normalize=True, doVerbose=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 5:
options.doVerbose)
+def getParser(usage):
+    """Build the optparse parser for the regionintersects script.
+
+    Defaults are looked up in the "regionintersects" section of the parser
+    returned by getConfigParser(); the literal passed to each getConfig*
+    helper is presumably what is used when the setting is missing.
+    """
+    optionParser = optparse.OptionParser(usage=usage)
+
+    # Options that take a file name argument.
+    for flag, destination in (("--reject1", "rejectOneName"), ("--reject2", "rejectTwoName")):
+        optionParser.add_option(flag, dest=destination)
+
+    # Switches; --raw turns normalization off, the others turn a feature on.
+    optionParser.add_option("--union", action="store_true", dest="trackReject")
+    optionParser.add_option("--cache", action="store_true", dest="doCache")
+    optionParser.add_option("--raw", action="store_false", dest="normalize")
+    optionParser.add_option("--verbose", action="store_true", dest="doVerbose")
+
+    config = getConfigParser()
+    configSection = "regionintersects"
+    defaults = {
+        "rejectOneName": getConfigOption(config, configSection, "rejectOneName", None),
+        "rejectTwoName": getConfigOption(config, configSection, "rejectTwoName", None),
+        "trackReject": getConfigBoolOption(config, configSection, "trackReject", False),
+        "doCache": getConfigBoolOption(config, configSection, "doCache", False),
+        "normalize": getConfigBoolOption(config, configSection, "normalize", True),
+        "doVerbose": getConfigBoolOption(config, configSection, "doVerbose", False),
+    }
+    optionParser.set_defaults(**defaults)
+
+    return optionParser
+
+
def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
outfilename, rejectOneName=None, rejectTwoName=None,
trackReject=False, doCache=False, normalize=True, doVerbose=False):
oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
- oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache)
- twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache)
+ oneRDS = ReadDataset.ReadDataset(readOneName, verbose=doVerbose, cache=doCache)
+ twoRDS = ReadDataset.ReadDataset(readTwoName, verbose=doVerbose, cache=doCache)
if normalize:
normalize1 = len(oneRDS) / 1000000.
numRegionsOne = 0
numRegionsTwo = 0
+ # Only chromosomes present in BOTH region sets can be intersected: the loop
+ # below indexes oneDict and twoDict with every entry, so a chromosome found
+ # in just one of them would raise KeyError. Order follows oneDict.
+ commonChromosomeList = [rchrom for rchrom in oneDict if rchrom in twoDict]
for rchrom in oneDict:
numRegionsOne += len(oneDict[rchrom])
for rchrom in twoDict:
numRegionsTwo += len(twoDict[rchrom])
outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
- for rchrom in oneDict:
- if rchrom not in twoDict:
- continue
-
- print rchrom
+ for chromosome in commonChromosomeList:
+ print chromosome
rindex = 0
rindex2 = 0
- fullchrom = "chr" + rchrom
+ fullchrom = "chr%s" % chromosome
oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
dictLen1 = len(oneReads[fullchrom])
twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
dictLen2 = len(twoReads[fullchrom])
- chrom = rchrom
- onePeaksDict[chrom] = []
- oneFoundDict[chrom] = []
- for (start, stop, length) in oneDict[chrom]:
+ onePeaksDict[chromosome] = []
+ oneFoundDict[chromosome] = []
+ for region in oneDict[chromosome]:
+ start = region.start
+ stop = region.stop
+ length = region.length
readList = []
for localIndex in xrange(rindex, dictLen1):
read = oneReads[fullchrom][localIndex]
- if read[0] < start:
+ if read["start"] < start:
rindex += 1
- elif start <= read[0] <= stop:
+ elif start <= read["start"] <= stop:
readList.append(read)
else:
break
readList.sort()
- (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True)
- onePeakScore = smoothArray[topPos[0]]
- onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1))
+ peak = findPeak(readList, start, length, doWeight=True)
+ onePeakScore = peak.smoothArray[peak.topPos[0]]
+ onePeaksDict[chromosome].append((peak.topPos[0] + start, length/2, start, stop, peak.numHits/normalize1, onePeakScore/normalize1))
- for (start, stop, length) in twoDict[chrom]:
+ for region in twoDict[chromosome]:
+ start = region.start
+ stop = region.stop
+ length = region.length
readList2 = []
for localIndex in xrange(rindex2, dictLen2):
read = twoReads[fullchrom][localIndex]
- if read[0] < start:
+ if read["start"] < start:
rindex2 += 1
- elif start <= read[0] <= stop:
+ elif start <= read["start"] <= stop:
readList2.append(read)
else:
break
continue
readList2.sort()
- (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True)
+ peak2 = findPeak(readList2, start, length, doWeight=True)
+ numHits = peak2.numHits
numHits /= normalize2
twoIsCommon = False
- twoPeak = topPos[0] + start
+ twoPeak = peak2.topPos[0] + start
twoRadius = length/2
- twoPeakScore = smoothArray[topPos[0]] / normalize2
- for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
+ twoPeakScore = peak2.smoothArray[peak2.topPos[0]] / normalize2
+ for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict:
- oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits))
+ oneFoundDict[chromosome].append((onePeak, oneRadius, ostart, ostop, ohits))
twoIsCommon = True
commonRegions += 1
- outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore)
+ outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chromosome, ostart, ostop, ohits, opeakScore, chromosome, start, stop, numHits, twoPeakScore)
if doVerbose:
print outline
- outfile.write(outline + "\n")
+ print >> outfile, outline
if trackReject and not twoIsCommon:
twoRejectIndex += 1
- outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore)
+ outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chromosome, start, stop, numHits, twoPeakScore)
if doReject:
- rejectTwo.write(outline + "\n")
+ print >> rejectTwo, outline
else:
- outfile.write(outline + "\n")
+ print >> outfile, outline
if doVerbose:
print outline
if trackReject:
- for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
- if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
+ for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
+ if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chromosome]:
oneRejectIndex += 1
- outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore)
+ outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chromosome, ostart, ostop, ohits, opeakScore)
if doReject:
- rejectOne.write(outline + "\n")
+ print >> rejectOne, outline
else:
- outfile.write(outline + "\n")
+ print >> outfile, outline
if doVerbose:
print outline
except:
pass
-import sys, math, optparse
+import sys
+import math
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
-print "%prog: version 3.1"
+print "regiontobed: version 3.2"
def usage():
usage = __doc__
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--color", dest="color")
- parser.add_option("--score", type="int", dest="scoreField")
- parser.add_option("--narrowPeak", action="store_true", dest="doNarrow")
- parser.add_option("--broadPeak", action="store_true", dest="doBroad")
- parser.add_option("--itemRgb", action="store_true", dest="itemRGB")
- parser.add_option("--nolabel", action="store_true", dest="noLabel")
- parser.set_defaults(color="0,0,0", scoreField=None, doNarrow=False,
- doBroad=False, itemRGB=False, noLabel=False)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.itemRGB, options.noLabel)
+def getParser(usage):
+    """Build the optparse parser for the regiontobed script.
+
+    Defaults are looked up in the "regiontobed" section of the parser returned
+    by getConfigParser(); the literal passed to each getConfig* helper is
+    presumably what is used when the setting is missing.
+
+    Raises ValueError if a configured scoreField is not an integer.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--color", dest="color")
+    parser.add_option("--score", type="int", dest="scoreField")
+    parser.add_option("--narrowPeak", action="store_true", dest="doNarrow")
+    parser.add_option("--broadPeak", action="store_true", dest="doBroad")
+    parser.add_option("--itemRgb", action="store_true", dest="itemRGB")
+    parser.add_option("--nolabel", action="store_true", dest="noLabel")
+
+    configParser = getConfigParser()
+    section = "regiontobed"
+    color = getConfigOption(configParser, section, "color", "0,0,0")
+    scoreField = getConfigOption(configParser, section, "scoreField", None)
+    if scoreField is not None:
+        # --score is declared type="int", so a value given on the command line
+        # is an int. Coerce a configured value as well so callers get the same
+        # type whichever source supplied it (assumes getConfigOption hands
+        # back the raw config text -- confirm in commoncode).
+        scoreField = int(scoreField)
+
+    doNarrow = getConfigBoolOption(configParser, section, "doNarrow", False)
+    doBroad = getConfigBoolOption(configParser, section, "doBroad", False)
+    itemRGB = getConfigBoolOption(configParser, section, "itemRGB", False)
+    noLabel = getConfigBoolOption(configParser, section, "noLabel", False)
+
+    parser.set_defaults(color=color, scoreField=scoreField, doNarrow=doNarrow,
+                        doBroad=doBroad, itemRGB=itemRGB, noLabel=noLabel)
+
+    return parser
+
+
def regiontobed(factorlabel, regionFileName, outFileName, color="0,0,0",
scoreField=None, doNarrow=False, doBroad=False, itemRGB=False,
noLabel=False):
-"""
-Based on shell script provided by Ali.
-"""
-
import sys
import optparse
-from Erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
-from Erange.commoncode import countDuplicatesInList
+from erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
+from erange.commoncode import countDuplicatesInList, getConfigParser, getConfigOption
def main(argv=None):
usage = "usage: python %prog dbfile snpsfile genome rpkmfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--goprefix", dest="prefix")
- parser.add_option("--novelsnp", dest="novelsnpoutfilename")
- parser.add_option("--bedfile", dest="bedoutfilename")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--snpDB", action="append", dest="snpDBList",
- help="additional snp db files to check will be searched in order given")
- parser.set_defaults(prefix=None, novelsnpoutfilename=None, bedoutfilename=None, cachePages=None, snpDBList=[])
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
genome = args[2]
rpkmfilename = args[3]
+ rnaEditing(dbfile, hitfile, genome, rpkmfilename, options)
+
+
+def getParser(usage):
+    """Build the optparse parser for the rnaEditing script.
+
+    Defaults are looked up in the "rnaEditing" section of the parser returned
+    by getConfigParser(); the literal passed to getConfigOption is presumably
+    what is used when the setting is missing.  snpDBList always starts empty.
+
+    Raises ValueError if a configured cachePages is not an integer.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--goprefix", dest="prefix")
+    parser.add_option("--novelsnp", dest="novelsnpoutfilename")
+    parser.add_option("--bedfile", dest="bedoutfilename")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--snpDB", action="append", dest="snpDBList",
+                      help="additional snp db files to check will be searched in order given")
+
+    configParser = getConfigParser()
+    section = "rnaEditing"
+    prefix = getConfigOption(configParser, section, "prefix", None)
+    novelsnpoutfilename = getConfigOption(configParser, section, "novelsnpoutfilename", None)
+    bedoutfilename = getConfigOption(configParser, section, "bedoutfilename", None)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    if cachePages is not None:
+        # --cache is declared type="int"; coerce a configured value so the
+        # page count is an int whichever source supplied it (assumes
+        # getConfigOption hands back the raw config text -- confirm in
+        # commoncode).
+        cachePages = int(cachePages)
+
+    parser.set_defaults(prefix=prefix, novelsnpoutfilename=novelsnpoutfilename, bedoutfilename=bedoutfilename,
+                        cachePages=cachePages, snpDBList=[])
+
+    return parser
+
+
+def rnaEditing(dbfile, hitfile, genome, rpkmfilename, options):
if options.cachePages is not None:
doCache = True
else:
except:
pass
-import sys, time, optparse
-from commoncode import readDataset
-from cistematic.core.geneinfo import geneinfoDB
-from cistematic.genomes import Genome
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+
def main(argv=None):
if not argv:
argv = sys.argv
- print "%prog: version 3.6"
+ print "rnafarPairs: version 3.7"
usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--verbose", action="store_true", dest="doVerbose",
- help="verbose output")
- parser.add_option("--cache", action="store_true", dest="doCache",
- help="use cache")
- parser.add_option("--maxDist", type="int", dest="maxDist",
- help="maximum distance")
- parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 4:
outfilename = args[3]
rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
-
+
+
+def makeParser(usage=""):
+    """Build the optparse parser for the rnafarPairs script.
+
+    Defaults are looked up in the "rnafarPairs" section of the parser returned
+    by getConfigParser(); the literal passed to each getConfig* helper is
+    presumably what is used when the setting is missing.
+    """
+    optionParser = optparse.OptionParser(usage=usage)
+    optionParser.add_option("--verbose", action="store_true", dest="doVerbose", help="verbose output")
+    optionParser.add_option("--cache", action="store_true", dest="doCache", help="use cache")
+    optionParser.add_option("--maxDist", type="int", dest="maxDist", help="maximum distance")
+
+    config = getConfigParser()
+    configSection = "rnafarPairs"
+    defaults = {
+        "doVerbose": getConfigBoolOption(config, configSection, "doVerbose", False),
+        "doCache": getConfigBoolOption(config, configSection, "doCache", False),
+        "maxDist": getConfigIntOption(config, configSection, "maxDist", 500000),
+    }
+    optionParser.set_defaults(**defaults)
+
+    return optionParser
+
def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
goodDict = {}
fields = line.split()
goodDict[fields[0]] = line
- RDS = readDataset(rdsfile, verbose = True, cache=doCache)
- rdsChromList = RDS.getChromosomes()
-
+ goodfile.close()
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
+ chromosomeList = RDS.getChromosomes()
if doVerbose:
print time.ctime()
distinct = 0
total = 0
outfile = open(outfilename,"w")
-
- idb = geneinfoDB()
- if genome == "dmelanogaster":
- geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
- else:
- geneinfoDict = idb.getallGeneInfo(genome)
-
- hg = Genome(genome)
- geneannotDict = hg.allAnnotInfo()
-
+ geneinfoDict = getGeneInfoDict(genome)
+ geneannotDict = getGeneAnnotDict(genome)
assigned = {}
farConnected = {}
- for achrom in rdsChromList:
- if achrom == "chrM":
+ for chromosome in chromosomeList:
+ if doNotProcessChromosome(chromosome):
continue
- print achrom
- uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+ print chromosome
+ uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
if doVerbose:
print len(uniqDict), time.ctime()
readList = uniqDict[readID]
if len(readList) == 2:
total += 1
- (start1, flag1, pair1) = readList[0]
- (start2, flag2, pair2) = readList[1]
-
- if flag1 != flag2:
- dist = abs(start1 - start2)
- if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
- geneID = ""
- saw1 = False
- saw2 = False
- if flag1 in goodDict:
- geneID = flag2
- farFlag = flag1
- saw1 = True
-
- if flag2 in goodDict:
- geneID = flag1
- farFlag = flag2
- saw2 = True
-
- if saw1 or saw2:
- total += 1
-
- if saw1 and saw2:
- if flag1 < flag2:
- geneID = flag1
- farFlag = flag2
- else:
- geneID = flag2
- farFlag = flag1
-
- if geneID in farConnected:
- farConnected[geneID].append(farFlag)
- else:
- farConnected[geneID] = [farFlag]
- elif geneID != "":
- try:
- if genome == "dmelanogaster":
- symbol = geneinfoDict["Dmel_" + geneID][0][0]
- else:
- symbol = geneinfoDict[geneID][0][0]
- except:
- try:
- symbol = geneannotDict[(genome, geneID)][0]
- except:
- symbol = "LOC" + geneID
-
- symbol = symbol.strip()
- symbol = symbol.replace(" ","|")
- symbol = symbol.replace("\t","|")
- if farFlag not in assigned:
- assigned[farFlag] = (symbol, geneID)
- print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
- outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
- distinct += 1
-
- farIndex = 0
+ if processReads(readList[:2], maxDist):
+ flags = (readList[0]["flag"], readList[1]["flag"])
+ processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+ total += processed
+ distinct += distinctPairs
+
+ entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
+ distinct += entriesWritten
+ outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
+ outfile.close()
+ print "distinct: %d\ttotal: %d" % (distinct, total)
+ print time.ctime()
+
+
+def doNotProcessChromosome(chromosome):
+    """Return True when the named chromosome is "chrM", False otherwise."""
+    skippedChromosomes = ("chrM",)
+
+    return chromosome in skippedChromosomes
+
+
+def processReads(reads, maxDist):
+    """Decide whether the first two reads form a pair worth reporting.
+
+    reads holds at least two mappings, each with "start" and "flag" keys.
+    Returns True only when the two flags differ, neither flag is "NM", and
+    the absolute distance between the two starts is below maxDist.
+    """
+    firstRead = reads[0]
+    secondRead = reads[1]
+    distance = abs(firstRead["start"] - secondRead["start"])
+    firstFlag = firstRead["flag"]
+    secondFlag = secondRead["flag"]
+
+    if firstFlag == secondFlag:
+        return False
+
+    if firstFlag == "NM" or secondFlag == "NM":
+        return False
+
+    return distance < maxDist
+
+
+def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+    """Record one read pair given the flags of its two reads.
+
+    flags is the (flag1, flag2) tuple.  A flag that is a key of goodDict is
+    treated as "good":
+
+    - both good: the pair is stored in farConnected, keyed by the smaller
+      flag, with the larger flag appended to that key's list;
+    - exactly one good: the other flag is taken as the gene ID.  Its symbol
+      is looked up in geneInfoDict, then geneAnnotDict, falling back to
+      "LOC<geneID>".  If the good flag is not yet in assigned it is recorded
+      there and its goodDict line is printed and written to outfile;
+    - neither good: nothing happens.
+
+    Returns (total, distinct): total is 1 when at least one read is good,
+    else 0; distinct is 1 when a line was written to outfile, else 0.
+    """
+    flag1, flag2 = flags
+    read1IsGood = flag1 in goodDict
+    read2IsGood = flag2 in goodDict
+    if not (read1IsGood or read2IsGood):
+        return 0, 0
+
+    # Every pair touching at least one good region counts toward the total,
+    # including pairs that join two good regions.
+    total = 1
+    distinct = 0
+
+    if read1IsGood and read2IsGood:
+        if flag1 < flag2:
+            geneID, farFlag = flag1, flag2
+        else:
+            geneID, farFlag = flag2, flag1
+
+        farConnected.setdefault(geneID, []).append(farFlag)
+        return total, distinct
+
+    if read2IsGood:
+        farFlag, geneID = flag2, flag1
+    else:
+        farFlag, geneID = flag1, flag2
+
+    if geneID == "":
+        # The partner read carries no flag, so there is no gene to attach the
+        # good region to; do not invent a "LOC" entry for it.
+        return total, distinct
+
+    try:
+        if genome == "dmelanogaster":
+            symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+        else:
+            symbol = geneInfoDict[geneID][0][0]
+    except (KeyError, IndexError):
+        try:
+            symbol = geneAnnotDict[(genome, geneID)][0]
+        except (KeyError, IndexError):
+            symbol = "LOC%s" % geneID
+
+    # Output columns are whitespace-separated, so keep the symbol one token.
+    symbol = symbol.strip()
+    symbol = symbol.replace(" ","|")
+    symbol = symbol.replace("\t","|")
+
+    if farFlag not in assigned:
+        assigned[farFlag] = (symbol, geneID)
+        print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
+        outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
+        distinct += 1
+
+    return total, distinct
+
+
+def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
+    """Write the connected pairs, then the remaining good reads, to outfile.
+
+    The first count returned by writeUnassignedPairsToFile is handed to
+    writeUnassignedGoodReadsToFile as its starting index; the second count
+    (entries written for the pairs) is what this function returns.
+    """
+    pairResults = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
+    farIndex, entriesWritten = pairResults
+    writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile)
+
+    return entriesWritten
+
+
+def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
+ # Walk every key of farConnected together with the IDs connected to it and
+ # write the goodDict line of each ID that is not yet in assigned.
+ # Returns (total, written): total counts the "FAR%d" groups created here,
+ # written counts the lines sent to outfile.
+ total = 0
+ written = 0
for farFlag in farConnected:
geneID = ""
symbol = ""
idList = [farFlag] + farConnected[farFlag]
- for oneID in idList:
- if oneID in assigned:
- (symbol, geneID) = assigned[oneID]
+ # Reuse the (symbol, geneID) of any ID in the group that is already assigned.
+ for ID in idList:
+ if ID in assigned:
+ (symbol, geneID) = assigned[ID]
if geneID == "":
- farIndex += 1
- symbol = "FAR%d" % farIndex
- geneID = -1 * farIndex
+ # No member had a gene: create a new FAR group with a negative ID.
+ total += 1
+ symbol = "FAR%d" % total
+ geneID = -1 * total
- for oneID in idList:
- if oneID not in assigned:
- print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
- outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
- distinct += 1
- assigned[oneID] = (symbol, geneID)
+ for ID in idList:
+ if ID not in assigned:
+ print "%s %s %s" % (symbol, geneID, goodDict[ID].strip())
+ outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID]))
+ written += 1
+ assigned[ID] = (symbol, geneID)
+ return total, written
+
+
+def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile):
for farFlag in goodDict:
if farFlag not in assigned:
farIndex += 1
print line.strip()
outfile.write(line)
- outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
- outfile.close()
- print "distinct: %d\ttotal: %d" % (distinct, total)
- print time.ctime()
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file
import math, cmath
import sys
import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
alphaVal = 0.5
-print "%prog: version 3.1"
+print "scatterfields: version 3.2"
def main(argv=None):
if not argv:
usage = __doc__
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--xmin", type="float", dest="forcexmin")
- parser.add_option("--ymin", type="float", dest="forceymin")
- parser.add_option("--xmax", type="float", dest="forcexmax")
- parser.add_option("--ymax", type="float", dest="forceymax")
- parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
- parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
- parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
- parser.add_option("--order", type="int", dest="fitOrder")
- parser.add_option("--base", type="int", dest="base")
- parser.add_option("--markGenes", dest="markFile")
- parser.add_option("--markfold", type="float", dest="foldChange")
- parser.add_option("--noregression", action="store_false", dest="doRegression")
- parser.add_option("--large", action="store_true", dest="plotLarge")
- parser.add_option("--markdiag", action="store_true", dest="markDiag")
- parser.add_option("--title", type="int", dest="figtitle")
- parser.add_option("--verbose", action="store_true", dest="verbose")
- parser.set_defaults(forcexmin=0.0, forceymin=0.0, forcexmax=-1, forceymax=-1, doLogF1=False,
- doLogF2=False, doArcsinh=False, fitOrder=1, base=10, markFile=None,
- foldChange=None, doRegression=True, plotLarge=False, markDiag=False,
- figtitle="", verbose=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 6:
options.markDiag, options.figtitle, options.verbose)
+def getParser(usage):
+    """Build the optparse parser for the scatterfields script.
+
+    Defaults are looked up in the "scatterfields" section of the parser
+    returned by getConfigParser(); the literal passed to each getConfig*
+    helper is presumably what is used when the setting is missing.  Each
+    default is read with the getter matching its option's declared type, so
+    a value has the same type whether it came from the command line or the
+    configuration.
+
+    Raises ValueError if a configured foldChange is not a number.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--xmin", type="float", dest="forcexmin")
+    parser.add_option("--ymin", type="float", dest="forceymin")
+    parser.add_option("--xmax", type="float", dest="forcexmax")
+    parser.add_option("--ymax", type="float", dest="forceymax")
+    parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
+    parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
+    parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
+    parser.add_option("--order", type="int", dest="fitOrder")
+    parser.add_option("--base", type="int", dest="base")
+    parser.add_option("--markGenes", dest="markFile")
+    parser.add_option("--markfold", type="float", dest="foldChange")
+    parser.add_option("--noregression", action="store_false", dest="doRegression")
+    parser.add_option("--large", action="store_true", dest="plotLarge")
+    parser.add_option("--markdiag", action="store_true", dest="markDiag")
+    # The figure title is free text (its default is ""); declaring it
+    # type="int" made optparse reject any non-numeric title.
+    parser.add_option("--title", dest="figtitle")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+
+    configParser = getConfigParser()
+    section = "scatterfields"
+    forcexmin = getConfigFloatOption(configParser, section, "forcexmin", 0.0)
+    forceymin = getConfigFloatOption(configParser, section, "forceymin", 0.0)
+    # --xmax / --ymax are float options, so read them as floats; the int
+    # getter would not accept a configured value such as 2.5.
+    forcexmax = getConfigFloatOption(configParser, section, "forcexmax", -1)
+    forceymax = getConfigFloatOption(configParser, section, "forceymax", -1)
+    doLogF1 = getConfigBoolOption(configParser, section, "doLogF1", False)
+    doLogF2 = getConfigBoolOption(configParser, section, "doLogF2", False)
+    doArcsinh = getConfigBoolOption(configParser, section, "doArcsinh", False)
+    fitOrder = getConfigIntOption(configParser, section, "fitOrder", 1)
+    base = getConfigIntOption(configParser, section, "base", 10)
+    markFile = getConfigOption(configParser, section, "markFile", None)
+    foldChange = getConfigOption(configParser, section, "foldChange", None)
+    if foldChange is not None:
+        # --markfold is a float option; coerce a configured value to match
+        # (assumes getConfigOption hands back the raw config text -- confirm
+        # in commoncode).
+        foldChange = float(foldChange)
+
+    doRegression = getConfigBoolOption(configParser, section, "doRegression", True)
+    plotLarge = getConfigBoolOption(configParser, section, "plotLarge", False)
+    markDiag = getConfigBoolOption(configParser, section, "markDiag", False)
+    figtitle = getConfigOption(configParser, section, "figtitle", "")
+    verbose = getConfigBoolOption(configParser, section, "verbose", False)
+
+    parser.set_defaults(forcexmin=forcexmin, forceymin=forceymin, forcexmax=forcexmax, forceymax=forceymax, doLogF1=doLogF1,
+                        doLogF2=doLogF2, doArcsinh=doArcsinh, fitOrder=fitOrder, base=base, markFile=markFile,
+                        foldChange=foldChange, doRegression=doRegression, plotLarge=plotLarge, markDiag=markDiag,
+                        figtitle=figtitle, verbose=verbose)
+
+    return parser
+
+
def scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, forcexmin=0.0, forceymin=0.0,
forcexmax=-1, forceymax=-1, doLogF1=False, doLogF2=False, doArcsinh=False, fitOrder=1,
base=10, markFile=None, foldChange=None, doRegression=True, plotLarge=False,
import sys
-print "%s: version 2.0" % sys.argv[0]
+print "siteintersects: version 2.1"
def main(argv=None):
import sys
import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigFloatOption
-print "%prog: version 1.1"
+print "stallCategory: version 1.2"
def main(argv=None):
usage = "usage: python %prog stalledPercentFile1 stalledPercentFile2 transcriptFile [--out oufile] [--statout statoutfile] [--expression level]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--out", dest="outFileName")
- parser.add_option("--statout", dest="statOutFileName")
- parser.add_option("--expression", type="float", dest="expressionLevel")
- parser.set_defaults(outFileName=None, statOutFileName=None, expressionLevel=0.9)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
stallCategory(infile1, infile2, transcriptFile, options.outFileName, options.statOutFileName, options.expressionLevel)
+def makeParser(usage=""):
+    """Build the command-line option parser for stallCategory.
+
+    Registers --out, --statout and --expression, then seeds their defaults
+    from the "stallCategory" section of the parser returned by
+    getConfigParser(). The fallbacks handed to the config helpers are
+    None, None and 0.9 respectively.
+    """
+    optionParser = optparse.OptionParser(usage=usage)
+    optionParser.add_option("--out", dest="outFileName")
+    optionParser.add_option("--statout", dest="statOutFileName")
+    optionParser.add_option("--expression", type="float", dest="expressionLevel")
+
+    config = getConfigParser()
+    sectionName = "stallCategory"
+    defaults = {}
+    for key in ("outFileName", "statOutFileName"):
+        defaults[key] = getConfigOption(config, sectionName, key, None)
+
+    defaults["expressionLevel"] = getConfigFloatOption(config, sectionName, "expressionLevel", 0.9)
+    optionParser.set_defaults(**defaults)
+
+    return optionParser
+
+
def stallCategory(inFile1Name, inFile2Name, transcriptFileName, outFileName=None, statOutFileName=None, expressionLevel=0.9):
infile1 = open(inFile1Name)
'''
import unittest
import os
-from Erange import analyzego
+from erange import analyzego
class TestAnalyzeGO(unittest.TestCase):
import unittest
import string
import os
-from Erange import chksnp
+from erange import chksnp
dbPath = "/Users/sau/work/snpdb/hg18"
import os
import string
from array import array
-from Erange import commoncode
+from erange import commoncode
from cistematic.genomes import Genome
result = ([], 0.0, array("f"), 0.0)
self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
- hitList= [[4, "+", 0.5]]
+ hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
- hitList= [[4, "+", 0.5]]
+ hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
- hitList= [[4, "+", 0.5]]
+ hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
#TODO: write test
def testGetBestShiftForRegion(self):
- hitList = [[14, "-", 1.0], [16, "-", 1.0], [24, "+", 1.0], [26, "+", 10.0]]
+ hitList = [{"start": 14, "sense": "-", "weight": 1.0},
+ {"start": 16, "sense": "-", "weight": 1.0},
+ {"start": 24, "sense": "+", "weight": 1.0},
+ {"start": 26, "sense": "+", "weight": 10.0}
+ ]
self.assertEquals(74, commoncode.getBestShiftForRegion(hitList, 0, 100))
self.assertEquals(16, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=30))
self.assertEquals(0, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=10))
+ #TODO: write test
+ def testFindPeakSequenceArray(self):
+ pass
+
+
+ #TODO: write test
+ def testGetPeakPositionList(self):
+ pass
+
+
#TODO: write test
def testGetFeaturesByChromDict(self):
firstFeatures = {"I": (4123, 4219, "Y74C9A.3", "R", "3UTR"),
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[1, "+", 1.0]]}
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
"2": [("regionID2", 1, 1000, 1000, "F")]
}
- hitDict = {"1": [[1, "+", 1.0]],
- "2": [[1, "+", 1.0]]
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+ "2": [{"start": 1, "sense": "+", "weight": 1.0}]
}
result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 80, "sense": "+", "weight": 0.5}]}
result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [15, "+", 1.0]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 80, "sense": "+", "weight": 0.5}, {"start": 15, "sense": "+", "weight": 1.0}]}
result = ({"regionID": [2.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [200, "+", 2.0]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 80, "sense": "+", "weight": 0.5}, {"start": 200, "sense": "+", "weight": 2.0}]}
result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[1, "+", 1.0]]}
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
regionList = ["regionID"]
result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[1, "+", 1.0]]}
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
regionList = ["empty region"]
result = ({"empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
"2": [("regionID2", 1, 1000, 1000, "F")]
}
- hitDict = {"1": [[1, "+", 1.0]],
- "2": [[1, "+", 1.0]]
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+ "2": [{"start": 1, "sense": "+", "weight": 1.0}]
}
regionList = ["regionID", "regionID2"]
result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
"2": [("regionID2", 1, 1000, 1000, "F")]
}
- hitDict = {"1": [[1, "+", 1.0]],
- "2": [[1, "+", 1.0]]
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+ "2": [{"start": 1, "sense": "+", "weight": 1.0}]
}
regionList = ["empty region", "regionID2"]
result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
"2": [("regionID2", 1, 1000, 1000, "F")]
}
- hitDict = {"1": [[1, "+", 1.0]],
- "2": [[1, "+", 1.0]]
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+ "2": [{"start": 1, "sense": "+", "weight": 1.0}]
}
regionList = ["regionID2"]
result = ({"regionID2": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[1, "+", 1.0]]}
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
result = ({"regionID": [2.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, normalizedTag=2.0))
regionsByChromDict = {"1": [(1, 100, "regionID", 100, "F")]}
- hitDict = {"1": [[1, "+", 1.0]]}
+ hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, defaultRegionFormat=False))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}]}
fixedFirstBin = 20
result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}]}
fixedFirstBin = 5
result = ({"regionID": [0.0, 1.0, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
fixedFirstBin = 20
result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[80, "+", 1.0], [85, "+", 0.5]]}
+ hitDict = {"1": [{"start": 80, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
fixedFirstBin = 5
result = ({"regionID": [0.0, 1.5, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
binLength = 25
result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
binLength = 50
result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
binLength = 15
result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
- hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 40, "sense": "+", "weight": 0.7}, {"start": 85, "sense": "+", "weight": 0.5}]}
binLength = 15
result = ({"regionID": [1.0, 0.0, 0.7, 0.5]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
regionsByChromDict = {"1": [("regionID", 1, 100, 100, "R")]}
- hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+ hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 40, "sense": "+", "weight": 0.7}, {"start": 85, "sense": "+", "weight": 0.5}]}
result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
import testChksnp
import testCommoncode
import testGeneMrnaCounts
+import testGeneMrnaCountsWeighted
#import testGetFasta
import testGetNovelSNPs
import testGetSNPGeneInfo
import testMakeRdsFromBam
import testMakeSNPTrack
import testMarkLinkers
+import testPeak
import testPeaksToRegion
import testProcessVelvet
import testReadDataset
import testRnaAToIFilter
import testRnaEditing
import testRNAPATH
-import testTranscripts
+#import testTranscripts
def main(argv=None):
suite.addTest(testChksnp.suite())
suite.addTest(testCommoncode.suite())
suite.addTest(testGeneMrnaCounts.suite())
+ suite.addTest(testGeneMrnaCountsWeighted.suite())
#suite.addTest(testGetFasta.suite())
suite.addTest(testGetNovelSNPs.suite())
suite.addTest(testGetSNPGeneInfo.suite())
suite.addTest(testMakeRdsFromBam.suite())
suite.addTest(testMakeSNPTrack.suite())
suite.addTest(testMarkLinkers.suite())
+ suite.addTest(testPeak.suite())
suite.addTest(testPeaksToRegion.suite())
suite.addTest(testProcessVelvet.suite())
suite.addTest(testReadDataset.suite())
'''
import unittest
import os
-from Erange import geneMrnaCounts
+from erange import geneMrnaCounts
from cistematic.core.geneinfo import geneinfoDB
from cistematic.genomes import Genome
-from Erange.commoncode import readDataset
+from erange import ReadDataset
class TestGeneMrnaCounts(unittest.TestCase):
outfilename = "testGeneMrnaCounts.txt"
def setUp(self):
- self.rds = readDataset(self.testDBName, initialize=True, datasetType="RNA", verbose=False)
+ self.rds = ReadDataset.ReadDataset(self.testDBName, initialize=True, datasetType="RNA", verbose=False)
def tearDown(self):
outfile.close()
os.remove(self.outfilename)
- reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+ reads = self.rds.getReadsDict(withFlag=True)
self.assertEquals("728439", reads["1"][0]["flag"])
geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename,
outfile.close()
os.remove(self.outfilename)
- reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+ reads = self.rds.getReadsDict(withFlag=True)
self.assertEquals("728439", reads["1"][0]["flag"])
--- /dev/null
+'''
+Created on Oct 20, 2010
+
+@author: sau
+'''
+import unittest
+from erange import geneMrnaCountsWeighted
+
+
+class TestGeneMrnaCountsWeighted(unittest.TestCase):
+    """Unit tests for the helper functions of erange.geneMrnaCountsWeighted.
+
+    Uses assertEqual rather than the deprecated assertEquals alias.
+    """
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    #TODO: write test
+    def testMain(self):
+        pass
+
+
+    #TODO: write test
+    def testGeneMrnaCountsWeighted(self):
+        pass
+
+
+    def testDoNotProcessChromosome(self):
+        """A chromosome is skipped unless it appears in the chromosome list."""
+        # an empty list is expected to skip the chromosome as well
+        chromosomeList = []
+        self.assertTrue(geneMrnaCountsWeighted.doNotProcessChromosome("chr1", chromosomeList))
+
+        chromosomeList = ["chr1"]
+        self.assertFalse(geneMrnaCountsWeighted.doNotProcessChromosome("chr1", chromosomeList))
+
+        chromosomeList = ["chr2"]
+        self.assertTrue(geneMrnaCountsWeighted.doNotProcessChromosome("chr1", chromosomeList))
+
+
+    #TODO: write test
+    def testGetReadGIDs(self):
+        pass
+
+
+    def testGetGeneSymbolEmptyDicts(self):
+        """With no lookup data, "FARGene" comes back unchanged and "Gene" becomes "LOCGene"."""
+        geneinfoDict = {}
+        geneannotDict = {}
+        genome = "hsapien"
+        self.assertEqual("FARGene", geneMrnaCountsWeighted.getGeneSymbol("FARGene", genome, geneinfoDict, geneannotDict))
+        self.assertEqual("LOCGene", geneMrnaCountsWeighted.getGeneSymbol("Gene", genome, geneinfoDict, geneannotDict))
+
+
+    def testGetGeneSymbolFromInfoDict(self):
+        """The symbol picked from geneinfoDict differs between "hsapien" and "celegans"."""
+        geneinfoDict = {"Gene": [("GeneName", "AltGeneName")]}
+        geneannotDict = {}
+        self.assertEqual("GeneName", geneMrnaCountsWeighted.getGeneSymbol("Gene", "hsapien", geneinfoDict, geneannotDict))
+        self.assertEqual("AltGeneName", geneMrnaCountsWeighted.getGeneSymbol("Gene", "celegans", geneinfoDict, geneannotDict))
+
+
+    def testGetGeneSymbolFromAnnotDict(self):
+        """geneannotDict is keyed by (genome, gene); a genome without an entry gives "LOCGene"."""
+        geneinfoDict = {}
+        geneannotDict = {("hsapien", "Gene"): ["GeneName", "AltGeneName"]}
+        self.assertEqual("GeneName", geneMrnaCountsWeighted.getGeneSymbol("Gene", "hsapien", geneinfoDict, geneannotDict))
+        self.assertEqual("LOCGene", geneMrnaCountsWeighted.getGeneSymbol("Gene", "celegans", geneinfoDict, geneannotDict))
+
+
+    #TODO: write test
+    def testWriteCountsToFile(self):
+        pass
+
+
+    def testGetTagCount(self):
+        """Check getTagCount with empty inputs and with one read shared by two related genes."""
+        self.assertEqual(0.0, geneMrnaCountsWeighted.getTagCount({}, "gene", {"gene": ""}, {}))
+
+        uniqueCountDict = {"gene": 1,
+                           "related1": 1,
+                           "related2": 1
+        }
+        gidReadDict = {"gene": ["read1"]}
+        read2GidDict = {"read1": ["related1", "related2"]}
+        self.assertEqual(0.5, geneMrnaCountsWeighted.getTagCount(uniqueCountDict, "gene", gidReadDict, read2GidDict))
+        self.assertEqual(0.5, geneMrnaCountsWeighted.getTagCount({}, "gene", gidReadDict, read2GidDict))
+
+        uniqueCountDict["gene"] = 2
+        self.assertEqual(1.0, geneMrnaCountsWeighted.getTagCount(uniqueCountDict, "gene", gidReadDict, read2GidDict))
+
+
+def suite():
+    """Return a TestSuite holding every test of TestGeneMrnaCountsWeighted."""
+    collected = unittest.TestSuite()
+    classTests = unittest.makeSuite(TestGeneMrnaCountsWeighted)
+    collected.addTest(classTests)
+
+    return collected
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
'''
import unittest
import os
-from Erange import getfasta
-#from Erange import ReadDataset
-from Erange.commoncode import readDataset
+from erange import getfasta
+#from erange import ReadDataset
+from erange import ReadDataset
testDBName = "testRDS.rds"
# need to check to see if the issue might be with commoncode.findPeak as there is a lot of questionable
# logic in that one
def testGetRegionUsingRDS(self):
- rds = readDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
+ rds = ReadDataset.ReadDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
rds.insertMetadata([("readsize", "100")])
rdsEntryList = [("testRead", "chr1", 10, 100, "+", 1.0, "", "")]
rds.insertUniqs(rdsEntryList)
@author: sau
'''
import unittest
-from Erange import getSNPGeneInfo
+from erange import getSNPGeneInfo
class TestGetSNPGeneInfo(unittest.TestCase):
@author: sau
'''
import os, unittest
-from Erange.commoncode import readDataset
-from Erange import getSNPs
+from erange import ReadDataset
+from erange import getSNPs
class TestGetSNPs(unittest.TestCase):
def setUp(self):
- self.rdsDNA = readDataset("testDNARDSForUnitTests.rds", True, "DNA", verbose=True)
+ self.rdsDNA = ReadDataset.ReadDataset("testDNARDSForUnitTests.rds", True, "DNA", verbose=True)
uniqueInsertList = [("uniqueID1", "chr1", 10, 20, "+", 1.0, "", ""),
("uniqueID2", "chr1", 100, 200, "+", 1.0, "", ""),
self.rdsDNA.insertUniqs(uniqueInsertList)
self.rdsDNA.insertMulti(multiInsertList)
+ self.rdsDNA.insertMetadata([("readsize", 100)])
def tearDown(self):
@author: sau
'''
import unittest
-from Erange import MakeBamFromRds
+from erange import MakeBamFromRds
class TestMakeBamFromRds(unittest.TestCase):
pass
+ #TODO: finish test
+    def testFixSpliceSense(self):
+        """fixSpliceSense on "NNNGTAGNNN" with startRight=7 and stopLeft=3 should give ("+", 0)."""
+        seq = "NNNGTAGNNN"
+        startRight = 7
+        stopLeft = 3
+        sense, count = MakeBamFromRds.fixSpliceSense(seq, startRight, stopLeft)
+        # assertEqual matches the neighbouring tests; assertEquals is a deprecated alias
+        self.assertEqual("+", sense)
+        self.assertEqual(0, count)
+
+
+ #TODO: finish test
def testGetMismatches(self):
mismatchString = "3A10T"
self.assertEqual(mismatchString, MakeBamFromRds.getMismatches("A3G, T10A"))
'''
import os, unittest
-from Erange import makeGraphs
+from erange import makeGraphs
testFileName = "/tmp/testEdgeFileForUnitTests.txt"
@author: sau
'''
import unittest
-from Erange import MakeRdsFromBam
+from erange import MakeRdsFromBam
-class TestMakeRdsFromBam(unittest.TestCase):
-
-
- def testGetSpliceBounds(self):
- start, startR, stopL, stopR = MakeRdsFromBam.getSpliceBounds(0, 10, [(1,2), (3,6), (1,2)])
- self.assertEqual(start, 0, "incorrect start position for 262")
- self.assertEqual(startR, 8, "incorrect right start position for 262")
- self.assertEqual(stopL, 2, "incorrect left stop position for 262")
- self.assertEqual(stopR, 10, "incorrect right stop position for 262")
+class TestMakeRdsFromBam(unittest.TestCase):
def testGetMismatches(self):
self.assertEquals(resultString, MakeRdsFromBam.getMismatches("badMismatchTagData", querySequence, "+"))
+ #TODO: write test
+ def testGetPairedReadNumberSuffix(self):
+ pass
+
+
+ #TODO: write test
+ def testGetParser(self):
+ pass
+
+
+ #TODO: write test
+ def testGetRDSEntry(self):
+ pass
+
+
+ #TODO: write test
+ def testGetRDSSpliceEntry(self):
+ pass
+
+
+    def testGetReadSense(self):
+        """getReadSense maps reverse=False to "+" and reverse=True to "-"."""
+        for reverse, expectedSense in ((False, "+"), (True, "-")):
+            self.assertEqual(expectedSense, MakeRdsFromBam.getReadSense(reverse))
+
+
+    def testGetSpliceBounds(self):
+        """Check the four bounds getSpliceBounds returns for the entry [(1,2), (3,6), (1,2)]."""
+        bounds = MakeRdsFromBam.getSpliceBounds(0, 10, [(1,2), (3,6), (1,2)])
+        start, startR, stopL, stopR = bounds
+
+        # same comparisons, order and failure messages as before, driven by a table
+        expectations = ((start, 0, "incorrect start position for 262"),
+                        (startR, 8, "incorrect right start position for 262"),
+                        (stopL, 2, "incorrect left stop position for 262"),
+                        (stopR, 10, "incorrect right stop position for 262"))
+        for actual, expected, message in expectations:
+            self.assertEqual(actual, expected, message)
+
+
+ #TODO: write test
+ def testIsPairedRead(self):
+ pass
+
+
def testIsSpliceEntry(self):
self.assertTrue(MakeRdsFromBam.isSpliceEntry([(1,6), (3, 4), (1, 2)]))
self.assertFalse(MakeRdsFromBam.isSpliceEntry([(1,6), (2, 4), (1, 2)]))
self.assertFalse(MakeRdsFromBam.isSpliceEntry(""))
+ #TODO: write test
+ def testMain(self):
+ pass
+
+
+ #TODO: write test
+ def testMakeRDSFromBAM(self):
+ pass
+
+
def suite():
suite = unittest.TestSuite()
suite.addTest(unittest.makeSuite(TestMakeRdsFromBam))
@author: sau
'''
import unittest
-from Erange import makeSNPtrack
+from erange import makeSNPtrack
class TestMakeSNPTrack(unittest.TestCase):
'''
import unittest
import os
-from Erange.chiapet import markLinkers
+from erange.chiapet import markLinkers
class TestMarkLinkers(unittest.TestCase):
--- /dev/null
+'''
+Created on Oct 29, 2010
+
+@author: sau
+'''
+import unittest
+from array import array
+from erange import Peak
+
+
+class TestPeak(unittest.TestCase):
+    """Unit tests for construction and attribute access of Peak.Peak.
+
+    Uses assertEqual rather than the deprecated assertEquals alias.
+    """
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testInitPeak(self):
+        """topPos and numHits passed to the constructor are readable afterwards."""
+        topPos = 1
+        numHits = 2
+        smoothArray = array("f", [0.] * 10)
+        numPlus = 3
+        peak = Peak.Peak(topPos, numHits, smoothArray, numPlus)
+        self.assertEqual(1, peak.topPos)
+        self.assertEqual(2, peak.numHits)
+
+
+    def testProperties(self):
+        """topPos and numHits can be reassigned on an existing Peak."""
+        topPos = 1
+        numHits = 2
+        smoothArray = array("f", [0.] * 10)
+        numPlus = 3
+        peak = Peak.Peak(topPos, numHits, smoothArray, numPlus)
+        peak.topPos = 10
+        self.assertEqual(10, peak.topPos)
+        peak.numHits = 20
+        self.assertEqual(20, peak.numHits)
+
+
+def suite():
+    """Return a TestSuite holding every test of TestPeak."""
+    collected = unittest.TestSuite()
+    classTests = unittest.makeSuite(TestPeak)
+    collected.addTest(classTests)
+
+    return collected
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
'''
import unittest
import os
-from Erange import peakstoregion
+from erange import peakstoregion
inFileName = "testPeaksToRegionInFile.txt"
outFileName = "testPeaksToRegionOutFile.txt"
'''
import unittest
import os
-from Erange.rnapath import processvelvet
+from erange.rnapath import processvelvet
class TestProcessVelvet(unittest.TestCase):
'''
import unittest
import os
-from Erange.rnapath import RNAPATH
+from erange.rnapath import RNAPATH
compDict = {"A": "T",
"T": "A",
import unittest
import os
import sqlite3 as sqlite
-from Erange import ReadDataset
+from erange import ReadDataset
testDBName = "testRDS.rds"
rnaTestDBName = "testRDSRNA.rds"
--- /dev/null
+import unittest
+from erange import Region
+
+
+class TestRegion(unittest.TestCase):
+    """Unit tests for Region.Region and Region.DirectionalRegion.
+
+    Uses assertEqual rather than the deprecated assertEquals alias.
+    """
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testRegion(self):
+        """The factor given to Region is exposed as region.factor."""
+        factor = "foo"
+        region = Region.Region(factor, "index", "chrom", "start", "stop", "numReads", "foldRatio", "multiP", "peakDescription", "shift")
+        self.assertEqual(factor, region.factor)
+
+
+    def testDirectionalRegion(self):
+        """The factor given to DirectionalRegion is exposed as region.factor."""
+        factor = "foo"
+        region = Region.DirectionalRegion(factor, "index", "chrom", "start", "stop", "numReads", "foldRatio", "multiP", "plusP", "leftP", "peakDescription", "shift")
+        self.assertEqual(factor, region.factor)
+
+
+    # renamed from testPrintDeirectionalRegionWithShift (typo); still found via the "test" prefix
+    def testPrintDirectionalRegionWithShift(self):
+        """printRegionWithShift returns the region fields as one tab-separated line."""
+        factor = "foo"
+        region = Region.DirectionalRegion(factor, 1, "chrom", 10, 100, 1.0, 0.5, 0.4, 0.3, 0.2, "peakDescription", 9)
+        result = "foo1\tchrom\t10\t100\t1.0\t0.5\t0.4\t0.3\t0.2\tpeakDescription\t9"
+        self.assertEqual(result, region.printRegionWithShift())
+
+
+def suite():
+    """Return a TestSuite holding every test of TestRegion."""
+    collected = unittest.TestSuite()
+    classTests = unittest.makeSuite(TestRegion)
+    collected.addTest(classTests)
+
+    return collected
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
\ No newline at end of file
@author: sau
'''
import unittest
-from Erange import rnaAToIFilter
+from erange import rnaAToIFilter
class TestRnaAToIFilter(unittest.TestCase):
@author: sau
'''
import unittest
-from Erange import rnaEditing
+from erange import rnaEditing
class TestRnaEditing(unittest.TestCase):
'''
import unittest
import os
-from Erange import transcripts
+from erange import transcripts
inFileName = "testTranscriptsInFile.txt"
outFileName = "testTranscriptsOutFile.txt"
@author: sau
'''
import unittest
-from Erange import makebedfromrds
+from erange import makebedfromrds
class TestMakeBedFromRds(unittest.TestCase):
where transcriptome size is in Gbp, cell count is in arbitrary units and efficiency is a fraction
"""
-import sys, optparse
+import sys
+import optparse
+from commoncode import getConfigParser, getConfigFloatOption
def main(argv=None):
if not argv:
argv = sys.argv
- print "%prog: version 3.0"
+ print "transcripts: version 3.1"
usage = "usage: python %prog rpkmFile outFile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--transcriptome", type="float", dest="tSize",
- help="transcriptome size in Gbp [default 200000.0]")
- parser.add_option("--cells", type="float", dest="cellCount",
- help="arbitrary units [default 1e6]")
- parser.add_option("--efficiency", type="float", dest="efficiency",
- help="fraction [default 0.3]")
- parser.set_defaults(tSize=200000.0, cellCount=1e6, efficiency=0.3)
+ parser = makeParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
transcripts(infile, outfile, options.tSize, options.cellCount, options.efficiency)
+def makeParser(usage=""):
+    """Build the command-line option parser for transcripts.
+
+    Registers --transcriptome, --cells and --efficiency (all floats) and
+    seeds their defaults from the "transcripts" section of the parser
+    returned by getConfigParser(), passing 200000.0, 1e6 and 0.3 as the
+    fallbacks.
+
+    The help strings use optparse's %default placeholder instead of
+    hard-coded numbers, so --help reports the default actually in effect
+    when the configuration supplies a different value.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--transcriptome", type="float", dest="tSize",
+                      help="transcriptome size in Gbp [default %default]")
+    parser.add_option("--cells", type="float", dest="cellCount",
+                      help="arbitrary units [default %default]")
+    parser.add_option("--efficiency", type="float", dest="efficiency",
+                      help="fraction [default %default]")
+
+    configParser = getConfigParser()
+    section = "transcripts"
+    tSize = getConfigFloatOption(configParser, section, "tSize", 200000.0)
+    cellCount = getConfigFloatOption(configParser, section, "cellCount", 1e6)
+    efficiency = getConfigFloatOption(configParser, section, "efficiency", 0.3)
+
+    # %default is expanded when help is formatted, i.e. after these defaults are set
+    parser.set_defaults(tSize=tSize, cellCount=cellCount, efficiency=efficiency)
+
+    return parser
+
+
def transcripts(infilename, outfilename, tSize=200000, cellCount=1e6, efficiency=0.3):
infile = open(infilename)
outfile = open(outfilename, "w")
transcripts = rpkm * tSize
transPerCell = transcripts / cellCount / efficiency
outfile.write("%s\t%.1f\t%.1f\n" % (fields[0], transcripts, transPerCell))
+
infile.close()
outfile.close()
# Created by Ali Mortazavi on 8/12/08.
#
-import sys, optparse
+import sys
+import optparse
from cistematic.core import complement
+from commoncode import getConfigParser, getConfigBoolOption, getConfigOption
-print "%prog: version 2.1"
+print "trimreads: version 2.2"
def main(argv=None):
if not argv:
usage = "usage: python %prog length infile outfile [--fastq] [--fromback] [--paired] [--flip] [--filter maxN]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--fastq", action="store_true", dest="fastq")
- parser.add_option("--fromback", action="store_true", dest="fromBack")
- parser.add_option("--paired", action="store_true", dest="paired")
- parser.add_option("--flip", action="store_true", dest="flipseq")
- parser.add_option("--filter", type="int", dest="maxN")
- parser.set_defaults(fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
trimreads(length, infile, outfile, options.fastq, options.fromBack, options.paired, options.flipseq, options.maxN)
+def getParser(usage):
+    """Build the command-line option parser for trimreads.
+
+    Registers the --fastq, --fromback, --paired and --flip switches plus the
+    integer --filter option, then seeds their defaults from the "trimreads"
+    section of the parser returned by getConfigParser(). The fallbacks
+    handed to the config helpers are False for each switch and None for maxN.
+    """
+    optionParser = optparse.OptionParser(usage=usage)
+    switches = (("--fastq", "fastq"),
+                ("--fromback", "fromBack"),
+                ("--paired", "paired"),
+                ("--flip", "flipseq"))
+    for flag, destination in switches:
+        optionParser.add_option(flag, action="store_true", dest=destination)
+
+    optionParser.add_option("--filter", type="int", dest="maxN")
+
+    config = getConfigParser()
+    sectionName = "trimreads"
+    defaults = {}
+    for flag, destination in switches:
+        defaults[destination] = getConfigBoolOption(config, sectionName, destination, False)
+
+    defaults["maxN"] = getConfigOption(config, sectionName, "maxN", None)
+    optionParser.set_defaults(**defaults)
+
+    return optionParser
+
+
def trimreads(length, inFileName, outFileName, fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None):
infile = open(inFileName)
outfile = open(outFileName, "w")
from commoncode import getMergedRegions, getLocusByChromDict
from cistematic.genomes import Genome
-print "%s: version 1.3" % sys.argv[0]
+print "utrChanges: version 1.4"
def main(argv=None):
hg = Genome(genome)
- origLocusByChromDict = getLocusByChromDict(hg, keepSense = True)
- newLocusByChromDict = getLocusByChromDict(hg, additionalRegionsDict = acceptDict, keepSense = True)
+ origLocusByChromDict = getLocusByChromDict(hg, keepSense=True)
+ newLocusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
new3utr = 0
new5utr = 0
except:
pass
-from commoncode import readDataset
-import sys, time, string, optparse
+import sys
+import time
+import string
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, getConfigOption
-print "%prog: version 3.1"
+
+print "weighMultireads: version 3.3"
def main(argv=None):
if not argv:
usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--radius", type="int", dest="radius")
- parser.add_option("--noradius", action="store_false", dest="doRadius")
- parser.add_option("--usePairs", type="int", dest="pairDist")
- parser.add_option("--verbose", action="store_true", dest="verbose")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.set_defaults(radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None)
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 1:
weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages)
-def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--radius", type="int", dest="radius")
+ parser.add_option("--noradius", action="store_false", dest="doRadius")
+ parser.add_option("--usePairs", type="int", dest="pairDist")
+ parser.add_option("--verbose", action="store_true", dest="verbose")
+ parser.add_option("--cache", type="int", dest="cachePages")
- if radius is not None:
- doRadius = True
- else:
- radius = 100
+ configParser = getConfigParser()
+ section = "weighMultireads"
+ radius = getConfigOption(configParser, section, "radius", None)
+ doRadius = getConfigBoolOption(configParser, section, "doRadius", True)
+ pairDist = getConfigOption(configParser, section, "pairDist", None)
+ verbose = getConfigBoolOption(configParser, section, "verbose", False)
+ cachePages = getConfigOption(configParser, section, "cachePages", None)
+
+ parser.set_defaults(radius=radius, doRadius=doRadius, pairDist=pairDist, verbose=verbose, cachePages=cachePages)
- usePairs = False
- if pairDist is not None:
- usePairs = True
+ return parser
+
+
+def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
- tooFar = pairDist * 10
-
- doCache = False
if cachePages is not None:
doCache = True
else:
+ doCache = False
cachePages = 1
- RDS = readDataset(rdsfile, verbose = True, cache=doCache)
- readlen = RDS.getReadSize()
- halfreadlen = readlen / 2
-
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
if cachePages > RDS.getDefaultCacheSize():
RDS.setDBcache(cachePages)
if verbose:
print time.ctime()
- multiIDs = RDS.getReadIDs(uniqs=False,multi=True)
+ multiIDs = RDS.getReadIDs(uniqs=False, multi=True)
if verbose:
print "got multiIDs ", time.ctime()
- fixedPair = 0
fixedReads = []
- if usePairs:
- print "doing pairs with pairDist = %d" % pairDist
- uidDict = {}
- midDict = {}
- jointList = []
- bothMultiList = []
- mainIDList = []
- guDict = {}
- muDict = {}
-
- if RDS.dataType == "RNA":
- uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=True)
- else:
- uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=False)
+ if pairDist is not None:
+ fixedReads = reweighUsingPairs(RDS, pairDist, multiIDs, verbose)
- if verbose:
- print "got uniqIDs ", time.ctime()
+ if radius is not None:
+ doRadius = True
+ else:
+ radius = 100
- for readID in uniqIDs:
- (mainID, pairID) = readID.split("/")
- try:
- uidDict[mainID].append(pairID)
- except:
- uidDict[mainID] = [pairID]
- mainIDList.append(mainID)
+ if doRadius:
+ reweighUsingRadius(RDS, radius, multiIDs, fixedReads, verbose)
- if verbose:
- print "uidDict all ", len(uidDict), time.ctime()
+ if doCache:
+ RDS.saveCacheDB(rdsfile)
- for mainID in mainIDList:
- if len(uidDict[mainID]) == 2:
- del uidDict[mainID]
+ if verbose:
+ print "finished", time.ctime()
- if verbose:
- print "uidDict first candidates ", len(uidDict), time.ctime()
- for readID in multiIDs:
- (frontID, multiplicity) = readID.split("::")
- (mainID, pairID) = frontID.split("/")
- try:
- if pairID not in midDict[mainID]:
- midDict[mainID].append(pairID)
- except:
- midDict[mainID] = [pairID]
+def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
+ fixedPair = 0
+ tooFar = pairDist * 10
+ readlen = RDS.getReadSize()
+ fixedReads = []
+ print "doing pairs with pairDist = %d" % pairDist
+ hasSplices = RDS.dataType == "RNA"
+ uniqIDs = RDS.getReadIDs(uniqs=True, multi=False, splices=hasSplices)
+
+ if verbose:
+ print "got uniqIDs ", time.ctime()
- if verbose:
- print "all multis ", len(midDict), time.ctime()
+ jointList, bothMultiList = getReadIDLists(uniqIDs, multiIDs, verbose)
+ uniqDict = getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose)
+ if verbose:
+ print "guDict actual ", len(uniqDict), time.ctime()
- mainIDList = uidDict.keys()
- for mainID in mainIDList:
- if mainID not in midDict:
- del uidDict[mainID]
+ multiDict = getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose)
+ if verbose:
+ print "muDict actual ", len(multiDict), time.ctime()
+
+ RDS.setSynchronousPragma("OFF")
+ for readID in jointList:
+ try:
+ ustart = uniqDict[readID]["start"]
+ ustop = ustart + readlen
+ except KeyError:
+ ustart = uniqDict[readID]["startL"]
+ ustop = uniqDict[readID]["stopR"]
+
+ uniqReadChrom = uniqDict[readID]["chrom"]
+ multiReadList = multiDict[readID]
+ numMultiReads = len(multiReadList)
+ bestMatch = [tooFar] * numMultiReads
+ found = False
+ for index in range(numMultiReads):
+ mstart = multiReadList[index]["start"]
+ multiReadChrom = multiReadList[index]["chrom"]
+ mpair = multiReadList[index]["pairID"]
+ if uniqReadChrom != multiReadChrom:
+ continue
- if verbose:
- print "uidDict actual candidates ", len(uidDict), time.ctime()
+ if abs(mstart - ustart) < pairDist:
+ bestMatch[index] = abs(mstart - ustart)
+ found = True
+ elif abs(mstart - ustop) < pairDist:
+ bestMatch[index] = abs(mstart - ustop)
+ found = True
- for readID in midDict:
- listLen = len(midDict[readID])
- if listLen == 1:
- if readID in uidDict:
- jointList.append(readID)
- elif listLen == 2:
- bothMultiList.append(readID)
+ if found:
+ theMatch = -1
+ theDist = tooFar
+ reweighList = []
+ for index in range(numMultiReads):
+ if theDist > bestMatch[index]:
+ theMatch = index
+ theDist = bestMatch[index]
+
+ theID = string.join([readID, mpair], "/")
+ for index in range(numMultiReads):
+ if index == theMatch:
+ score = 1 - ((numMultiReads - 1) / (100. * numMultiReads))
+ else:
+ score = 1 / (100. * numMultiReads)
+
+ start = multiReadList[index][0]
+ chrom = "chr%s" % multiReadList[index][1]
+ reweighList.append((round(score,3), chrom, start, theID))
+
+ #TODO: Is this right? If match index is 0 are we doing nothing?
+ if theMatch > 0:
+ RDS.reweighMultireads(reweighList)
+ fixedPair += 1
+ if verbose and fixedPair % 10000 == 1:
+ print "fixed %d" % fixedPair
+ print uniqDict[readID]
+ print multiDict[readID]
+ print reweighList
+
+ fixedReads.append(theID)
+
+ RDS.setSynchronousPragma("ON")
+
+ print "fixed %d pairs" % fixedPair
+ print time.ctime()
+
+ return fixedReads
+
+
+def getReadIDLists(uniqIDs, multiIDs, verbose=False):
+ uidDict = {}
+ mainIDList = []
+ for readID in uniqIDs:
+ (mainID, pairID) = readID.split("/")
+ try:
+ uidDict[mainID].append(pairID)
+ except:
+ uidDict[mainID] = [pairID]
+ mainIDList.append(mainID)
- if verbose:
- print "joint ", len(jointList), time.ctime()
- print "bothMulti ", len(bothMultiList), time.ctime()
+ if verbose:
+ print "uidDict all ", len(uidDict), time.ctime()
- del uidDict
- del midDict
- del mainIDList
- del uniqIDs
+ for mainID in mainIDList:
+ if len(uidDict[mainID]) == 2:
+ del uidDict[mainID]
- uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
- if verbose:
- print "got uniq dict ", len(uniqDict), time.ctime()
-
- if RDS.dataType == "RNA":
- spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
- if verbose:
- print "got splice dict ", len(spliceDict), time.ctime()
-
- for readID in jointList:
- try:
- guDict[readID] = uniqDict[readID][0]
- except:
- if RDS.dataType == "RNA":
- guDict[readID] = spliceDict[readID][0]
-
- del uniqDict
- del spliceDict
- if verbose:
- print "guDict actual ", len(guDict), time.ctime()
+ if verbose:
+ print "uidDict first candidates ", len(uidDict), time.ctime()
+
+ midDict = {}
+ for readID in multiIDs:
+ (frontID, multiplicity) = readID.split("::")
+ (mainID, pairID) = frontID.split("/")
+ try:
+ if pairID not in midDict[mainID]:
+ midDict[mainID].append(pairID)
+ except:
+ midDict[mainID] = [pairID]
- multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
- if verbose:
- print "got multi dict ", len(multiDict), time.ctime()
+ if verbose:
+ print "all multis ", len(midDict), time.ctime()
- for readID in jointList:
- muDict[readID] = multiDict[readID]
+ mainIDList = uidDict.keys()
+ for mainID in mainIDList:
+ if mainID not in midDict:
+ del uidDict[mainID]
- for readID in bothMultiList:
- muDict[readID] = multiDict[readID]
+ if verbose:
+ print "uidDict actual candidates ", len(uidDict), time.ctime()
+
+ jointList = []
+ bothMultiList = []
+ for readID in midDict:
+ listLen = len(midDict[readID])
+ if listLen == 1:
+ if readID in uidDict:
+ jointList.append(readID)
+ elif listLen == 2:
+ bothMultiList.append(readID)
- del multiDict
- if verbose:
- print "muDict actual ", len(muDict), time.ctime()
-
- RDS.setSynchronousPragma("OFF")
- for readID in jointList:
- try:
- (ustart, uchrom, upair) = guDict[readID]
- ustop = ustart + readlen
- except:
- (ustart, lstop, rstart, ustop, uchrom, upair) = guDict[readID]
-
- muList = muDict[readID]
- muLen = len(muList)
- bestMatch = [tooFar] * muLen
- found = False
- for index in range(muLen):
- (mstart, mchrom, mpair) = muList[index]
- if uchrom != mchrom:
- continue
-
- if abs(mstart - ustart) < pairDist:
- bestMatch[index] = abs(mstart - ustart)
- found = True
- elif abs(mstart - ustop) < pairDist:
- bestMatch[index] = abs(mstart - ustop)
- found = True
-
- if found:
- theMatch = -1
- theDist = tooFar
- reweighList = []
- for index in range(muLen):
- if theDist > bestMatch[index]:
- theMatch = index
- theDist = bestMatch[index]
-
- theID = string.join([readID, mpair], "/")
- for index in range(muLen):
- if index == theMatch:
- score = 1 - (muLen - 1) / (100. * (muLen))
- else:
- score = 1 / (100. * muLen)
-
- start = muList[index][0]
- chrom = "chr%s" % muList[index][1]
- reweighList.append((round(score,3), chrom, start, theID))
-
- if theMatch > 0:
- RDS.reweighMultireads(reweighList)
- fixedPair += 1
- if verbose and fixedPair % 10000 == 1:
- print "fixed %d" % fixedPair
- print guDict[readID]
- print muDict[readID]
- print reweighList
-
- fixedReads.append(theID)
-
- RDS.setSynchronousPragma("ON")
-
- del guDict
- del muDict
- print "fixed %d pairs" % fixedPair
- print time.ctime()
+ if verbose:
+ print "joint ", len(jointList), time.ctime()
+ print "bothMulti ", len(bothMultiList), time.ctime()
- skippedReads = 0
- if doRadius:
- print "doing uniq read radius with radius = %d" % radius
- multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
- print "got multiDict"
- RDS.setSynchronousPragma("OFF")
- rindex = 0
- for readID in multiIDs:
- theID = readID
- if theID in fixedReads:
- skippedReads += 1
- continue
+ return jointList, bothMultiList
- if "::" in readID:
- (readID, multiplicity) = readID.split("::")
-
- scores = []
- coords = []
- for read in multiDict[readID]:
- (start, weight, rID, chrom) = read
- achrom = "chr%s" % chrom
- regionStart = start + halfreadlen - radius
- regionStop = start + halfreadlen + radius
- uniqs = RDS.getCounts(achrom, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
- scores.append(uniqs + 1)
- coords.append((achrom, start, theID))
-
- total = float(sum(scores))
- reweighList = []
- for index in range(len(scores)):
- reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
- RDS.reweighMultireads(reweighList)
- rindex += 1
- if rindex % 10000 == 0:
- print rindex
+def getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose=False):
+ uniqReadsDict = {}
+ uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
+ if verbose:
+ print "got uniq dict ", len(uniqDict), time.ctime()
- RDS.setSynchronousPragma("ON")
+ if RDS.dataType == "RNA":
+ spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
if verbose:
- print "skipped ", skippedReads
+ print "got splice dict ", len(spliceDict), time.ctime()
- print "reweighted ", rindex
+ for readID in jointList:
+ try:
+ uniqReadsDict[readID] = uniqDict[readID][0]
+ except KeyError:
+ if RDS.dataType == "RNA":
+ uniqReadsDict[readID] = spliceDict[readID][0]
+
+ return uniqReadsDict
- if doCache:
- RDS.saveCacheDB(rdsfile)
+def getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose=False):
+ multiReadSubsetDict = {}
+ multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
if verbose:
- print "finished", time.ctime()
-
+ print "got multi dict ", len(multiDict), time.ctime()
+
+ for readID in jointList:
+ multiReadSubsetDict[readID] = multiDict[readID]
+
+ for readID in bothMultiList:
+ multiReadSubsetDict[readID] = multiDict[readID]
+
+ return multiReadSubsetDict
+
+
+def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
+ skippedReads = 0
+ readlen = RDS.getReadSize()
+ halfreadlen = readlen / 2
+ print "doing uniq read radius with radius = %d" % radius
+ multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
+ print "got multiDict"
+ RDS.setSynchronousPragma("OFF")
+ reweighedCount = 0
+ for readID in multiIDs:
+ originalMultiReadID = readID
+ if originalMultiReadID in readsToSkip:
+ skippedReads += 1
+ continue
+
+ if "::" in readID:
+ (readID, multiplicity) = readID.split("::")
+
+ scores = []
+ coords = []
+ for read in multiDict[readID]:
+ start = read["start"]
+ chromosome = "chr%s" % read["chrom"]
+ regionStart = start + halfreadlen - radius
+ regionStop = start + halfreadlen + radius
+ uniqs = RDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
+ scores.append(uniqs + 1)
+ coords.append((chromosome, start, originalMultiReadID))
+
+ total = float(sum(scores))
+ reweighList = []
+ for index in range(len(scores)):
+ reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
+
+ RDS.reweighMultireads(reweighList)
+ reweighedCount += 1
+ if reweighedCount % 10000 == 0:
+ print reweighedCount
+
+ RDS.setSynchronousPragma("ON")
+ if verbose:
+ print "skipped ", skippedReads
+
+ print "reweighted ", reweighedCount
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file