snapshot of 4.0a development. initial git repo commit
author Sean Upchurch <sau@caltech.edu>
Fri, 8 Oct 2010 23:32:13 +0000 (16:32 -0700)
committer Sean Upchurch <sau@caltech.edu>
Fri, 8 Oct 2010 23:32:13 +0000 (16:32 -0700)
143 files changed:
MakeBamFromRds.py [new file with mode: 0644]
MakeRdsFromBam.py [new file with mode: 0644]
ReadDataset.py [new file with mode: 0644]
__init__.py [new file with mode: 0644]
altSpliceCounts.py [new file with mode: 0755]
analyzego.py [new file with mode: 0755]
bedtoregion.py [new file with mode: 0755]
binstocdf.py [new file with mode: 0755]
buildMatrix.py [new file with mode: 0755]
buildrmaskdb.py [new file with mode: 0755]
buildsnpdb.py [new file with mode: 0755]
cdfdist.py [new file with mode: 0755]
checkrmask.py [new file with mode: 0755]
chiapet/.svn/entries [new file with mode: 0644]
chiapet/.svn/text-base/__init__.py.svn-base [new file with mode: 0644]
chiapet/.svn/text-base/linkers.fa.svn-base [new file with mode: 0644]
chiapet/.svn/text-base/markLinkers.py.svn-base [new file with mode: 0644]
chiapet/.svn/text-base/segregateLinkers.py.svn-base [new file with mode: 0644]
chiapet/.svn/tmp/markLinkers.py.tmp [new file with mode: 0644]
chiapet/__init__.py [new file with mode: 0644]
chiapet/linkers.fa [new file with mode: 0644]
chiapet/markLinkers.py [new file with mode: 0644]
chiapet/segregateLinkers.py [new file with mode: 0644]
chkSNPrmask.py [new file with mode: 0755]
chksnp.py [new file with mode: 0755]
colsum.py [new file with mode: 0755]
combineRPKMs.py [new file with mode: 0755]
combinerds.py [new file with mode: 0755]
commoncode.py [new file with mode: 0755]
crossmatch.py [new file with mode: 0755]
distalPairs.py [new file with mode: 0755]
docs/ERANGE.copyright [new file with mode: 0644]
docs/README.build-rds [new file with mode: 0644]
docs/README.chip-seq [new file with mode: 0644]
docs/README.rna-esnp [new file with mode: 0644]
docs/README.rna-seq [new file with mode: 0644]
docs/README.rnapath [new file with mode: 0644]
docs/RNA-seq.analysisSteps.txt [new file with mode: 0644]
docs/buildMatrix.sh [new file with mode: 0644]
docs/partition.sh [new file with mode: 0644]
docs/regionCounts.sh [new file with mode: 0644]
docs/runRNAPairedAnalysis.sh [new file with mode: 0755]
docs/runSNPAnalysis.sh [new file with mode: 0755]
docs/runStandardAnalysis.sh [new file with mode: 0755]
docs/runStrandedAnalysis.sh [new file with mode: 0755]
farPairs.py [new file with mode: 0644]
featureIntersects.py [new file with mode: 0755]
findMotifs.py [new file with mode: 0755]
findall.py [new file with mode: 0755]
fraction.py [new file with mode: 0755]
geneDownstreamBins.py [new file with mode: 0755]
geneLocusBins.py [new file with mode: 0755]
geneLocusCounts.py [new file with mode: 0755]
geneLocusPeaks.py [new file with mode: 0755]
geneMrnaCounts.py [new file with mode: 0755]
geneMrnaCountsWeighted.py [new file with mode: 0755]
geneNeighbors.py [new file with mode: 0755]
geneStallingBins.py [new file with mode: 0755]
geneStartBins.py [new file with mode: 0755]
geneUpstreamBins.py [new file with mode: 0755]
getGOgenes.py [new file with mode: 0755]
getNovelSNPs.py [new file with mode: 0755]
getSNPGeneInfo.py [new file with mode: 0755]
getSNPs.py [new file with mode: 0755]
getallNRSE.py [new file with mode: 0755]
getallgenes.py [new file with mode: 0755]
getallsites.py [new file with mode: 0755]
getfasta.py [new file with mode: 0755]
getgosig.py [new file with mode: 0755]
getmers.py [new file with mode: 0755]
getsplicefa.py [new file with mode: 0755]
gfftocis.py [new file with mode: 0644]
gointersects.py [new file with mode: 0755]
hepg2.rds [new file with mode: 0644]
intersects.py [new file with mode: 0755]
listGeneFeatures.py [new file with mode: 0755]
makeGraphs.py [new file with mode: 0644]
makeSNPtrack.py [new file with mode: 0755]
makebedfromrds.py [new file with mode: 0755]
makerdsfrombed.py [new file with mode: 0755]
makerdsfromblat.py [new file with mode: 0755]
makerdsfrombowtie.py [new file with mode: 0755]
makerdsfromeland2.py [new file with mode: 0755]
makesitetrack.py [new file with mode: 0755]
makewiggle.py [new file with mode: 0755]
normalizeExpandedExonic.py [new file with mode: 0644]
normalizeFinalExonic.py [new file with mode: 0755]
partition.py [new file with mode: 0755]
peakstoregion.py [new file with mode: 0755]
plotbardist.py [new file with mode: 0755]
plotnomogram.py [new file with mode: 0755]
plotprofile.py [new file with mode: 0755]
predictSpliceCount.py [new file with mode: 0755]
profilebins.py [new file with mode: 0755]
ratio.py [new file with mode: 0755]
rdsmetadata.py [new file with mode: 0755]
recordLog.py [new file with mode: 0755]
regionBins.py [new file with mode: 0755]
regionCounts.py [new file with mode: 0755]
regionintersects.py [new file with mode: 0755]
regiontobed.py [new file with mode: 0755]
rnaAToIFilter.py [new file with mode: 0644]
rnaEditing.py [new file with mode: 0644]
rnafarPairs.py [new file with mode: 0755]
rnapath/.svn/entries [new file with mode: 0644]
rnapath/.svn/text-base/RNAPATH.py.svn-base [new file with mode: 0644]
rnapath/.svn/text-base/__init__.py.svn-base [new file with mode: 0644]
rnapath/.svn/text-base/processvelvet.py.svn-base [new file with mode: 0644]
rnapath/.svn/tmp/RNAPATH.py.tmp [new file with mode: 0644]
rnapath/.svn/tmp/processvelvet.py.tmp [new file with mode: 0644]
rnapath/RNAPATH.py [new file with mode: 0644]
rnapath/__init__.py [new file with mode: 0644]
rnapath/processvelvet.py [new file with mode: 0644]
scatterfields.py [new file with mode: 0755]
siteintersects.py [new file with mode: 0755]
stallCategory.py [new file with mode: 0755]
test/testAnalyzeGO.py [new file with mode: 0644]
test/testChkSNP_input.txt [new file with mode: 0644]
test/testChksnp.py [new file with mode: 0644]
test/testCommoncode.py [new file with mode: 0644]
test/testErange.py [new file with mode: 0644]
test/testGeneMrnaCounts.py [new file with mode: 0644]
test/testGetFasta.py [new file with mode: 0644]
test/testGetNovelSNPs.py [new file with mode: 0644]
test/testGetSNPGeneInfo.py [new file with mode: 0644]
test/testGetSNPs.py [new file with mode: 0644]
test/testMakeBamFromRds.py [new file with mode: 0644]
test/testMakeGraphs.py [new file with mode: 0644]
test/testMakeRdsFromBam.py [new file with mode: 0644]
test/testMakeSNPTrack.py [new file with mode: 0644]
test/testMarkLinkers.py [new file with mode: 0644]
test/testPeaksToRegion.py [new file with mode: 0644]
test/testProcessVelvet.py [new file with mode: 0644]
test/testRNAPATH.py [new file with mode: 0644]
test/testReadDataset.py [new file with mode: 0644]
test/testRnaAToIFilter.py [new file with mode: 0644]
test/testRnaEditing.py [new file with mode: 0644]
test/testTranscripts.py [new file with mode: 0644]
test/testmakebedfromrds.py [new file with mode: 0644]
transcripts.py [new file with mode: 0755]
trimreads.py [new file with mode: 0755]
utrChanges.py [new file with mode: 0755]
weighMultireads.py [new file with mode: 0755]

diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py
new file mode 100644 (file)
index 0000000..935a04e
--- /dev/null
@@ -0,0 +1,281 @@
+"""
+MakeBamFromRds
+
+Converts a zero-based ERANGE RDS file to zero-based BAM format.
+
+Usage: python MakeBamFromRds.py rdsFile bamFile [options]
+
+"""
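+
+# Example invocation (hypothetical file names; --splices plus --fasta also exports
+# spliced reads, with the sense corrected from the junction sequence):
+#     python MakeBamFromRds.py mySample.rds mySample.bam --splices --fasta genome.fa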
+
+try:
+    import psyco
+    psyco.full()
+except ImportError:
+    pass
+
+import sys
+import re
+import optparse
+import random
+import pysam
+from commoncode import readDataset
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    verstring = "MakeBamFromRds: version 1.0"
+    print verstring
+
+    usage = "usage: python %prog rdsFile bamFile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", action="store_true", dest="doPairs")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    parser.add_option("--fasta", dest="fastaFileName")
+    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
+                        doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
+                        doCache=False, cachePages=100000, fastaFileName="",
+                        chromList=[])
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    rdsfile = args[0]
+    outfilename = args[1]
+
+    allChrom = True
+    if options.chromList:
+        allChrom = False
+
+    makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
+                   options.doSplices, options.doPairs, options.withFlag, options.useFlagLike,
+                   options.enforceChr, allChrom, options.doCache, options.cachePages,
+                   options.chromList, options.fastaFileName)
+
+
+def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
+                     doSplices=False, doPairs=False, withFlag="",
+                     useFlagLike=False, enforceChr=False, allChrom=True,
+                     doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
+
+    if not withUniqs and not withMulti and not doSplices:
+        print "must be outputting at least one of uniqs, multi, or splices - exiting"
+        sys.exit(1)
+
+    print "\nsample:"
+    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+
+    if cachePages > RDS.getDefaultCacheSize():
+        RDS.setDBcache(cachePages)
+
+    readlength = RDS.getReadSize()
+
+    if allChrom:
+        if withUniqs:
+            chromList = RDS.getChromosomes()
+        elif withMulti:
+            chromList = RDS.getChromosomes(table="multi")
+        else:
+            chromList = RDS.getChromosomes(table="splices")
+
+        chromList.sort()
+
+    fastaSequenceDict = {}
+    if fastaFileName:
+        fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
+
+    referenceSequenceList = []
+    chromRemoveList = []
+    for chromosome in chromList:
+        if doNotOutputChromosome(chromosome, enforceChr):
+            chromRemoveList.append(chromosome)
+        else:
+            chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
+            referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
+            referenceSequenceList.append(referenceDataDict)
+
+    for chrom in chromRemoveList:
+        chromList.remove(chrom)
+
+    header = {"HD": {"VN": "1.0"}}
+    if referenceSequenceList:
+        header["SQ"] = referenceSequenceList
+
+    outfile = pysam.Samfile(outfilename, "wb", header=header)
+
+    totalWrites = 0
+    noncanonicalSplices = 0
+    for chrom in chromList:
+        index = 0
+        print "chromosome %s" % (chrom)
+        if withUniqs or withMulti:
+            hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
+                                       withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
+                                       flagLike=useFlagLike, entryDict=True)
+
+            for read in hitDict[chrom]:
+                writeBAMEntry(outfile, chrom, read, readlength)
+                index += 1
+
+        if doSplices:
+            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
+            index += numSpliceReadsWritten
+            noncanonicalSplices += noncanonical
+
+        print index
+        totalWrites += index
+
+    outfile.close()
+    print "%d total reads written" % totalWrites
+    print "%d non-canonical splices" % noncanonicalSplices
+
+
+def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
+    index = 0
+    noncanonicalSplices = 0
+    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
+    if chrom not in spliceDict:
+        pass
+    else:
+        for read in spliceDict[chrom]:
+            if fastaSequenceDict.has_key(chrom):
+                read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
+                noncanonicalSplices += noncanonical
+
+            writeBAMEntry(outfile, chrom, read, readlength)
+            index += 1
+
+    return index, noncanonicalSplices
+
+
+def writeBAMEntry(outfile, chrom, outputDict, readlength):
+    tagList = []
+    alignedRead = pysam.AlignedRead()
+    alignedRead.qname = outputDict["readID"]
+    if outputDict["sense"] == "-":
+        alignedRead.is_reverse = True
+
+    alignedRead.rname = outfile.references.index(chrom)
+
+    if outputDict.has_key("startL"):
+        startL = outputDict["startL"]
+        stopL = outputDict["stopL"]
+        startR = outputDict["startR"]
+        stopR = outputDict["stopR"]
+        alignedRead.pos = startL
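+        # pysam numeric CIGAR ops: 0 = M (aligned block), 3 = N (skipped region/intron);
+        # emit left exon match, intron skip, right exon match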
+        alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
+        tagList.append(("XS", outputDict["sense"]))
+    else:
+        alignedRead.pos = outputDict["start"]
+        alignedRead.cigar = [(0, readlength)]
+
+    if outputDict.has_key("pairID"):
+        pairID = outputDict["pairID"]
+        if pairID == "1":
+            alignedRead.is_read1 = True
+            alignedRead.is_proper_pair = True
+        elif pairID == "2":
+            alignedRead.is_read2 = True
+            alignedRead.is_proper_pair = True
+        else:
+            pass
+
+    if outputDict.has_key("mismatch"):
+        mismatchTag = getMismatches(outputDict["mismatch"])
+        if mismatchTag:
+            tagList.append(("MD", mismatchTag))
+    
+    if tagList:
+        alignedRead.tags = tagList
+
+    outfile.write(alignedRead)
+
+
+def getMismatches(mismatchString):
+    mismatch = ""
+    positions = re.findall("\d+", mismatchString)
+    nucleotides = re.findall("([ACGTN])\d+", mismatchString)
+    for index in range(0, len(positions)):
+        mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
+
+    return mismatch
+
+
+def doNotOutputChromosome(chrom, enforceChr):
+    result = False
+
+    if chrom == "chrM":
+        result = True
+
+    if enforceChr and ("chr" not in chrom):
+        result = True
+
+    return result
+
+
+def getFastaSequenceDictionary(fastaFileName):
+    fastaSeqDict = {}
+    fchrom = ""
+    fseq = ""
+
+    fastafile = open(fastaFileName)
+    for line in fastafile:
+        if line[0] == ">":
+            if fchrom != "":
+                fastaSeqDict[fchrom] = fseq
+
+            fseq = ""
+            fchrom = line[1:-1]
+        else:
+            fseq += line.strip()
+
+    if fchrom != "":
+        fastaSeqDict[fchrom] = fseq
+
+    fastafile.close()
+
+    return fastaSeqDict
+
+
+def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
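+    # 4-mers formed from the first and last two intron bases; GT..AG, GC..AG and
+    # AT..AC are forward-strand canonical junctions, the rest are their reverse complements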
+    spliceSense = {"GTAG": "+",
+                   "GCAG": "+",
+                   "ATAC": "+",
+                   "CTAC": "-",
+                   "CTGC": "-",
+                   "GTAT": "-"
+    }
+
+    noncanonical = 0
+    intronstart = stopLeft
+    intronlen = startRight - stopLeft
+    leftJunctionSig = fastaSequence[intronstart:intronstart+2]
+    rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
+    spliceJunction = leftJunctionSig + rightJunctionSig
+    spliceJunction = spliceJunction.upper()
+    if spliceSense.has_key(spliceJunction):
+        sense = spliceSense[spliceJunction]
+    else:
+        noncanonical += 1
+        senses = ["+", "-"]
+        random.shuffle(senses)
+        sense = senses[0]
+
+    return sense, noncanonical
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/MakeRdsFromBam.py b/MakeRdsFromBam.py
new file mode 100644 (file)
index 0000000..e9df847
--- /dev/null
@@ -0,0 +1,397 @@
+"""
+MakeRdsFromBam
+
+Created on Jun 3, 2010
+
+@author: sau
+"""
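+
+# Example invocation (hypothetical names; the input BAM should be sorted so that
+# alignments of the same read are adjacent, otherwise multireads are not detected correctly):
+#     python MakeRdsFromBam.py myLabel accepted_hits.bam mySample.rds --RNA --index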
+
+try:
+    import psyco
+    psyco.full()
+except ImportError:
+    pass
+
+import sys, string, optparse, re
+import pysam
+from commoncode import readDataset, writeLog
+
+verstring = "%prog: version 1.0"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+    
+    print verstring
+
+    usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
+            \ninput reads must be sorted to properly record multireads"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
+                      help="input file is in sam format")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000]")
+    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
+                      help="multi counts over this value are discarded [default: 10]")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
+                      help="use the raw read names")
+    parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
+                        maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    try:
+        label = args[0]
+    except IndexError:
+        print "no label specified - see --help for usage"
+        sys.exit(1)
+
+    try:
+        samFileName = args[1]
+    except IndexError:
+        print "no samfile specified - see --help for usage"
+        sys.exit(1)
+
+    try:
+        outDbName = args[2]
+    except IndexError:
+        print "no outrdsfile specified - see --help for usage"
+        sys.exit(1)
+
+    makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
+                   options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
+
+
+def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
+                   cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
+
+    if useSamFile:
+        fileMode = "r"
+    else:
+        fileMode = "rb"
+
+    try:
+        samfile = pysam.Samfile(samFileName, fileMode)
+    except ValueError:
+        print "samfile index not found"
+        sys.exit(1)
+
+    if rnaDataType:
+        dataType = "RNA"
+    else:
+        dataType = "DNA"
+
+    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
+
+    rds = readDataset(outDbName, init, dataType, verbose=True)
+    if not init and doIndex:
+        try:
+            if rds.hasIndex():
+                rds.dropIndex()
+        except:
+            pass
+
+    if "sam_mapped" not in rds.getMetadata():
+        rds.insertMetadata([("sam_mapped", "True")])
+
+    defaultCacheSize = rds.getDefaultCacheSize()
+
+    if cachePages > defaultCacheSize:
+        if init:
+            rds.setDBcache(cachePages, default=True)
+        else:
+            rds.setDBcache(cachePages)
+
+    propertyList = []
+    for arg in sys.argv:
+        if "::" in arg:
+            (pname, pvalue) = arg.strip().split("::")
+            propertyList.append((pname, pvalue))
+
+    if len(propertyList) > 0:
+        rds.insertMetadata(propertyList)
+
+    countReads = {"unmapped": 0,
+                  "total": 0,
+                  "unique": 0,
+                  "multi": 0,
+                  "multiDiscard": 0,
+                  "splice": 0
+    }
+
+    readsize = 0
+    insertSize = 100000
+
+    uniqueInsertList = []
+    multiInsertList = []
+    spliceInsertList = []
+
+    processedEntryDict = {}
+    uniqueReadDict = {}
+    multiReadDict = {}
+    spliceReadDict = {}
+
+    samFileIterator = samfile.fetch(until_eof=True)
+
+    for read in samFileIterator:
+        if read.is_unmapped:
+            countReads["unmapped"] += 1
+            continue
+
+        if readsize == 0:
+            take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
+            readsize = sum([length for op,length in read.cigar if op in take])
+            if init:
+                rds.insertMetadata([("readsize", readsize)])
+
+        #Build the read dictionaries
+        try:
+            readSequence = read.seq
+        except KeyError:
+            readSequence = ""
+
+        pairReadSuffix = getPairedReadNumberSuffix(read)
+        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
+        if trimReadID:
+            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
+        else:
+            rdsEntryName = read.qname
+
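+        # a readName already seen in this batch means the read has multiple alignments:
+        # drop it from the unique/splice dictionaries and count it as a multiread instead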
+        if processedEntryDict.has_key(readName):
+            if isSpliceEntry(read.cigar):
+                if spliceReadDict.has_key(readName):
+                    del spliceReadDict[readName]
+            else:
+                if uniqueReadDict.has_key(readName):
+                    del uniqueReadDict[readName]
+
+                if multiReadDict.has_key(readName):
+                    (read, priorCount, rdsEntryName) = multiReadDict[readName]
+                    count = priorCount + 1
+                    multiReadDict[readName] = (read, count, rdsEntryName)
+                else:
+                    multiReadDict[readName] = (read, 1, rdsEntryName)
+        else:
+            processedEntryDict[readName] = ""
+            if isSpliceEntry(read.cigar):
+                spliceReadDict[readName] = (read,rdsEntryName)
+            else:
+                uniqueReadDict[readName] = (read, rdsEntryName)
+
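+        # periodically flush the accumulated entries to the RDS tables to bound memory use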
+        if countReads["total"] % insertSize == 0:
+            for entry in uniqueReadDict.keys():
+                (readData, rdsEntryName) = uniqueReadDict[entry]
+                chrom = samfile.getrname(readData.rname)
+                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
+                countReads["unique"] += 1
+
+            for entry in spliceReadDict.keys():
+                (readData, rdsEntryName) = spliceReadDict[entry]
+                chrom = samfile.getrname(readData.rname)
+                spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
+                countReads["splice"] += 1
+
+            for entry in multiReadDict.keys():
+                (readData, count, rdsEntryName) = multiReadDict[entry]
+                chrom = samfile.getrname(readData.rname)
+                if count > maxMultiReadCount:
+                    countReads["multiDiscard"] += 1
+                else:
+                    multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
+                    countReads["multi"] += 1
+
+            rds.insertUniqs(uniqueInsertList)
+            rds.insertMulti(multiInsertList)
+            uniqueInsertList = []
+            uniqueReadDict = {}
+            multiInsertList = []
+            multiReadDict = {}
+            if dataType == "RNA":
+                rds.insertSplices(spliceInsertList)
+                spliceInsertList = []
+                spliceReadDict = {}
+
+            print ".",
+            sys.stdout.flush()
+            processedEntryDict = {}
+
+        countReads["total"] += 1
+
+    if len(uniqueReadDict.keys()) > 0:
+        for entry in uniqueReadDict.keys():
+            (readData, rdsEntryName) = uniqueReadDict[entry]
+            chrom = samfile.getrname(readData.rname)
+            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
+            countReads["unique"] += 1
+
+        rds.insertUniqs(uniqueInsertList)
+
+    if len(multiReadDict.keys()) > 0:
+        for entry in multiReadDict.keys():
+            (readData, count, rdsEntryName) = multiReadDict[entry]
+            chrom = samfile.getrname(readData.rname)
+            if count > maxMultiReadCount:
+                countReads["multiDiscard"] += 1
+            else:
+                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
+                countReads["multi"] += 1
+
+        rds.insertMulti(multiInsertList)
+
+    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
+        for entry in spliceReadDict.keys():
+            (readData, rdsEntryName) = spliceReadDict[entry]
+            chrom = samfile.getrname(readData.rname)
+            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
+            countReads["splice"] += 1
+
+        rds.insertSplices(spliceInsertList)
+
+    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
+    countString += "\t%d unique reads" % countReads["unique"]
+    countString += "\t%d multi reads" % countReads["multi"]
+    countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
+    if dataType == "RNA":
+        countString += "\t%d spliced reads" % countReads["splice"]
+
+    print countString.replace("\t", "\n")
+
+    writeLog("%s.log" % outDbName, verstring, countString)
+
+    if doIndex:
+        print "building index...."
+        if cachePages > defaultCacheSize:
+            rds.setDBcache(cachePages)
+            rds.buildIndex(cachePages)
+        else:
+            rds.buildIndex(defaultCacheSize)
+
+
+def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
+    start = int(alignedRead.pos)
+    stop = int(start+readSize)
+    sense = getReadSense(alignedRead.is_reverse)
+    try:
+        mismatchTag = alignedRead.opt("MD")
+        mismatches = getMismatches(mismatchTag, alignedRead.seq, sense)
+    except KeyError:
+        mismatches = ""
+
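+    # RDS uniqs/multi row layout: (readID, chrom, start, stop, sense, weight, flag, mismatches);
+    # a read hitting N places gets weight 1/N so its total contribution sums to 1.0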
+    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
+
+
+def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
+    (readName, chrom, start, stop, sense, weight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize)
+    startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar)
+    
+    return (readName, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
+
+
+def getPairedReadNumberSuffix(read):
+    readSuffix = ""
+    if not isPairedRead(read):
+        return ""
+
+    if read.is_read1:
+        readSuffix = "/1"
+    elif read.is_read2:
+        readSuffix = "/2"
+
+    return readSuffix
+
+
+def isPairedRead(read):
+    return read.is_proper_pair and (read.is_read1 or read.is_read2)
+
+
+def isSpliceEntry(cigarTupleList):
+    isSplice = False
+    for operation,length in cigarTupleList:
+        if operation == 3:
+            isSplice = True
+            break
+
+    return isSplice
+
+
+def getReadSense(reverse):
+    if reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+
+def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
+    output = []
+    deletionMarker = "^"
+    position = 0
+
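+    # the SAM MD tag alternates matched-run lengths with mismatched reference bases,
+    # e.g. 10A5^AC6; runs starting with "^" mark deletions and are skipped below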
+    lengths = re.findall("\d+", mismatchTag)
+    mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
+
+    for mismatchEntry in range(len(mismatchSequences)):
+        mismatch = mismatchSequences[mismatchEntry]
+        position = position + int(lengths[mismatchEntry])
+        if string.find(mismatch, deletionMarker) == 0:
+            continue
+
+        try:
+            if querySequence:
+                genomicNucleotide = querySequence[position]
+            else:
+                genomicNucleotide = "N"
+
+            if sense == "-":
+                mismatch = getComplementNucleotide(mismatch)
+                genomicNucleotide  = getComplementNucleotide(genomicNucleotide)
+
+            elandCompatiblePosition = int(position + 1)
+            output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
+            position += 1
+        except IndexError:
+            if logErrors:
+                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
+                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)
+
+            return ""
+
+    return string.join(output, ",")
+
+
+def getComplementNucleotide(nucleotide):
+    complement = {"A": "T",
+                  "T": "A",
+                  "C": "G",
+                  "G": "C",
+                  "N": "N"
+    }
+
+    return complement[nucleotide]
+
+
+def getSpliceBounds(start, readsize, cigarTupleList):
+    stopR = int(start + readsize)
+    offset = 0
+
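+    # scan the CIGAR up to the first N (intron skip) operation and split the read into
+    # left and right aligned blocks on either side of the gap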
+    for operation,length in cigarTupleList:
+        if operation == 3:
+            stopL = int(start + offset)
+            startR = int(stopL + length)
+
+            return start, startR, stopL, stopR
+        else:
+            offset += length
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/ReadDataset.py b/ReadDataset.py
new file mode 100644 (file)
index 0000000..ef80d65
--- /dev/null
@@ -0,0 +1,1246 @@
+"""
+Created on Jul 1, 2010
+
+@author: sau
+"""
+
+import sqlite3 as sqlite
+import string
+import tempfile
+import shutil
+import os
+from os import environ
+from array import array
+from commoncode import getReverseComplement
+
+if environ.get("CISTEMATIC_TEMP"):
+    cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+    cisTemp = "/tmp"
+
+tempfile.tempdir = cisTemp
+currentRDSVersion = "1.1"
+
+
+class ReadDatasetError(Exception):
+    pass
+
+
+class ReadDataset():
+    """ Class for storing reads from experiments. Assumes that custom scripts
+    will translate incoming data into a format that can be inserted into the
+    class using the insert* methods. Default class subtype ('DNA') includes
+    tables for unique and multireads, whereas 'RNA' subtype also includes a
+    splices table.
+    """
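+    # Minimal usage sketch (hypothetical file name; only methods defined in this class are used):
+    #     rds = ReadDataset("mySample.rds", verbose=True, cache=True)
+    #     for chrom in rds.getChromosomes():
+    #         reads = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True)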
+
+    def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
+                 cache=False, reportCount=True):
+        """ creates an rds datafile if initialize is set to true, otherwise
+        will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
+        """
+        self.dbcon = ""
+        self.memcon = ""
+        self.dataType = ""
+        self.rdsVersion = currentRDSVersion
+        self.memBacked = False
+        self.memChrom = ""
+        self.memCursor = ""
+        self.cachedDBFile = ""
+
+        if cache:
+            if verbose:
+                print "caching ...."
+
+            self.cacheDB(datafile)
+            dbFile = self.cachedDBFile
+        else:
+            dbFile = datafile
+
+        self.dbcon = sqlite.connect(dbFile)
+        self.dbcon.row_factory = sqlite.Row
+        self.dbcon.execute("PRAGMA temp_store = MEMORY")
+        if initialize:
+            if datasetType not in ["DNA", "RNA"]:
+                raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
+            else:
+                self.dataType = datasetType
+
+            self.initializeTables(self.dbcon)
+        else:
+            metadata = self.getMetadata("dataType")
+            self.dataType = metadata["dataType"]
+
+        try:
+            metadata = self.getMetadata("rdsVersion")
+            self.rdsVersion = metadata["rdsVersion"]
+        except:
+            try:
+                self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
+            except IOError:
+                print "could not add rdsVersion - read-only ?"
+                self.rdsVersion = "pre-1.0"
+
+        if verbose:
+            if initialize:
+                print "INITIALIZED dataset %s" % datafile
+            else:
+                print "dataset %s" % datafile
+
+            metadata = self.getMetadata()
+            print "metadata:"
+            pnameList = metadata.keys()
+            pnameList.sort()
+            for pname in pnameList:
+                print "\t" + pname + "\t" + metadata[pname]
+
+            if reportCount:
+                ucount = self.getUniqsCount()
+                mcount = self.getMultiCount()
+                if self.dataType == "DNA" and not initialize:
+                    try:
+                        print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
+                    except ValueError:
+                        print "\n%s unique reads and %s multireads" % (ucount, mcount)
+                elif self.dataType == "RNA" and not initialize:
+                    scount = self.getSplicesCount()
+                    try:
+                        print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
+                    except ValueError:
+                        print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
+
+            print "default cache size is %d pages" % self.getDefaultCacheSize()
+            if self.hasIndex():
+                print "found index"
+            else:
+                print "not indexed"
+
+
+    def __len__(self):
+        """ return the number of usable reads in the dataset.
+        """
+        total = self.getUniqsCount()
+        total += self.getMultiCount()
+
+        if self.dataType == "RNA":
+            total += self.getSplicesCount()
+
+        total = int(total)
+
+        return total
+
+
+    def __del__(self):
+        """ cleanup copy in local cache, if present.
+        """
+        if self.cachedDBFile != "":
+            self.uncacheDB()
+
+
+    def cacheDB(self, filename):
+        """ copy the rds database file to a local cache.
+        """
+        self.cachedDBFile =  "%s.db" % tempfile.mktemp()
+        shutil.copyfile(filename, self.cachedDBFile)
+
+
+    def saveCacheDB(self, filename):
+        """ copy the cached database file back to the given filename.
+        """
+        shutil.copyfile(self.cachedDBFile, filename)
+
+
+    def uncacheDB(self):
+        """ delete the cached copy of the database file.
+        """
+        if self.cachedDBFile != "":
+            try:
+                os.remove(self.cachedDBFile)
+            except:
+                print "could not delete %s" % self.cachedDBFile
+
+            self.cachedDBFile = ""
+
+
+    def attachDB(self, filename, asname):
+        """ attach another database file to the readDataset.
+        """
+        stmt = "attach '%s' as %s" % (filename, asname)
+        self.execute(stmt)
+
+
+    def detachDB(self, asname):
+        """ detach a database file from the readDataset.
+        """
+        stmt = "detach %s" % (asname)
+        self.execute(stmt)
+
+
+    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+        """ import into current RDS the table (with columns destcolumns,
+            with default all columns) from the database file asname,
+            using the column specification of ascolumns (default all).
+        """
+        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+        if flagged != "":
+            stmt += " where flag = '%s' " % flagged
+
+        self.executeCommit(stmt)
+
+
+    def getTables(self, asname=""):
+        """ get a list of table names in a particular database file.
+        """
+        resultList = []
+        sql = self.getSqlCursor()
+
+        if asname != "":
+            asname += "."
+
+        stmt = "select name from %ssqlite_master where type='table'" % asname
+        sql.execute(stmt)
+        results = sql.fetchall()
+
+        for row in results:
+            resultList.append(row["name"])
+
+        return resultList
+
+
+    def getSqlCursor(self):
+        if self.memBacked:
+            sql = self.getMemCursor()
+        else:
+            sql = self.getFileCursor()
+
+        return sql
+
+
+    def hasIndex(self):
+        """ check whether the RDS file has at least one index.
+        """
+        stmt = "select count(*) from sqlite_master where type='index'"
+        count = int(self.execute(stmt, returnResults=True)[0][0])
+        if count > 0:
+            return True
+
+        return False
+
+
+    def initializeTables(self, dbConnection, cache=100000):
+        """ creates table schema in a database connection, which is
+        typically a database file or an in-memory database.
+        """
+        dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+        dbConnection.execute("create table metadata (name varchar, value varchar)")
+        dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
+        positionSchema = "start int, stop int"
+        tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
+        dbConnection.execute("create table uniqs %s" % tableSchema)
+        dbConnection.execute("create table multi %s" % tableSchema)
+        if self.dataType == "RNA":
+            positionSchema = "startL int, stopL int, startR int, stopR int"
+            tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
+            dbConnection.execute("create table splices %s" % tableSchema)
+
+        dbConnection.commit()
+
+
+    def getFileCursor(self):
+        """ returns a cursor to file database for low-level (SQL)
+        access to the data.
+        """
+        return self.dbcon.cursor()
+
+
+    def getMemCursor(self):
+        """ returns a cursor to memory database for low-level (SQL)
+        access to the data.
+        """
+        return self.memcon.cursor()
+
+
+    def getMetadata(self, valueName=""):
+        """ returns a dictionary of metadata.
+        """
+        whereClause = ""
+        resultsDict = {}
+
+        if valueName != "":
+            whereClause = " where name='%s'" % valueName
+
+        sql = self.getSqlCursor()
+
+        sql.execute("select name, value from metadata %s" % whereClause)
+        results = sql.fetchall()
+
+        for row in results:
+            parameterName = row["name"]
+            parameterValue = row["value"]
+            if parameterName not in resultsDict:
+                resultsDict[parameterName] = parameterValue
+            else:
+                trying = True
+                index = 2
+                while trying:
+                    newName = string.join([parameterName, str(index)], ":")
+                    if newName not in resultsDict:
+                        resultsDict[newName] = parameterValue
+                        trying = False
+
+                    index += 1
+
+        return resultsDict
+
+
+    def getReadSize(self):
+        """ returns readsize if defined in metadata.
+        """
+        metadata = self.getMetadata()
+        if "readsize" not in metadata:
+            raise ReadDatasetError("no readsize parameter defined")
+        else:
+            mysize = metadata["readsize"]
+            if "import" in mysize:
+                mysize = mysize.split()[0]
+
+            return int(mysize)
+
+
+    def getDefaultCacheSize(self):
+        """ returns the default cache size.
+        """
+        return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
+
+
+    def getChromosomes(self, table="uniqs", fullChrom=True):
+        """ returns a list of distinct chromosomes in table.
+        """
+        statement = "select distinct chrom from %s" % table
+        sql = self.getSqlCursor()
+
+        sql.execute(statement)
+        results = []
+        for row in sql:
+            if fullChrom:
+                if row["chrom"] not in results:
+                    results.append(row["chrom"])
+            else:
+                if len(row["chrom"][3:].strip()) < 1:
+                    continue
+
+                if row["chrom"][3:] not in results:
+                    results.append(row["chrom"][3:])
+
+        results.sort()
+
+        return results
+
+
+    def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+                         doMulti=False, doSplices=False):
+        """ returns the maximum coordinate for reads on a given chromosome.
+        """
+        maxCoord = 0
+        sql = self.getSqlCursor()
+
+        if doUniqs:
+            try:
+                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
+                maxCoord = int(sql.fetchall()[0][0])
+            except:
+                print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+        if doSplices:
+            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
+            try:
+                spliceMax = int(sql.fetchall()[0][0])
+                if spliceMax > maxCoord:
+                    maxCoord = spliceMax
+            except:
+                pass
+
+        if doMulti:
+            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
+            try:
+                multiMax = int(sql.fetchall()[0][0])
+                if multiMax > maxCoord:
+                    maxCoord = multiMax
+            except:
+                pass
+
+        if verbose:
+            print "%s maxCoord: %d" % (chrom, maxCoord)
+
+        return maxCoord
+
+
+    def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
+                     flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
+                     withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
+                     readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
+                     flagLike=False, strand='', combine5p=False):
+        """ returns a dictionary of reads in a variety of formats
+        that can be restricted by chromosome or custom flag.
+        Returns unique reads by default, but can return multireads
+        with doMulti set to True.
+        """
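+        # findallOptimize collapses the result to (start, sense, summed weight) rows grouped
+        # by position; otherwise each read is returned as its own dictionary entry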
+        whereClause = []
+        resultsDict = {}
+
+        if chrom != "" and chrom != self.memChrom:
+            whereClause.append("chrom = '%s'" % chrom)
+
+        if flag != "":
+            if flagLike:
+                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+                whereClause.append(flagLikeClause)
+            else:
+                whereClause.append("flag = '%s'" % flag)
+
+        if start > -1:
+            whereClause.append("start > %d" % start)
+
+        if stop > -1:
+            whereClause.append("stop < %d" % stop)
+
+        if len(readLike) > 0:
+            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
+            whereClause.append(readIDClause)
+
+        if hasMismatch:
+            whereClause.append("mismatch != ''")
+
+        if strand in ["+", "-"]:
+            whereClause.append("sense = '%s'" % strand)
+
+        if len(whereClause) > 0:
+            whereStatement = string.join(whereClause, " and ")
+            whereQuery = "where %s" % whereStatement
+        else:
+            whereQuery = ""
+
+        groupBy = []
+        if findallOptimize:
+            selectClause = ["select start, sense, sum(weight)"]
+            groupBy = ["GROUP BY start, sense"]
+        else:
+            selectClause = ["select ID, chrom, start, readID"]
+            if bothEnds:
+                selectClause.append("stop")
+
+            if not noSense:
+                selectClause.append("sense")
+
+            if withWeight:
+                selectClause.append("weight")
+
+            if withFlag:
+                selectClause.append("flag")
+
+            if withMismatch:
+                selectClause.append("mismatch")
+
+        if limit > 0 and not combine5p:
+            groupBy.append("LIMIT %d" % limit)
+
+        selectQuery = string.join(selectClause, ",")
+        groupQuery = string.join(groupBy)
+        if doUniqs:
+            stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
+            if doMulti:
+                stmt.append("UNION ALL")
+                stmt.append(selectQuery)
+                stmt.append("from multi")
+                stmt.append(whereQuery)
+                stmt.append(groupQuery)
+        else:
+            stmt = [selectQuery, "from multi", whereQuery]
+
+        if combine5p:
+            if findallOptimize:
+                selectQuery = "select start, sense, weight, chrom"
+
+            if doUniqs:
+                subSelect = [selectQuery, "from uniqs", whereQuery]
+                if doMulti:
+                    subSelect.append("union all")
+                    subSelect.append(selectQuery)
+                    subSelect.append("from multi")
+                    subSelect.append(whereQuery)
+            else:
+                subSelect = [selectQuery, "from multi", whereQuery]
+
+            sqlStmt = string.join(subSelect)
+            if findallOptimize:
+                selectQuery = "select start, sense, sum(weight)"
+
+            stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
+                    selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
+
+        if findallOptimize:
+            if self.memBacked:
+                self.memcon.row_factory = None
+                sql = self.memcon.cursor()
+            else:
+                self.dbcon.row_factory = None
+                sql = self.dbcon.cursor()
+
+            stmt.append("order by start")
+        elif readIDDict:
+            if self.memBacked:
+                sql = self.memcon.cursor()
+            else:
+                sql = self.dbcon.cursor()
+
+            stmt.append("order by readID, start")
+        else:
+            if self.memBacked:
+                sql = self.memcon.cursor()
+            else:
+                sql = self.dbcon.cursor()
+
+            stmt.append("order by chrom, start")
+
+        sqlQuery = string.join(stmt)
+        sql.execute(sqlQuery)
+
+        if findallOptimize:
+            resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
+            if self.memBacked:
+                self.memcon.row_factory = sqlite.Row
+            else:
+                self.dbcon.row_factory = sqlite.Row
+        else:
+            currentChrom = ""
+            currentReadID = ""
+            pairID = 0
+            for row in sql:
+                readID = row["readID"]
+                if fullChrom:
+                    chrom = row["chrom"]
+                else:
+                    chrom = row["chrom"][3:]
+
+                if not readIDDict and chrom != currentChrom:
+                    resultsDict[chrom] = []
+                    currentChrom = chrom
+                    dictKey = chrom
+                elif readIDDict:
+                    theReadID = readID
+                    if "::" in readID:
+                        theReadID = readID.split("::")[0]
+
+                    if "/" in theReadID and withPairID:
+                        (theReadID, pairID) = readID.split("/")
+
+                    if theReadID != currentReadID:
+                        resultsDict[theReadID] = []
+                        currentReadID = theReadID
+                        dictKey = theReadID
+
+                newrow = {"start": int(row["start"])}
+                if bothEnds:
+                    newrow["stop"] = int(row["stop"])
+
+                if not noSense:
+                    newrow["sense"] = row["sense"]
+
+                if withWeight:
+                    newrow["weight"] = float(row["weight"])
+
+                if withFlag:
+                    newrow["flag"] = row["flag"]
+
+                if withMismatch:
+                    newrow["mismatch"] = row["mismatch"]
+
+                if withID:
+                    newrow["readID"] = readID
+
+                if withChrom:
+                    newrow["chrom"] = chrom
+
+                if withPairID:
+                    newrow["pairID"] = pairID
+
+                resultsDict[dictKey].append(newrow)
+
+        return resultsDict
+
+
+    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+                       flag="", withWeight=False, withFlag=False, withMismatch=False,
+                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
+                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+                       stop=-1, strand=""):
+        """ returns a dictionary of spliced reads in a variety of
+        formats that can be restricted by chromosome or custom flag.
+        Returns unique spliced reads for now.
+        """
+        whereClause = []
+        resultsDict = {}
+
+        if chrom != "" and chrom != self.memChrom:
+            whereClause = ["chrom = '%s'" % chrom]
+
+        if flag != "":
+            if flagLike:
+                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+                whereClause.append(flagLikeClause)
+            else:
+                whereClause.append("flag = '%s'" % flag)
+
+        if hasMismatch:
+            whereClause.append("mismatch != ''")
+
+        if strand != "":
+            whereClause.append("sense = '%s'" % strand)
+
+        if start > -1:
+            whereClause.append("startL > %d" % start)
+
+        if stop > -1:
+            whereClause.append("stopR < %d" % stop)
+
+        if len(whereClause) > 0:
+            whereStatement = string.join(whereClause, " and ")
+            whereQuery = "where %s" % whereStatement
+        else:
+            whereQuery = ""
+
+        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+        if not noSense:
+            selectClause.append("sense")
+
+        if withWeight:
+            selectClause.append("weight")
+
+        if withFlag:
+            selectClause.append("flag")
+
+        if withMismatch:
+            selectClause.append("mismatch")
+
+        selectQuery = string.join(selectClause, " ,")
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
+        sql.execute(stmt)
+        currentReadID = ""
+        currentChrom = ""
+        for row in sql:
+            pairID = 0
+            readID = row["readID"]
+            if fullChrom:
+                chrom = row["chrom"]
+            else:
+                chrom = row["chrom"][3:]
+
+            if not readIDDict and chrom != currentChrom:
+                resultsDict[chrom] = []
+                currentChrom = chrom
+                dictKey = chrom
+            elif readIDDict:
+                if "/" in readID:
+                    (theReadID, pairID) = readID.split("/")
+                else:
+                    theReadID = readID
+
+                if theReadID != currentReadID:
+                    resultsDict[theReadID] = []
+                    currentReadID = theReadID
+                    dictKey = theReadID
+
+            newrow = {"startL": int(row["startL"])}
+            newrow["stopL"] = int(row["stopL"])
+            newrow["startR"] = int(row["startR"])
+            newrow["stopR"] = int(row["stopR"])
+            if not noSense:
+                newrow["sense"] = row["sense"]
+
+            if withWeight:
+                newrow["weight"] = float(row["weight"])
+
+            if withFlag:
+                newrow["flag"] = row["flag"]
+
+            if withMismatch:
+                newrow["mismatch"] = row["mismatch"]
+
+            if withID:
+                newrow["readID"] = readID
+
+            if withChrom:
+                newrow["chrom"] = chrom
+
+            if withPairID:
+                newrow["pairID"] = pairID
+
+            if splitRead:
+                leftDict = newrow.copy()
+                del leftDict["startR"]
+                del leftDict["stopR"]
+                rightDict = newrow
+                del rightDict["startL"]
+                del rightDict["stopL"]
+                resultsDict[dictKey].append(leftDict)
+                resultsDict[dictKey].append(rightDict)
+            else:
+                resultsDict[dictKey].append(newrow)
+
+        return resultsDict
+
+
+    def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
+                  splices=False, reportCombined=True, sense="both"):
+        """ return read counts for a given region.
+        """
+        ucount = 0
+        mcount = 0
+        scount = 0
+        restrict = ""
+        if sense in ["+", "-"]:
+            restrict = " sense ='%s' " % sense
+
+        if uniqs:
+            try:
+                ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
+            except:
+                ucount = 0
+
+        if multi:
+            try:
+                mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
+            except:
+                mcount = 0
+
+        if splices:
+            try:
+                scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
+            except:
+                scount = 0
+
+        if reportCombined:
+            total = ucount + mcount + scount
+            return total
+        else:
+            return (ucount, mcount, scount)
+
+
+    def getTotalCounts(self, chrom="", rmin="", rmax=""):
+        """ return read counts for a given region.
+        """
+        return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
+
+
+    def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
+        """ returns the count of reads in the specified table.
+        """
+        whereClause = []
+        count = 0
+
+        if chrom !=""  and chrom != self.memChrom:
+            whereClause = ["chrom='%s'" % chrom]
+
+        if rmin != "":
+            whereClause.append("%s >= %s" % (startField, str(rmin)))
+
+        if rmax != "":
+            whereClause.append("%s <= %s" % (startField, str(rmax)))
+
+        if restrict != "":
+            whereClause.append(restrict)
+
+        if len(whereClause) > 0:
+            whereStatement = string.join(whereClause, " and ")
+            whereQuery = "where %s" % whereStatement
+        else:
+            whereQuery = ""
+
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
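+        # distinct=True attempts to count distinct (chrom, position, sense) combinations;
+        # otherwise the summed read weight is returned, so fractional multiread weights add up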
+        if distinct:
+            sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
+        else:
+            sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
+
+        result = sql.fetchone()
+
+        try:
+            count = int(result[0])
+        except:
+            count = 0
+
+        return count
+
+
+    def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+        """ returns the number of rows in the splices table.
+        """
+        return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
+
+
+    def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+        """ returns the number of distinct readIDs in the uniqs table.
+        """
+        return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
+
+
+    def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+        """ returns the total weight of readIDs in the multi table.
+        """
+        return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
+
+
+    def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
+        """ get readID's.
+        """
+        stmt = []
+        limitPart = ""
+        if limit > 0:
+            limitPart = "LIMIT %d" % limit
+
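+        # union the selected tables and group by readID so each ID is reported once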
+        if uniqs:
+            stmt.append("select readID from uniqs")
+
+        if multi:
+            stmt.append("select readID from multi")
+
+        if splices:
+            stmt.append("select readID from splices")
+
+        if len(stmt) > 0:
+            selectPart = string.join(stmt, " union ")
+        else:
+            selectPart = ""
+
+        sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        sql.execute(sqlQuery)
+        result = sql.fetchall()
+
+        if paired:
+            return [x[0].split("/")[0] for x in result]
+        else:
+            return [x[0] for x in result]
+
+
+    def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
+        """ returns the uniq and spliced mismatches in a dictionary.
+        """
+        readlen = self.getReadSize()
+        if mischrom:
+            hitChromList = [mischrom]
+        else:
+            hitChromList = self.getChromosomes()
+            hitChromList.sort()
+
+        snpDict = {}
+        for achrom in hitChromList:
+            if verbose:
+                print "getting mismatches from chromosome %s" % (achrom)
+
+            snpDict[achrom] = []
+            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
+            if useSplices and self.dataType == "RNA":
+                spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
+                spliceIDList = spliceDict.keys()
+                for k in spliceIDList:
+                    spliceEntry = spliceDict[k][0]
+                    startpos = spliceEntry["startL"]
+                    lefthalf = spliceEntry["stopL"]
+                    rightstart = spliceEntry["startR"]
+                    sense = spliceEntry["sense"]
+                    mismatches = spliceEntry["mismatch"]
+                    spMismatchList = mismatches.split(",")
+                    for mismatch in spMismatchList:
+                        if "N" in mismatch:
+                            continue
+
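+                        # mismatch entries are of the form "A21G": reference base, 1-based position in the read, observed base;
+                        # minus-strand entries are reverse complemented and re-indexed from the other end of the read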
+                        change_len = len(mismatch)
+                        if sense == "+":
+                            change_from = mismatch[0]
+                            change_base = mismatch[change_len-1]
+                            change_pos = int(mismatch[1:change_len-1])
+                        elif sense == "-":
+                            change_from = getReverseComplement([mismatch[0]])
+                            change_base = getReverseComplement([mismatch[change_len-1]])
+                            change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+
+                        firsthalf = int(lefthalf)-int(startpos)+1
+                        secondhalf = 0
+                        if int(change_pos) <= int(firsthalf):
+                            change_at = startpos + change_pos - 1
+                        else:
+                            secondhalf = change_pos - firsthalf
+                            change_at = rightstart + secondhalf
+
+                        snpDict[achrom].append([startpos, change_at, change_base, change_from])
+
+            if achrom not in hitDict.keys():
+                continue
+
+            for readEntry in hitDict[achrom]:
+                start = readEntry["start"]
+                sense = readEntry["sense"]
+                mismatches = readEntry["mismatch"]
+                mismatchList = mismatches.split(",")
+                for mismatch in mismatchList:
+                    if "N" in mismatch:
+                        continue
+
+                    change_len = len(mismatch)
+                    if sense == "+":
+                        change_from = mismatch[0]
+                        change_base = mismatch[change_len-1]
+                        change_pos = int(mismatch[1:change_len-1])
+                    elif sense == "-":
+                        change_from = getReverseComplement([mismatch[0]])
+                        change_base = getReverseComplement([mismatch[change_len-1]])
+                        change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+
+                    change_at = start + change_pos - 1
+                    snpDict[achrom].append([start, change_at, change_base, change_from])
+
+        return snpDict
+
+
+    def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
+                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+                        keepStrand="both", shiftValue=0):
+        """return a profile of the chromosome as an array of per-base read coverage....
+            keepStrand = 'both', 'plusOnly', or 'minusOnly'.
+            Will also shift position of unique and multireads (but not splices) if shift is a natural number
+        """
+        metadata = self.getMetadata()
+        try:
+            readlen = int(metadata["readsize"])
+        except KeyError:
+            readlen = 0
+
+        dataType = metadata["dataType"]
+        scale = 1. / normalizationFactor
+        shift = {}
+        shift['+'] = int(shiftValue)
+        shift['-'] = -1 * int(shiftValue)
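+        # shift moves read starts toward higher coordinates on the plus strand and lower coordinates on the minus strand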
+
+        # size the coverage array to the requested region if one was given, otherwise to the whole chromosome
+        if cstop > 0:
+            lastNT = cstop - cstart + readlen + shift["+"]
+        else:
+            lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
+
+        chromModel = array("f",[0.] * lastNT)
+        hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
+        if cstart < 0:
+            cstart = 0
+
+        for readEntry in hitDict[chromosome]:
+            hstart = readEntry["start"]
+            sense = readEntry["sense"]
+            weight = readEntry["weight"]
+            hstart = hstart - cstart + shift[sense]
+            for currentpos in range(hstart,hstart+readlen):
+                try:
+                    if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
+                        chromModel[currentpos] += scale * weight
+                    elif sense == "-" and keepStrand != "plusOnly":
+                        chromModel[currentpos] -= scale * weight
+                except:
+                    continue
+
+        del hitDict
+        if useSplices and dataType == "RNA":
+            if cstop > 0:
+                spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
+            else:
+                spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
+
+            if chromosome in spliceDict:
+                for spliceEntry in spliceDict[chromosome]:
+                    Lstart = spliceEntry["startL"]
+                    Lstop = spliceEntry["stopL"]
+                    Rstart = spliceEntry["startR"]
+                    Rstop = spliceEntry["stopR"]
+                    rsense = spliceEntry["sense"]
+                    if (Rstop - cstart) < lastNT:
+                        for index in range(abs(Lstop - Lstart)):
+                            currentpos = Lstart - cstart + index
+                            # we only track unique splices
+                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+                                chromModel[currentpos] += scale
+                            elif rsense == "-" and keepStrand != "plusOnly":
+                                chromModel[currentpos] -= scale
+
+                        for index in range(abs(Rstop - Rstart)):
+                            currentpos = Rstart - cstart + index
+                            # we only track unique splices
+                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+                                chromModel[currentpos] += scale
+                            elif rsense == "-" and keepStrand != "plusOnly":
+                                chromModel[currentpos] -= scale
+
+            del spliceDict
+
+        return chromModel
+
+
+    def insertMetadata(self, valuesList):
+        """ inserts a list of (pname, pvalue) into the metadata
+        table.
+        """
+        self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def updateMetadata(self, pname, newValue, originalValue=""):
+        """ update a metadata field given the original value and the new value.
+        """
+        stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
+        if originalValue != "":
+            stmt += " and value='%s' " % str(originalValue)
+
+        self.dbcon.execute(stmt)
+        self.dbcon.commit()
+
+
+    def insertUniqs(self, valuesList):
+        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+        into the uniqs table.
+        """
+        self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def insertMulti(self, valuesList):
+        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+        into the multi table.
+        """
+        self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def insertSplices(self, valuesList):
+        """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
+        into the splices table.
+        """
+        self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
+        """ update reads on file database in a list region of regions for a chromosome to have a new flag.
+            regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
+            sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
+        """
+        restrict = ""
+        if sense != "both":
+            restrict = " and sense = ? "
+
+        if uniqs:
+            self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+        if multi:
+            self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+        if self.dataType == "RNA" and splices:
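+            # splice flags are appended with L:/R: markers rather than overwritten, since each half of a splice can fall in a different region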
+            self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
+            self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
+
+        self.dbcon.commit()
+
+
+    def setFlags(self, flag, uniqs=True, multi=True, splices=True):
+        """ set the flag fields in the entire dataset.
+        """
+        if uniqs:
+            self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
+
+        if multi:
+            self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
+
+        if self.dataType == "RNA" and splices:
+            self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
+
+        self.dbcon.commit()
+
+
+    def resetFlags(self, uniqs=True, multi=True, splices=True):
+        """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
+        """
+        self.setFlags("", uniqs, multi, splices)
+
+
+    def reweighMultireads(self, readList):
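+        # readList entries are (weight, chrom, start, readID) tuples matching the placeholders below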
+        self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
+
+
+    def setSynchronousPragma(self, value="ON"):
+        try:
+            self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
+        except:
+            print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
+
+
+    def setDBcache(self, cache, default=False):
+        self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
+        if default:
+            self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+
+
+    def execute(self, statement, returnResults=False):
+        sql = self.getSqlCursor()
+
+        sql.execute(statement)
+        if returnResults:
+            result = sql.fetchall()
+            return result
+
+
+    def executeCommit(self, statement):
+        self.execute(statement)
+
+        if self.memBacked:
+            self.memcon.commit()
+        else:
+            self.dbcon.commit()
+
+
+    def buildIndex(self, cache=100000):
+        """ Builds the file indeces for the main tables.
+            Cache is the number of 1.5 kb pages to keep in memory.
+            100000 pages translates into 150MB of RAM, which is our default.
+        """
+        if cache > self.getDefaultCacheSize():
+            self.setDBcache(cache)
+        self.setSynchronousPragma("OFF")
+        self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+        print "built uPosIndex"
+        self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
+        print "built uChromIndex"
+        self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+        print "built mPosIndex"
+        self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
+        print "built mChromIndex"
+
+        if self.dataType == "RNA":
+            self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
+            print "built sPosIndex"
+            self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
+            print "built sPosIndex2"
+            self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
+            print "built sChromIndex"
+
+        self.dbcon.commit()
+        self.setSynchronousPragma("ON")
+
+
+    def dropIndex(self):
+        """ drops the file indices for the main tables.
+        """
+        try:
+            self.setSynchronousPragma("OFF")
+            self.dbcon.execute("DROP INDEX uPosIndex")
+            self.dbcon.execute("DROP INDEX uChromIndex")
+            self.dbcon.execute("DROP INDEX mPosIndex")
+            self.dbcon.execute("DROP INDEX mChromIndex")
+
+            if self.dataType == "RNA":
+                self.dbcon.execute("DROP INDEX sPosIndex")
+                try:
+                    self.dbcon.execute("DROP INDEX sPosIndex2")
+                except:
+                    pass
+
+                self.dbcon.execute("DROP INDEX sChromIndex")
+
+            self.dbcon.commit()
+        except:
+            print "problem dropping index"
+
+        self.setSynchronousPragma("ON")
+
+
+    def memSync(self, chrom="", index=False):
+        """ makes a copy of the dataset into memory for faster access.
+        Can be restricted to a "full" chromosome. Can also build the
+        memory indices.
+        """
+        self.memcon = ""
+        self.memcon = sqlite.connect(":memory:")
+        self.initializeTables(self.memcon)
+        cursor = self.dbcon.cursor()
+        whereclause = ""
+        if chrom != "":
+            print "memSync %s" % chrom
+            whereclause = " where chrom = '%s' " % chrom
+            self.memChrom = chrom
+        else:
+            self.memChrom = ""
+
+        self.memcon.execute("PRAGMA temp_store = MEMORY")
+        self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
+        # copy metadata to memory
+        self.memcon.execute("delete from metadata")
+        results = cursor.execute("select name, value from metadata")
+        results2 = []
+        for row in results:
+            results2.append((row["name"], row["value"]))
+
+        self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
+
+        self.copyDBEntriesToMemory("uniqs", whereclause)
+        self.copyDBEntriesToMemory("multi", whereclause)
+        if self.dataType == "RNA":
+            self.copySpliceDBEntriesToMemory(whereclause)
+
+        if index:
+            if chrom != "":
+                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
+                self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
+                if self.dataType == "RNA":
+                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
+                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
+            else:
+                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+                self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+                if self.dataType == "RNA":
+                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
+                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
+
+        self.memBacked = True
+        self.memcon.row_factory = sqlite.Row
+        self.memcon.commit()
+
+
+    def copyDBEntriesToMemory(self, dbName, whereClause=""):
+        cursor = self.dbcon.cursor()
+        sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
+        destinationEntries = []
+        for row in sourceEntries:
+            destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+        self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
+
+
+    def copySpliceDBEntriesToMemory(self, whereClause=""):
+        cursor = self.dbcon.cursor()
+        sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
+        destinationEntries = []
+        for row in sourceEntries:
+            destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
+                                       row["weight"], row["flag"], row["mismatch"]))
+
+        self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
+
diff --git a/__init__.py b/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/altSpliceCounts.py b/altSpliceCounts.py
new file mode 100755 (executable)
index 0000000..1517ef8
--- /dev/null
@@ -0,0 +1,152 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+print 'version 3.6'
+
+import sys, optparse
+from commoncode import readDataset
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s rdsfile outfilename [--cache pages]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", type="int", dest="numCachePages",
+                      help="number of cache pages to use [default: 100000]")
+    parser.set_defaults(numCachePages=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    hitfile =  args[0]
+    outfilename = args[1]
+
+    if options.numCachePages is not None:
+        doCache = True
+        cachePages = options.numCachePages
+    else:
+        doCache = False
+        cachePages = 100000
+
+    altSpliceCounts(hitfile, outfilename, doCache, cachePages)
+
+
+def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
+    startDict = {}
+    stopDict = {}
+    resultDict = {}
+
+    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    readlen = hitRDS.getReadSize()
+    hitDict = hitRDS.getSplicesDict(noSense=True)
+    outfile = open(outfilename,'w')
+
+    for chrom in hitDict:
+        startDict[chrom] = []
+        stopDict[chrom] = []
+        resultDict[chrom] = []
+
+    index = 0
+    for chrom in hitDict:
+        for (tagStart, lstop, rstart, tagStop) in hitDict[chrom]:
+            index += 1
+            length = tagStop - tagStart
+            if length < readlen + 5:
+                continue
+
+            startDict[chrom].append((tagStart, length))
+            stopDict[chrom].append((tagStop, length))
+
+        startDict[chrom].sort()
+        stopDict[chrom].sort()
+
+    spliceEvent = 0
+    altSpliceEvent = 0
+    alternative = 1
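+    # two splices are called an alternative event when their left (or right) edges fall within a read
+    # length of each other but their spans differ by more than a read length; the scan is done once on
+    # left edges and once on right edges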
+    for chrom in startDict:
+        firstIndex = 0
+        maxIndex = len(startDict[chrom])
+        while firstIndex < maxIndex:
+            (fstart, flen) = startDict[chrom][firstIndex]
+            (start, length) = (fstart, flen)
+            secondIndex = firstIndex
+            secondLengths = []
+            while (start - fstart) < readlen:
+                if secondIndex >= maxIndex:
+                    break
+
+                (start, length) = startDict[chrom][secondIndex]
+                if (start - fstart) < readlen and abs(length - flen) > readlen:
+                    line =  (chrom, fstart, fstart + flen, chrom, start, start + length)
+                    alreadySeen = False
+                    for slength in secondLengths:
+                        if abs(slength - length) < readlen:
+                            alreadySeen = True
+
+                    if len(resultDict[chrom]) == 0:
+                        resultDict[chrom].append(line)
+                    elif line != resultDict[chrom][-1] and not alreadySeen:
+                        resultDict[chrom].append(line)
+                        secondLengths.append(length)
+                        altSpliceEvent += 1
+                        spliceEvent += 1
+
+                secondIndex += 1
+
+            firstIndex = secondIndex
+            spliceEvent += 1
+
+        firstIndex = 0
+        maxIndex = len(stopDict[chrom])
+        while firstIndex < maxIndex:
+            (fstop, flen) = stopDict[chrom][firstIndex]
+            (stop, length) = (fstop, flen)
+            secondIndex = firstIndex
+            secondLengths = []
+            while (stop - fstop) < readlen:
+                if secondIndex >= maxIndex:
+                    break
+                (stop, length) = stopDict[chrom][secondIndex]
+                if (stop - fstop) < readlen and abs(length - flen) > readlen:
+                    line = (chrom, fstop - flen, fstop, chrom, stop - length, stop)
+                    alreadySeen = False
+                    for slength in secondLengths:
+                        if abs(slength - length) < readlen:
+                            alreadySeen = True
+
+                    if len(resultDict[chrom]) == 0:
+                        resultDict[chrom].append(line)
+
+                    if line != resultDict[chrom][-1] and not alreadySeen:
+                        resultDict[chrom].append(line)
+                        secondLengths.append(length)
+                        altSpliceEvent += 1
+                        spliceEvent += 1
+
+                secondIndex += 1
+
+            firstIndex = secondIndex
+            spliceEvent += 1
+
+        resultDict[chrom].sort()
+        for line in resultDict[chrom]:
+            outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n'  % line)
+            alternative += 1
+
+        print chrom, maxIndex, spliceEvent, altSpliceEvent
+
+    print spliceEvent, altSpliceEvent
+    outfile.close()
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/analyzego.py b/analyzego.py
new file mode 100755 (executable)
index 0000000..d4f9f6f
--- /dev/null
@@ -0,0 +1,86 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    print "psyco not running"
+
+import sys, optparse
+from cistematic.cisstat.analyzego import calculateGOStats
+from cistematic.core.geneinfo import geneinfoDB
+
+print "version 2.1"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome infilename prefix [--geneName] [--field fieldID]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--geneName", action="store_true", dest="translateGene",
+                      help="translate gene")
+    parser.add_option("--field", type="int", dest="fieldID",
+                      help="column containing gene ID/Name")
+    parser.set_defaults(translateGene=False, fieldID=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    fieldID = 1
+    if options.translateGene:
+        fieldID = 0
+
+    if options.fieldID is not None:
+        fieldID = options.fieldID
+
+    genome = args[0]
+    infilename = args[1]
+    prefix = args[2]
+
+    analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
+
+
+def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
+    infile = open(infilename)
+    analyzeGO(genome, infile, prefix, translateGene, fieldID)
+    infile.close()
+
+
+def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
+    if translateGene:
+        idb = geneinfoDB(cache=True)
+        geneinfoDict = idb.getallGeneInfo(genome)
+        symbolToGidDict = {}
+        for gid in geneinfoDict:
+            symbol = geneinfoDict[gid][0][0].strip()
+            symbolToGidDict[symbol] = gid
+
+    locusList = []
+    for line in geneInfoList:
+        fields = line.split()
+        if translateGene:
+            gene = fields[fieldID]
+            if "LOC" in gene:
+                gID = gene[3:]
+            elif "FAR" in gene:
+                print "ignoring %s" % gene
+                continue
+            else:
+                try:
+                    gID = symbolToGidDict[gene]
+                except KeyError:
+                    print "ignoring %s" % gene
+                    continue
+        else:
+            gID = fields[fieldID]
+
+        if (genome, gID) not in locusList:
+            locusList.append((genome, gID))
+
+    if len(locusList) > 0:
+        calculateGOStats(locusList, prefix)
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/bedtoregion.py b/bedtoregion.py
new file mode 100755 (executable)
index 0000000..d6c44de
--- /dev/null
@@ -0,0 +1,35 @@
+import sys, string
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s factorlabel bedinfilename regionoutfile" % sys.argv[0]
+
+    if len(argv) < 4:
+        print usage
+        sys.exit(1)
+
+    factor = argv[1]
+    infilename = argv[2]
+    outfilename = argv[3]
+
+    bedToRegion(factor, infilename, outfilename)
+
+
+def bedToRegion(factor, infilename, outfilename):
+    index = 1
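+    # each BED line becomes one region named <factor><running index>, with the original fields re-joined by tabs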
+    infile = open(infilename)
+    outfile = open(outfilename, 'w')
+    for line in infile:
+        if 'track' in line:
+            continue
+        fields = line.split()
+        line = string.join(fields, '\t')
+        outfile.write('%s%d\t%s\n' % (factor, index, line))
+        index += 1
+    infile.close()
+    outfile.close()
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/binstocdf.py b/binstocdf.py
new file mode 100755 (executable)
index 0000000..9381866
--- /dev/null
@@ -0,0 +1,46 @@
+import sys
+
+print 'version 1.0'
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 3:
+        print 'usage: python %s infile outfile' % sys.argv[0]
+        sys.exit(1)
+
+    infilename = argv[1]
+    outfilename = argv[2]
+
+    binToCDF(infilename, outfilename)
+
+
+def binToCDF(infilename, outfilename):
+    infile = open(infilename)
+    outfile = open(outfilename, 'w')
+
+    for line in infile:
+        fields = line.strip().split()
+        if len(fields) < 4:
+            continue
+
+        total = int(fields[2])
+        if total == 0:
+            outfile.write(line)
+            continue
+
+        outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3]))
+        cum = 0
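+        # convert the remaining per-bin counts into cumulative percentages of the region total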
+        for bin in fields[4:]:
+            cum += int(bin)
+            percent = 100 * cum / total
+            outfile.write('\t%d' % percent)
+
+        outfile.write('\n')
+
+    infile.close()
+    outfile.close()
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/buildMatrix.py b/buildMatrix.py
new file mode 100755 (executable)
index 0000000..361f56e
--- /dev/null
@@ -0,0 +1,120 @@
+#
+#  buildMatrix.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 3/6/09.
+#
+import sys, string, optparse
+from commoncode import writeLog
+
+versionString = "%prog: version 1.3"
+print versionString
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog matrix.step.N-1 data.part matrix.step.N [--rescale] [--truncate maxRPKM] [--log altlogfile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--rescale", action="store_true", dest="rescale")
+    parser.add_option("--truncate", type="int", dest="maxRPKM")
+    parser.add_option("--log", dest="logfilename")
+    parser.set_defaults(rescale=False, maxRPKM=None, logfilename="buildMatrix.log")
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(0)
+
+    infile = args[0]
+    colfilename = args[1]
+    outfilename = args[2]
+
+    if options.maxRPKM is not None:
+        truncateRPKM = True
+        maxRPKM = options.maxRPKM
+    else:
+        truncateRPKM = False
+        maxRPKM = 100000000
+
+    buildMatrix(infile, colfilename, outfilename, truncateRPKM, maxRPKM,
+                options.rescale, options.logfilename)
+
+
+def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
+                maxRPKM=100000000, rescale=False, logfilename="buildMatrix.log"):
+
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+
+    if "/" in colfilename:
+        colname = colfilename.split("/")[-1]
+    else:
+        colname = colfilename
+
+    fileParts = colname.split(".")
+    colID =  fileParts[0]
+
+    infile = open(inFileName)
+    colfile = open(colfilename)
+    outfile = open(outfilename, "w")
+    header = infile.readline()[:-1]
+    if header.strip() == "":
+        header = "#\t"
+
+    outfile.write( "%s\t%s\n" % (header, colID))
+
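+    # first pass over the column file: collect values, truncating at maxRPKM if requested,
+    # and track min/max so the column can be rescaled to [0, 1]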
+    values = []
+    min = 20000000000.
+    max = -1.
+    untruncatedMax = -1.
+    for line in colfile:
+        if doNotProcessLine(line):
+            continue
+
+        fields = line.strip().split()
+        val = float(fields[-1])
+        if truncateRPKM and val > maxRPKM:
+            if val > untruncatedMax:
+                untruncatedMax = val
+
+            val = maxRPKM
+
+        values.append(val)
+        if val < min:
+            min = val
+
+        if val > max:
+            max = val
+
+    range = max - min
+    if rescale:
+        finalValues = [(val - min)/range for val in values]
+    else:
+        finalValues = values
+
+    for val in finalValues:
+        line = infile.readline().strip()
+        line += "\t%1.3f\n" % val
+        outfile.write(line)
+
+    outfile.close()
+
+    if untruncatedMax > 0:
+        max = untruncatedMax
+
+    message = "max value in %s was %.2f" % (colname, max)
+    if untruncatedMax > 0:
+        message += " but was truncated to %d" % maxRPKM
+
+    print message
+    writeLog(logfilename, versionString, message)
+
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/buildrmaskdb.py b/buildrmaskdb.py
new file mode 100755 (executable)
index 0000000..d1d6b00
--- /dev/null
@@ -0,0 +1,61 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+import sqlite3 as sqlite
+import os
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "version 2.0"
+    if len(argv) < 3:
+        print "usage: python %s rmaskdir rmaskdbfile" % argv[0]
+        sys.exit(1)
+
+    rmaskdir = argv[1]
+    rmaskdb = argv[2]
+
+    buildrmaskdb(rmaskdir, rmaskdb)
+
+
+def buildrmaskdb(rmaskdir, rmaskdb):
+    files = os.listdir(rmaskdir)
+    db = sqlite.connect(rmaskdb)
+    sql = db.cursor()
+    sql.execute("create table repeats (chrom varchar, start int, stop int, name varchar, family varchar)")
+    sql.execute("PRAGMA temp_store = MEMORY")
+    sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
+    db.commit()
+
+    for filename in files:
+        if "rmsk" not in filename:
+            continue
+
+        print filename
+        infile = open(rmaskdir + "/" + filename)
+        for entry in infile:
+            fields = entry.strip().split("\t")
+            chrom = fields[5][3:]
+            start = int(fields[6])
+            stop = int(fields[7])
+            name = fields[10]
+            family = fields[12]
+            stmt = "insert into repeats values('%s', %d, %d, '%s', '%s')" % (chrom, start, stop, name, family)
+            sql.execute(stmt)
+
+        db.commit()
+
+    print "building index..."
+    sql.execute("PRAGMA SYNCHRONOUS = OFF")
+    sql.execute("create index chromIndex on repeats(chrom)")
+    sql.execute("create index mainIndex on repeats(chrom, start, stop)")
+    db.commit()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/buildsnpdb.py b/buildsnpdb.py
new file mode 100755 (executable)
index 0000000..2510443
--- /dev/null
@@ -0,0 +1,95 @@
+"""
+creates table snp {chrom varchar,
+                   start int,
+                   stop int,
+                   name varchar,
+                   observed varchar,
+                   strand varchar,
+                   ucscref varchar,
+                   ncbiref varchar,
+                   func varchar,
+                   moltype varchar,
+                   valid varchar,
+                   class varchar
+}
+
+sample line in dbsnp file
+608   chr1    3093453 3093454 rs52602943      0       +       G       G        C/G   genomic single  unknown 0       0       unknown exact   1
+"""
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+import sys
+import sqlite3 as sqlite
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "version 2.0"
+    if len(argv) < 3:
+        print "usage: python %s snpfile snpdbname" % argv[0]
+        sys.exit(1)
+
+    snpfilename = argv[1]
+    snpdb = argv[2]
+
+    buildsnpdb(snpfilename, snpdb)
+
+
+def buildsnpdb(snpfilename, snpdb):
+    db = sqlite.connect(snpdb)
+    sql = db.cursor()
+    sql.execute("create table snp (chrom varchar, start long, stop long, name varchar, observed varchar, strand varchar, ucscref varchar, ncbiref varchar, func varchar, moltype varchar, valid varchar, class varchar)")
+    sql.execute("PRAGMA temp_store = MEMORY")
+    sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
+    db.commit()
+
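+    # accumulate rows and insert them in batches of insertSize to bound memory use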
+    insertSize = 100000
+    insertCounter = 0
+    valuesList = []
+    print snpfilename
+    infile = open(snpfilename)
+    for entry in infile:
+        try:
+            fields = entry.strip().split("\t")
+            chrom = fields[1][3:]
+            start = int(fields[2])
+            stop = int(fields[3])
+            name = fields[4]
+            strand = fields[6]
+            refNcbi = fields[7]
+            refUcsc = fields[8]
+            observed = fields[9]
+            molType = fields[10]
+            classes = fields[11]
+            valid = fields[12]
+            func = fields[15]
+
+            valuesList.append((chrom, start, stop, name, observed, strand, refUcsc, refNcbi, func, molType, valid, classes))
+            insertCounter += 1
+        except:
+            continue
+
+        if insertCounter % insertSize == 0:
+            print insertCounter
+            db.executemany("insert into snp values (?,?,?,?,?,?,?,?,?,?,?,?)", valuesList)
+            valuesList = []
+
+    if len(valuesList) > 0:
+        db.executemany("insert into snp values (?,?,?,?,?,?,?,?,?,?,?,?)", valuesList)
+
+    db.commit()
+
+    print "building index"
+    sql.execute("PRAGMA SYNCHRONOUS = OFF")
+    sql.execute("create index chromIndex on snp(chrom)")
+    sql.execute("create index mainIndex on snp(chrom,start,stop)")
+    db.commit()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/cdfdist.py b/cdfdist.py
new file mode 100755 (executable)
index 0000000..7166244
--- /dev/null
@@ -0,0 +1,37 @@
+import sys
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 4:
+        print "usage: python %s bins percent infile" % sys.argv[0]
+        sys.exit(1)
+
+    bins = int(argv[1])
+    percent = int(argv[2])
+    infilename = argv[3]
+
+    cdfDist(bins, percent, infilename)
+
+
+def cdfDist(bins, percent, infilename):
+    infile = open(infilename)
+    binsList = [0] * bins
+
+    for line in infile:
+        fields = line.strip().split()
+        index = 0
+        for binCdf in fields[-1 * bins:]:
+            if int(binCdf) > percent:
+                binsList[index] += 1
+                break
+
+            index += 1
+
+    infile.close()
+    print binsList
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/checkrmask.py b/checkrmask.py
new file mode 100755 (executable)
index 0000000..9f58983
--- /dev/null
@@ -0,0 +1,189 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sqlite3 as sqlite
+import sys, string, optparse
+import os.path
+from commoncode import writeLog
+
+versionString = "%prog: version 3.5"
+print versionString
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--startField", type="int", dest="startField")
+    parser.add_option("--log", dest="logfilename")
+    parser.set_defaults(cachePages=500000, startField=0, logfilename=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    dbfile = args[0]
+    filename = args[1]
+    outfile = args[2]
+    goodfile = args[3]
+
+    checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
+
+
+def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
+
+    outfile = open(outFileName, "w")
+    goodfile = open(goodFileName, "w")
+    if startField < 0:
+        startField = 0
+
+    if cachePages < 250000:
+        cachePages = 250000
+
+    doLog = False
+    if logfilename is not None:
+        writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+        doLog = True
+
+    infile = open(filename)
+    if os.path.isfile(dbfile):
+        db = sqlite.connect(dbfile)
+        sql = db.cursor()
+        sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+        sql.execute("PRAGMA temp_store = MEMORY")
+    else:
+        print "No database - passing through"
+        if doLog:
+            writeLog(logfilename, versionString, "No database - passing through")
+
+        for line in infile:
+            outfile.write("%s\tNR\tNR\t0.00\n" % line)
+            goodfile.write(line)
+
+        outfile.close()
+        goodfile.close()
+        sys.exit(0)
+
+    featureList = []
+    featureDict = {}
+
+    for line in infile:
+        if line[0] == "#":
+            continue
+
+        fields = line.strip().split("\t")
+        chrom = fields[startField][3:]
+        start = int(fields[startField + 1])
+        stop = int(fields[startField + 2])
+        featureList.append((chrom,start, stop))
+        featureDict[(chrom, start, stop)] = line.strip()
+
+    infile.close()
+
+    featureList.sort()
+    currentChrom = ""
+    currentMax = 0
+    increment = 20000000
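+    # walk the features in sorted order, caching repeats from the on-disk table into an
+    # in-memory table one 20 Mb window at a time so the per-feature overlap queries stay fast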
+    for (chrom, start, stop) in featureList:
+        if chrom != currentChrom:
+            currentMax = 0
+
+        if start > currentMax:
+            currentChrom = chrom
+            currentMin = currentMax
+            currentMax += increment
+            print "caching %s from %d to %d" % (chrom, currentMin, currentMax)
+            try:
+                del con
+            except:
+                pass
+
+            con = sqlite.connect(":memory:")
+            sql.execute("select start, stop, name, family from repeats where chrom = '%s' and start >= %d and start <= %d order by start" % (chrom, currentMin, currentMax + 10000))
+            results = sql.fetchall()
+            results2 = []
+            con.execute("create table repeats(name, family, start, stop)")
+            con.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+            con.execute("PRAGMA temp_store = MEMORY")
+            for (rstart, rstop, name, family) in results:
+                results2.append((name, family, int(rstart), int(rstop)))
+
+            con.executemany("insert into repeats(name, family, start, stop) values (?,?,?,?)", results2)
+            con.execute("CREATE INDEX posIndex on repeats(start, stop)")
+            print chrom, len(results2)
+            sql2 = con.cursor()
+
+        featureLength = abs(stop - start)
+        results = []
+        finalresults = []
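+        # four overlap queries: repeats covering the feature start, covering the stop, spanning
+        # the whole feature, and contained within it; each match contributes its overlap as a
+        # fraction of the feature length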
+        sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (start, start))
+        results = sql2.fetchall()
+        for (rstart, rstop, name, family) in results:
+            overlapLength = float(abs(rstop - start))
+            if overlapLength > featureLength:
+                overlapLength = featureLength
+
+            ratio = overlapLength / featureLength
+            if (name, family, ratio) not in finalresults:
+                finalresults.append((name, family, ratio))
+
+        sql2.execute("select start, stop, name, family from repeats where start < %d and stop > %d" % (stop, stop))
+        results = sql2.fetchall()
+        for (rstart, rstop, name, family) in results:
+            overlapLength = float(abs(rstart - stop))
+            if overlapLength > featureLength:
+                overlapLength = featureLength
+
+            ratio = overlapLength / featureLength
+            if (name, family, ratio) not in finalresults:
+                finalresults.append((name, family, ratio))
+
+        sql2.execute("select start, stop, name, family from repeats where start <= %d and stop >= %d" % (start, stop))
+        results = sql2.fetchall()
+        for (rstart, rstop, name, family) in results:
+            overlapLength = float(abs(rstop - rstart))
+            if overlapLength > featureLength:
+                overlapLength = featureLength
+
+            ratio = overlapLength / featureLength
+            if (name, family, ratio) not in finalresults:
+                finalresults.append((name, family, ratio))
+
+        sql2.execute("select start, stop, name, family from repeats where start >= %d and stop <= %d" % (start, stop))
+        results = sql2.fetchall()
+        for (rstart, rstop, name, family) in results:
+            overlapLength = float(abs(rstop - rstart))
+            if overlapLength > featureLength:
+                overlapLength = featureLength
+
+            ratio = overlapLength / featureLength
+            if (name, family, ratio) not in finalresults:
+                finalresults.append((name, family, ratio))
+
+        line = featureDict[(chrom, start, stop)]
+        total = 0.
+        for (name, family, fraction) in finalresults:
+            outline = "%s\t%s\t%s\t%2.2f" % (line, name, family, fraction)
+            total += fraction
+            print outline
+            outfile.write(outline + "\n")
+
+        if len(finalresults) == 0:
+            outline = "%s\tNR\tNR\t%0.00" % line
+            print outline
+            outfile.write(outline + "\n")
+
+        if total < 0.2:
+            goodfile.write(line + "\n")
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/chiapet/.svn/entries b/chiapet/.svn/entries
new file mode 100644 (file)
index 0000000..595b06e
--- /dev/null
@@ -0,0 +1,142 @@
+10
+
+dir
+23
+file:///Users/sau/svn/repos/erange/source/Erange/chiapet
+file:///Users/sau/svn/repos
+
+
+
+2010-10-01T18:32:26.347691Z
+22
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+d8abe7b5-3c2c-4fba-ae09-e6a8aa828af9
+\f
+segregateLinkers.py
+file
+
+
+
+
+2010-09-15T19:01:49.000000Z
+a847d39676e6a4fb9501811ab9a4c0b9
+2010-09-15T19:02:48.670738Z
+21
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2380
+\f
+__init__.py
+file
+
+
+
+
+2010-09-15T19:01:49.000000Z
+d41d8cd98f00b204e9800998ecf8427e
+2010-09-15T19:02:48.670738Z
+21
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+0
+\f
+markLinkers.py
+file
+
+
+
+
+
+10c527dc803a21ba14dfd8efc4f1e3d3
+2010-10-01T18:32:26.347691Z
+22
+sau
+\f
+linkers.fa
+file
+
+
+
+
+2010-09-15T19:01:49.000000Z
+2b64087c826083f04e0ff968312e019a
+2010-09-15T19:02:48.670738Z
+21
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+63
+\f
diff --git a/chiapet/.svn/text-base/__init__.py.svn-base b/chiapet/.svn/text-base/__init__.py.svn-base
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/chiapet/.svn/text-base/linkers.fa.svn-base b/chiapet/.svn/text-base/linkers.fa.svn-base
new file mode 100644 (file)
index 0000000..290c98b
--- /dev/null
@@ -0,0 +1,4 @@
+>linker_b.1
+GTTGGATAAGATATCGCGG
+>linker_b.2
+GTTGGAATGTATATCGCGG
\ No newline at end of file
diff --git a/chiapet/.svn/text-base/markLinkers.py.svn-base b/chiapet/.svn/text-base/markLinkers.py.svn-base
new file mode 100644 (file)
index 0000000..a2a97e6
--- /dev/null
@@ -0,0 +1,68 @@
+import sys
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    linkerfile = argv[1]
+    infile = argv[2]
+    outfile = argv[3]
+
+    markLinkers(linkerfile, infile, outfile)
+
+
+def markLinkers(linkerFileName, inFileName, outFileName):
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+    linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName)
+
+    for line in infile:
+        if len(line) < 2:
+            continue
+
+        if "@" in line:
+            readID = line.strip()
+            readID = readID.replace("@", "")
+        else:
+            found = False
+            for linkerID in linkerList:
+                position = line.find(linkerDict[linkerID])
+                if position >= 19:
+                    found = True
+                    outfile.write(">L%s_%s\n" % (linkerID[-1:], readID))
+                    outfile.write("%s\n" % line[:20])
+
+                if not found:
+                    outfile.write(">NA_%s\n" % readID)
+                    outfile.write("%s\n" % line[:20])
+
+
+def getLinkerInformationFromFile(linkerFileName):
+    linkerDict = {}
+    linkerList = []
+    try:
+        linkerfile = open(linkerFileName)
+        return getLinkerInformation(linkerfile)
+    except IOError:
+        return linkerDict, linkerList
+
+
+def getLinkerInformation(linkerInformationList):
+    linkerDict = {}
+    linkerList = []
+
+    for entry in linkerInformationList:
+        if ">" in entry:
+            linkerID = entry.strip()
+            linkerID = linkerID[1:]
+            linkerList.append(linkerID)
+        else:
+            sequence = entry.strip()
+            linkerDict[linkerID] = sequence[:10]
+
+    return linkerDict, linkerList
+
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/chiapet/.svn/text-base/segregateLinkers.py.svn-base b/chiapet/.svn/text-base/segregateLinkers.py.svn-base
new file mode 100644 (file)
index 0000000..3d213da
--- /dev/null
@@ -0,0 +1,88 @@
+import sys
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    infile1 = argv[1]
+    infile2 = argv[2]
+    outprefix = argv[3]
+
+    segregateLinkers(infile1, infile2, outprefix)
+
+
+def segregateLinkers(infile1name, infile2name, outprefix):
+    infile1 = open(infile1name)
+    infile2 = open(infile2name)
+    same1 = 0
+    same2 = 0
+    mixed = 0
+    hasNA = 0
+
+    outsame1 = open("%s.same1.fa" % outprefix, "w")
+    outsame2 = open("%s.same2.fa" % outprefix, "w")
+    outNA = open("%s.NA.fa" % outprefix, "w")
+    outmixed = open("%s.mixed.fa" % outprefix, "w")
+
+    lines1 = infile1.readlines()
+
+    failed = False
+    for line1 in lines1:
+        line2 = infile2.readline()
+        if failed:
+            line2 = infile2.readline()
+            print line1.strip()
+            print line2.strip()
+            sys.exit(1)
+            continue
+
+        if ">" in line1:
+            try:
+                (linker1, readid1) = line1.split("_")
+                (linker2, readid2) = line2.split("_")
+                shortid1 = readid1.split("/")[0]
+                shortid2 = readid2.split("/")[0]
+                if shortid1 != shortid2:
+                    print shortid1, shortid2
+                    sys.exit(1)
+
+                failed = False
+            except:
+                print line1.strip()
+                print line2.strip()
+                failed = True
+
+            continue
+
+        if "NA" in linker1 or "NA" in linker2:
+            hasNA += 1
+            outNA.write("%s_%s%s" % (linker1, readid1, line1))
+            outNA.write("%s_%s%s" % (linker2, readid2, line2))
+        elif linker1 == linker2:
+            if "L1" in linker1:
+                same1 += 1
+                outsame1.write("%s_%s%s" % (linker1, readid1, line1))
+                outsame1.write("%s_%s%s" % (linker2, readid2, line2))
+            else:
+                same2 += 1
+                outsame2.write("%s_%s%s" % (linker1, readid1, line1))
+                outsame2.write("%s_%s%s" % (linker2, readid2, line2))
+        else:
+            mixed += 1
+            outmixed.write("%s_%s%s" % (linker1, readid1, line1))
+            outmixed.write("%s_%s%s" % (linker2, readid2, line2))
+
+    print same1
+    print same2
+    print mixed
+    print hasNA
+
+    outmixed.close()
+    outNA.close()
+    outsame1.close()
+    outsame2.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/chiapet/.svn/tmp/markLinkers.py.tmp b/chiapet/.svn/tmp/markLinkers.py.tmp
new file mode 100644 (file)
index 0000000..a2a97e6
--- /dev/null
@@ -0,0 +1,68 @@
+import sys
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    linkerfile = argv[1]
+    infile = argv[2]
+    outfile = argv[3]
+
+    markLinkers(linkerfile, infile, outfile)
+
+
+def markLinkers(linkerFileName, inFileName, outFileName):
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+    linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName)
+
+    for line in infile:
+        if len(line) < 2:
+            continue
+
+        if "@" in line:
+            readID = line.strip()
+            readID = readID.replace("@", "")
+        else:
+            found = False
+            for linkerID in linkerList:
+                position = line.find(linkerDict[linkerID])
+                if position >= 19:
+                    found = True
+                    outfile.write(">L%s_%s\n" % (linkerID[-1:], readID))
+                    outfile.write("%s\n" % line[:20])
+
+                if not found:
+                    outfile.write(">NA_%s\n" % readID)
+                    outfile.write("%s\n" % line[:20])
+
+
+def getLinkerInformationFromFile(linkerFileName):
+    linkerDict = {}
+    linkerList = []
+    try:
+        linkerfile = open(linkerFileName)
+        return getLinkerInformation(linkerfile)
+    except IOError:
+        return linkerDict, linkerList
+
+
+def getLinkerInformation(linkerInformationList):
+    linkerDict = {}
+    linkerList = []
+
+    for entry in linkerInformationList:
+        if ">" in entry:
+            linkerID = entry.strip()
+            linkerID = linkerID[1:]
+            linkerList.append(linkerID)
+        else:
+            sequence = entry.strip()
+            linkerDict[linkerID] = sequence[:10]
+
+    return linkerDict, linkerList
+
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/chiapet/__init__.py b/chiapet/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/chiapet/linkers.fa b/chiapet/linkers.fa
new file mode 100644 (file)
index 0000000..290c98b
--- /dev/null
@@ -0,0 +1,4 @@
+>linker_b.1
+GTTGGATAAGATATCGCGG
+>linker_b.2
+GTTGGAATGTATATCGCGG
\ No newline at end of file
diff --git a/chiapet/markLinkers.py b/chiapet/markLinkers.py
new file mode 100644 (file)
index 0000000..1f7c675
--- /dev/null
@@ -0,0 +1,67 @@
+import sys
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    linkerfile = argv[1]
+    infile = argv[2]
+    outfile = argv[3]
+
+    markLinkers(linkerfile, infile, outfile)
+
+
+def markLinkers(linkerFileName, inFileName, outFileName):
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+    linkerDict, linkerList = getLinkerInformationFromFile(linkerFileName)
+
+    for line in infile:
+        if len(line) < 2:
+            continue
+
+        if "@" in line:
+            readID = line.strip()
+            readID = readID.replace("@", "")
+        else:
+            found = False
+            for linkerID in linkerList:
+                position = line.find(linkerDict[linkerID])
+                if position >= 19:
+                    found = True
+                    outfile.write(">L%s_%s\n" % (linkerID[-1:], readID))
+                    outfile.write("%s\n" % line[:20])
+
+            # if no linker matched anywhere in the read, emit it once with an NA label
+            if not found:
+                outfile.write(">NA_%s\n" % readID)
+                outfile.write("%s\n" % line[:20])
+
+
+def getLinkerInformationFromFile(linkerFileName):
+    linkerDict = {}
+    linkerList = []
+    try:
+        linkerfile = open(linkerFileName)
+        return getLinkerInformation(linkerfile)
+    except IOError:
+        return linkerDict, linkerList
+
+
+def getLinkerInformation(linkerInformationList):
+    linkerDict = {}
+    linkerList = []
+
+    for entry in linkerInformationList:
+        if ">" in entry:
+            linkerID = entry.strip()
+            linkerID = linkerID[1:]
+            linkerList.append(linkerID)
+        else:
+            sequence = entry.strip()
+            linkerDict[linkerID] = sequence[:10]
+
+    return linkerDict, linkerList
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/chiapet/segregateLinkers.py b/chiapet/segregateLinkers.py
new file mode 100644 (file)
index 0000000..3d213da
--- /dev/null
@@ -0,0 +1,88 @@
+import sys
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    infile1 = argv[1]
+    infile2 = argv[2]
+    outprefix = argv[3]
+
+    segregateLinkers(infile1, infile2, outprefix)
+
+
+def segregateLinkers(infile1name, infile2name, outprefix):
+    infile1 = open(infile1name)
+    infile2 = open(infile2name)
+    same1 = 0
+    same2 = 0
+    mixed = 0
+    hasNA = 0
+
+    outsame1 = open("%s.same1.fa" % outprefix, "w")
+    outsame2 = open("%s.same2.fa" % outprefix, "w")
+    outNA = open("%s.NA.fa" % outprefix, "w")
+    outmixed = open("%s.mixed.fa" % outprefix, "w")
+
+    lines1 = infile1.readlines()
+
+    failed = False
+    for line1 in lines1:
+        line2 = infile2.readline()
+        if failed:
+            line2 = infile2.readline()
+            print line1.strip()
+            print line2.strip()
+            sys.exit(1)
+            continue
+
+        if ">" in line1:
+            try:
+                (linker1, readid1) = line1.split("_")
+                (linker2, readid2) = line2.split("_")
+                shortid1 = readid1.split("/")[0]
+                shortid2 = readid2.split("/")[0]
+                if shortid1 != shortid2:
+                    print shortid1, shortid2
+                    sys.exit(1)
+
+                failed = False
+            except ValueError:
+                print line1.strip()
+                print line2.strip()
+                failed = True
+
+            continue
+
+        if "NA" in linker1 or "NA" in linker2:
+            hasNA += 1
+            outNA.write("%s_%s%s" % (linker1, readid1, line1))
+            outNA.write("%s_%s%s" % (linker2, readid2, line2))
+        elif linker1 == linker2:
+            if "L1" in linker1:
+                same1 += 1
+                outsame1.write("%s_%s%s" % (linker1, readid1, line1))
+                outsame1.write("%s_%s%s" % (linker2, readid2, line2))
+            else:
+                same2 += 1
+                outsame2.write("%s_%s%s" % (linker1, readid1, line1))
+                outsame2.write("%s_%s%s" % (linker2, readid2, line2))
+        else:
+            mixed += 1
+            outmixed.write("%s_%s%s" % (linker1, readid1, line1))
+            outmixed.write("%s_%s%s" % (linker2, readid2, line2))
+
+    print "same1: %d" % same1
+    print "same2: %d" % same2
+    print "mixed: %d" % mixed
+    print "NA: %d" % hasNA
+
+    infile1.close()
+    infile2.close()
+    outmixed.close()
+    outNA.close()
+    outsame1.close()
+    outsame2.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/chkSNPrmask.py b/chkSNPrmask.py
new file mode 100755 (executable)
index 0000000..498ef49
--- /dev/null
+++ b/chkSNPrmask.py
@@ -0,0 +1,131 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sqlite3 as sqlite
+import sys
+import tempfile, shutil, os, optparse
+from os import environ
+
+if environ.get("CISTEMATIC_TEMP"):
+    cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+    cisTemp = "/tmp"
+tempfile.tempdir = cisTemp
+
+print "version 3.3: %prog"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s dbfile snpsfile nr_snps_outfile [--cache numPages] [--repeats]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--repeats", action="store_true", dest="repeats")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(repeats=False, cachePages=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    dbfile = args[0]
+    filename = args[1]
+    outfile = args[2]
+
+    chkSNPrmask(dbfile, filename, outfile, options.repeats, options.cachePages)
+
+
+def chkSNPrmask(dbfile, filename, outfile, repeats=False, cachePages=None):
+    print dbfile
+
+    if cachePages is not None:
+        if cachePages < 250000:
+            cachePages = 250000
+
+        print "caching locally..."
+        cachefile = tempfile.mktemp() + ".db"
+        shutil.copyfile(dbfile, cachefile)
+        db = sqlite.connect(cachefile)
+        doCache = True
+        print "cached..."
+    else:
+        cachePages = 500000
+        doCache = False
+        db = sqlite.connect(dbfile)
+
+    sql = db.cursor()
+    sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+    sql.execute("PRAGMA temp_store = MEMORY")
+    sql.execute("ANALYZE")
+
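+    # The SNP input is expected to be tab-delimited with a "chrN" name in the third
+    # field and the position in the fourth; "#" comment lines are skipped. Each site
+    # is then checked against the repeats table of the repeat-mask database: with
+    # --repeats the repeat family is appended to the output line, otherwise sites
+    # that fall inside a repeat are dropped so that only non-repeat SNPs are written.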
+    infile = open(filename)
+    featureList = []
+    featureDict = {}
+
+    for line in infile:
+        if doNotProcessLine(line):
+            continue
+
+        fields = line.strip().split("\t")
+        chrom = fields[2][3:]
+        pos = int(fields[3])
+        featureList.append((chrom,pos))
+        featureDict[(chrom, pos)] = line.strip()
+
+    featureList.sort()
+
+    index = 0
+    currentChrom=None
+    for (chrom, pos) in featureList:
+        index += 1
+        if chrom != currentChrom:
+            print "\n%s" % chrom
+            currentChrom = chrom
+
+        results = []
+        try:
+            sql.execute("select family from repeats where chrom = '%s' and %d between start and stop" % (chrom, pos)) 
+            results = sql.fetchall()
+        except:
+            pass
+
+        if repeats: # if user wants to keep track of the SNPs in repeats
+            featureDict[(chrom,pos)] += "\tN\A" 
+            for x in results:
+                featureDict[(chrom,pos)] += "\t" + str(x)
+        else:
+            if results:
+                featureDict.pop((chrom, pos), None)
+
+        if index % 100 == 0:
+            print ".",
+            sys.stdout.flush()
+
+    if doCache:
+        print "removing cache"
+        del db
+        os.remove(cachefile)
+
+    outFile = open(outfile, "w") 
+    for key, value in featureDict.iteritems():
+        outStr = str(value) + "\n"
+        outFile.write(outStr)
+
+    outFile.close()
+
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/chksnp.py b/chksnp.py
new file mode 100755 (executable)
index 0000000..daf6b0a
--- /dev/null
+++ b/chksnp.py
@@ -0,0 +1,169 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+import optparse
+import tempfile
+import shutil
+import os
+import string
+import sqlite3 as sqlite
+
+print "version 3.6: %s" % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog dbfile snpsfile dbsnp_outfile [--cache numPages] [--snpDB dbfile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--snpDB", action="append", dest="snpDBList",
+                      help="additional snp db files to check will be searched in order given")
+    parser.set_defaults(cachePages=None, snpDBList=[])
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    dbfile = args[0]
+    infile = args[1]
+    outfile = args[2]
+
+    chkSNPFile(dbfile, infile, outfile, options.cachePages, options.snpDBList)
+
+
+def chkSNPFile(dbfile, inputFileName, outputFileName, cachePages=None, snpDBList=[]):
+
+    snpInputFile = open(inputFileName)
+    snpLocationList, snpDict = getSNPLocationInfo(snpInputFile)
+
+    dbList = [dbfile]
+    for dbFileName in snpDBList:
+        dbList.append(dbFileName)
+
+    annotatedSnpDict = annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages)
+
+    outputFile = open(outputFileName, "w")
+    for key,value in annotatedSnpDict.iteritems():
+        outputLine = "%s\n" % str(value)
+        outputFile.write(outputLine)
+
+    outputFile.close()
+
+
+def chkSNP(dbList, snpPropertiesList, cachePages=None):
+
+    snpLocationList, snpDict = getSNPLocationInfo(snpPropertiesList)
+    return annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages)
+
+
+def getSNPLocationInfo(snpPropertiesList):
+    snpLocationList = []
+    snpDict = {}
+
+    for line in snpPropertiesList:
+        if doNotProcessLine(line):
+            continue
+
+        fields = line.strip().split("\t")
+        chromosome = fields[2][3:]
+        position = int(fields[3])
+        snpLocation = (chromosome, position)
+        snpLocationList.append(snpLocation)
+        snpDict[snpLocation] = line.strip()
+
+    snpLocationList.sort()
+
+    return snpLocationList, snpDict
+
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def annotateSNPFromDB(snpLocationList, snpDict, dbFileName, cachePages=None):
+    return annotateSNPFromDBList(snpLocationList, snpDict, [dbFileName], cachePages)
+
+
+def annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=None):
+    if os.environ.get("CISTEMATIC_TEMP"):
+        cisTemp = os.environ.get("CISTEMATIC_TEMP")
+    else:
+        cisTemp = "/tmp"
+
+    tempfile.tempdir = cisTemp
+
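+    # For every SNP database in dbList (searched in the order given), each remaining
+    # site is looked up twice: first for an exact start/stop match, then, failing
+    # that, for any entry spanning the position. Annotated sites get the dbSNP name
+    # and function appended and are dropped from later lookups; anything still
+    # unannotated after all databases is tagged with "N\A" placeholders.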
+    for dbFileName in dbList:
+        if cachePages is not None:
+            print "caching locally..."
+            cachefile = "%s.db" % tempfile.mktemp()
+            shutil.copyfile(dbFileName, cachefile)
+            db = sqlite.connect(cachefile)
+            doCache = True
+            print "cached..."
+        else:
+            db = sqlite.connect(dbFileName)
+            doCache = False
+
+        cacheSize = max(cachePages, 500000)
+        sql = db.cursor()
+        sql.execute("PRAGMA CACHE_SIZE = %d" % cacheSize)
+        sql.execute("PRAGMA temp_store = MEMORY")
+
+        index = 0
+        foundEntries = []
+        for chromosomePosition in snpLocationList:
+            (chromosome, position) = chromosomePosition
+            found = False
+            results = []
+            index += 1
+            startPosition = position - 1
+            sql.execute("select func, name from snp where chrom = '%s' and start = %d and stop = %d" % (chromosome, startPosition, position)) 
+            results = sql.fetchall()
+            try:
+                (func, name) = results[0]
+                found = True
+            except IndexError:
+                sql.execute("select func, name from snp where chrom = '%s' and start <= %d and stop >= %d" % (chromosome, startPosition, position))
+                results = sql.fetchall()
+                try:
+                    (func, name) = results[0]
+                    found = True
+                except IndexError:
+                    pass
+
+            if found:
+                snpEntry = snpDict[chromosomePosition]
+                snpDict[chromosomePosition] = string.join([snpEntry, str(name), str(func)], "\t")
+                foundEntries.append(chromosomePosition)
+
+            if index % 100 == 0:
+                print ".",
+                sys.stdout.flush()
+
+        for chromosomePosition in foundEntries:
+            del snpLocationList[snpLocationList.index(chromosomePosition)]
+
+        if doCache:
+            print "\nremoving cache"
+            del db
+            os.remove(cachefile)
+
+    for chromosomePosition in snpLocationList:
+        snpEntry = snpDict[chromosomePosition]
+        snpDict[chromosomePosition] = string.join([snpEntry, "N\A", "N\A"], "\t")
+
+    return snpDict
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/colsum.py b/colsum.py
new file mode 100755 (executable)
index 0000000..703bd5c
--- /dev/null
+++ b/colsum.py
@@ -0,0 +1,39 @@
+import sys
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "version 1.2"
+    if len(argv) < 3:
+        print "usage: python %s field filename" % argv[0]
+        print "\n\tfields are counted starting at zero.\n"
+        sys.exit(1)
+
+    fieldID = int(argv[1])
+    filename = argv[2]
+
+    count = colsum(fieldID, filename)
+    print count
+
+
+def colsum(fieldID, filename):
+    infile = open(filename)
+    count = 0
+
+    for line in infile:
+        fields = line.strip().split()
+        try:
+            if "." in fields[fieldID]:
+                count += float(fields[fieldID])
+            else:
+                count += int(fields[fieldID])
+        except ValueError:
+            pass
+
+    infile.close()
+    return count
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/combineRPKMs.py b/combineRPKMs.py
new file mode 100755 (executable)
index 0000000..8fd8f9f
--- /dev/null
+++ b/combineRPKMs.py
@@ -0,0 +1,87 @@
+#
+#  combineRPKMs.py
+#  ENRAGE
+#
+
+print 'version 1.0'
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]"
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
+    parser.set_defaults(doFraction=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    firstfile = args[0]
+    expandedfile = args[1]
+    finalfile = args[2]
+    outfile = args[3]
+
+    combineRPKMs(firstfile, expandedfile, finalfile, outfile, options.doFraction)
+
+
+def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
+    firstfile = open(firstfileName)
+    expandedfile = open(expandedfileName)
+    finalfile = open(finalfileName)
+    outfile = open(outfileName, "w")
+
+    firstDict = {}
+    gidDict = {}
+    expandedDict = {}
+
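+    # The three RPKM files are joined on the gene name: field 1 of the first and
+    # expanded files (RPKM in the last field, gid in field 0 of the expanded file)
+    # and field 0 of the final file, which also supplies RNAkb, the final RPKM and,
+    # with --withmultifraction, the multiread fraction.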
+    for line in firstfile:
+        fields = line.strip().split()
+        firstDict[fields[1]] = fields[-1]
+
+    firstfile.close()
+
+    for line in expandedfile:
+        fields = line.strip().split()
+        expandedDict[fields[1]] = fields[-1]
+        gidDict[fields[1]] = fields[0]
+
+    expandedfile.close()
+
+    if doFraction:
+        header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
+    else:
+        header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\n"
+
+    outfile.write(header)
+
+    for line in finalfile:
+        fields = line.strip().split()
+        gene = fields[0]
+        rnakb = fields[1]
+        finalRPKM = fields[2]
+        firstRPKM = firstDict.get(gene, "")
+        outline = "%s\t%s\t%s\t%s\t%s\t%s" % (gidDict.get(gene, ""), rnakb, gene, firstRPKM, expandedDict.get(gene, ""), finalRPKM)
+
+        if doFraction:
+            fraction = fields[3]
+            outline += "\t%s" % fraction
+    
+        outfile.write(outline + '\n')
+
+    finalfile.close()
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/combinerds.py b/combinerds.py
new file mode 100755 (executable)
index 0000000..7eac48a
--- /dev/null
+++ b/combinerds.py
@@ -0,0 +1,121 @@
+#
+#  combinerds.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+from commoncode import readDataset
+
+print '%s: version 1.1' % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 2:
+        print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages] [--flag flagValue]' % argv[0]
+        #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n'
+        sys.exit(1)
+
+    doCache = False
+    cachePages = -1
+    if '--cache' in argv:
+        doCache = True
+        try:
+            cachePages = int(argv[argv.index('--cache') + 1])
+        except (IndexError, ValueError):
+            pass
+
+    datafile = argv[1]
+    infileList = []
+    for index in range(2, len(argv)):
+        if argv[index][0] == '-':
+            break
+        infileList.append(argv[index])
+
+    print "destination RDS: %s" % datafile
+
+    if '--initrna' in argv:
+        rds = readDataset(datafile, initialize=True, datasetType='RNA')
+    elif '--init' in argv:
+        rds = readDataset(datafile, initialize=True)
+
+    withFlag = ''
+    if '--flag' in argv:
+        withFlag = argv[argv.index('--flag') + 1]
+        print "restrict to flag = %s" % withFlag
+
+    rds = readDataset(datafile, verbose=True, cache=doCache)
+
+    if cachePages > rds.getDefaultCacheSize():
+        rds.setDBcache(cachePages)
+        cacheVal = cachePages
+    else:
+        cacheVal = rds.getDefaultCacheSize()
+
+    doIndex = False
+    if '--index' in argv:
+        doIndex = True
+
+    tableList = []
+    if '--table' in argv:
+        tableList.append(argv[argv.index('--table') + 1])
+    else:
+        tableList = rds.getTables()
+
+    combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache)
+
+
+def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False):
+    metaDict = rds.getMetadata()
+    if "numberImports" not in metaDict:
+        origIndex = 0
+        rds.insertMetadata([("numberImports", str(0))])
+    else:
+        origIndex = int(metaDict["numberImports"])
+
+    index = origIndex
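+    # Each input RDS file is attached and its tables copied into the destination:
+    # read IDs are prefixed with "inputN" so they stay unique across imports, and
+    # metadata values are tagged with "(import_N)". The numberImports entry records
+    # how many files have been folded in so far.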
+    for inputfile in infileList:
+        asname = "input" + str(index)
+        rds.attachDB(inputfile,asname)
+        for table in tableList:
+            print "importing table %s from file %s" % (table, inputfile)
+            ascols = "*"
+            if table == "uniqs":
+                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+            elif table == "multi":
+                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+            elif table == "splices":
+                ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname
+            elif table == "metadata":
+                ascols = "name, value || ' (import_%d)'" % index
+                rds.importFromDB(asname, table, ascols)
+
+            if table != "metadata":
+                rds.importFromDB(asname, table, ascols, withFlag)
+
+        rds.detachDB(asname)
+        rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))])
+        index += 1
+
+    rds.updateMetadata("numberImports", index, origIndex)
+    if doIndex:
+        print "building index...."
+        if cacheVal > 0:
+            rds.buildIndex(cacheVal)
+        else:
+            rds.buildIndex()
+
+    if doCache:
+        rds.saveCacheDB(datafile)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/commoncode.py b/commoncode.py
new file mode 100755 (executable)
index 0000000..9d86473
--- /dev/null
+++ b/commoncode.py
@@ -0,0 +1,2068 @@
+#
+#  commoncode.py
+#  ENRAGE
+#
+
+import tempfile
+import shutil
+import os
+from os import environ
+import string
+import sqlite3 as sqlite
+from time import strftime
+from array import array
+from collections import defaultdict
+
+commoncodeVersion = 5.5
+currentRDSversion = 1.1
+
+if environ.get("CISTEMATIC_TEMP"):
+    cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+    cisTemp = "/tmp"
+
+tempfile.tempdir = cisTemp
+
+
+def getReverseComplement(base):
+    revComp = {"A": "T",
+               "T": "A",
+               "G": "C",
+               "C": "G",
+               "N": "N"
+        }
+
+    return revComp[base]
+
+
+def countDuplicatesInList(listToCheck):
+    tally = defaultdict(int)
+    for item in listToCheck:
+        tally[item] += 1
+
+    return tally.items()
+
+
+def writeLog(logFile, messenger, message):
+    """ append a timestamped message from messenger to logFile, creating the file if necessary.
+    """
+    logfile = open(logFile, "a")
+    logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
+    logfile.close()
+
+
+def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
+                     fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
+                     doMerge=True, keepPeak=False, returnTop=0):
+
+    """ returns a list of merged overlapping regions; 
+    can optionally filter regions that have a scoreField fewer than minHits.
+    Can also optionally return the label of each region, as well as the
+    peak, if supplied (peakPos and peakHeight should be the last 2 fields).
+    Can return the top regions based on score if higher than minHits.
+    """
+    infile = open(regionfilename)
+    lines = infile.readlines()
+    regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
+                                       fullChrom, chromField, scoreField, pad, compact,
+                                       doMerge, keepPeak, returnTop)
+
+    infile.close()
+
+    return regions
+
+
+def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
+                     fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
+                     doMerge=True, keepPeak=False, returnTop=0):
+    """ returns a list of merged overlapping regions; 
+    can optionally filter regions that have a scoreField fewer than minHits.
+    Can also optionally return the label of each region, as well as the
+    peak, if supplied (peakPos and peakHeight should be the last 2 fields).
+    Can return the top regions based on score if higher than minHits.
+    """
+    regions = {}
+    hasPvalue = 0
+    hasShift = 0
+    if 0 < returnTop < len(regionList):
+        scores = []
+        for regionEntry in regionList:
+            if regionEntry[0] == "#":
+                if "pvalue" in regionEntry:
+                    hasPvalue = 1
+
+                if "readShift" in regionEntry:
+                    hasShift = 1
+
+                continue
+
+            fields = regionEntry.strip().split("\t")
+            hits = float(fields[scoreField].strip())
+            scores.append(hits)
+
+        scores.sort()
+        returnTop = -1 * returnTop 
+        minScore = scores[returnTop]
+        if minScore > minHits:
+            minHits = minScore
+
+    mergeCount = 0
+    chromField = int(chromField)
+    count = 0
+    #TODO: The current algorithm processes the input line by line and compares each new
+    #      region against the regions stored so far, but it stops at the first merge.
+    #      That is fine when the input is sorted by start position, but if three regions
+    #      A, B, C appear in the file in the order A, C, B, then C cannot merge with A
+    #      (B is needed to bridge them); when B is later processed it merges with A and
+    #      the loop exits, so the merge with C is missed.
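+    #      For example, with maxDist=1000 and regions A = (1000, 2000), B = (1900, 4100),
+    #      C = (4000, 5000) arriving in the order A, C, B: A and C are too far apart to
+    #      merge, so both are stored; B then merges with A and the loop breaks, leaving
+    #      the merged (1000, 4100) region and C separate even though they overlap.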
+    for regionEntry in regionList:
+        if regionEntry[0] == "#":
+            if "pvalue" in regionEntry:
+                hasPvalue = 1
+
+            if "readShift" in regionEntry:
+                hasShift = 1
+
+            continue
+
+        fields = regionEntry.strip().split("\t")
+        if minHits >= 0:
+            try:
+                hits = float(fields[scoreField].strip())
+            except (IndexError, ValueError):
+                continue
+
+            if hits < minHits:
+                continue
+
+        if compact:
+            (chrom, pos) = fields[chromField].split(":")
+            (front, back) = pos.split("-")
+            start = int(front)
+            stop = int(back)
+        elif chromField > 1:
+            label = string.join(fields[:chromField],"\t")
+            chrom = fields[chromField]
+            start = int(fields[chromField + 1]) - pad
+            stop = int(fields[chromField + 2]) + pad
+        else:
+            label = fields[0]
+            chrom = fields[1]
+            start = int(fields[2]) - pad
+            stop = int(fields[3]) + pad
+
+        if not fullChrom:
+            chrom = chrom[3:]
+
+        length = abs(stop - start)
+        if keepPeak:
+            peakPos = int(fields[-2 - hasPvalue - hasShift])
+            peakHeight = float(fields[-1 - hasPvalue - hasShift])
+
+        if chrom not in regions:
+            regions[chrom] = []
+
+        merged = False
+
+        if doMerge and len(regions[chrom]) > 0:
+            for index in range(len(regions[chrom])):
+                if keepLabel and keepPeak:
+                    (rlabel, rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
+                elif keepLabel:
+                    (rlabel, rstart, rstop, rlen) = regions[chrom][index]
+                elif keepPeak:
+                    (rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
+                else:
+                    (rstart, rstop, rlen) = regions[chrom][index]
+
+                if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
+                    if start < rstart:
+                        rstart = start
+
+                    if rstop < stop:
+                        rstop = stop
+
+                    rlen = abs(rstop - rstart)
+                    if keepPeak:
+                        if peakHeight > rpeakHeight:
+                            rpeakHeight = peakHeight
+                            rpeakPos = peakPos
+
+                    if keepLabel and keepPeak:
+                        regions[chrom][index] = (label, rstart, rstop, rlen, rpeakPos, rpeakHeight)
+                    elif keepLabel:
+                        regions[chrom][index] = (label, rstart, rstop, rlen)
+                    elif keepPeak:
+                        regions[chrom][index] = (rstart, rstop, rlen, rpeakPos, rpeakHeight)
+                    else:
+                        regions[chrom][index] = (rstart, rstop, rlen)
+
+                    mergeCount += 1
+                    merged = True
+                    break
+
+        if not merged:
+            if keepLabel and keepPeak:
+                regions[chrom].append((label, start, stop, length, peakPos, peakHeight))
+            elif keepLabel:
+                regions[chrom].append((label, start, stop, length))
+            elif keepPeak:
+                regions[chrom].append((start, stop, length, peakPos, peakHeight))
+            else:
+                regions[chrom].append((start, stop, length))
+
+            count += 1
+
+        if verbose and (count % 100000 == 0):
+            print count
+
+    regionCount = 0
+    for chrom in regions:
+        regionCount += len(regions[chrom])
+        if keepLabel:
+            regions[chrom].sort(cmp=lambda x,y:cmp(x[1], y[1]))
+        else:
+            regions[chrom].sort()
+
+    if verbose:
+        print "merged %d times" % mergeCount
+        print "returning %d regions" % regionCount
+
+    return regions
+
+
+def regionsOverlap(start, stop, rstart, rstop):
+    if start > stop:
+        (start, stop) = (stop, start)
+
+    if rstart > rstop:
+        (rstart, rstop) = (rstop, rstart)
+
+    return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
+
+
+def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
+    if start > stop:
+        (start, stop) = (stop, start)
+
+    if rstart > rstop:
+        (rstart, rstop) = (rstop, rstart)
+
+    return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
+
+
+def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
+             shift=0, returnShift=False, maxshift=75):
+    """ find the peak in a list of reads (hitlist) in a region
+    of a given length and absolute start point. returns a
+    list of peaks, the number of hits, a triangular-smoothed
+    version of hitlist, and the number of reads that are
+    forward (plus) sense.
+    If doWeight is True, weight the reads accordingly.
+    If leftPlus is True, return the number of plus reads left of
+    the peak, taken to be the first TopPos position.
+    """
+
+    seqArray = array("f", [0.] * length)
+    smoothArray = array("f", [0.] * length)
+    numHits = 0.
+    numPlus = 0.
+    regionArray = []
+    if shift == "auto":
+        shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+
+    # once we have the best shift, compute seqArray
+    for read in hitList:
+        currentpos = read[0] - start
+        if read[1] == "+":
+            currentpos += shift
+        else:
+            currentpos -= shift
+
+        if (currentpos <  1 - readlen) or (currentpos >= length):
+            continue
+
+        hitIndex = 0
+        if doWeight:
+            weight = read[2]
+        else:
+            weight = 1.0
+
+        numHits += weight
+        if leftPlus:
+            regionArray.append(read)
+
+        while currentpos < 0:
+            hitIndex += 1
+            currentpos += 1
+
+        while hitIndex < readlen and  currentpos < length:
+            seqArray[currentpos] += weight
+            hitIndex += 1
+            currentpos += 1
+
+        if read[1] == "+":
+            numPlus += weight
+
+    # implementing a triangular smooth with kernel (1, 2, 3, 2, 1) / 9
+    for pos in range(2,length -2):
+        smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
+
+    topNucleotide = 0
+    topPos = []
+    for currentpos in xrange(length):
+        if topNucleotide < smoothArray[currentpos]:
+            topNucleotide = smoothArray[currentpos]
+            topPos = [currentpos]
+        elif topNucleotide  == smoothArray[currentpos]:
+            topPos.append(currentpos)
+
+    if leftPlus:
+        numLeftPlus = 0
+        maxPos = topPos[0]
+        for read in regionArray:
+            if doWeight:
+                weight = read[2]
+            else:
+                weight = 1.0
+
+            currentPos = read[0] - start
+            if currentPos <= maxPos and read[1] == "+":
+                numLeftPlus += weight
+
+        if returnShift:
+            return (topPos, numHits, smoothArray, numPlus, numLeftPlus, shift)
+        else:
+            return (topPos, numHits, smoothArray, numPlus, numLeftPlus)
+    else:
+        if returnShift:
+            return (topPos, numHits, smoothArray, numPlus, shift)
+        else:
+            return (topPos, numHits, smoothArray, numPlus)
+
+
+def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+    bestShift = 0
+    lowestScore = 20000000000
+    for testShift in xrange(maxShift + 1):
+        shiftArray = array("f", [0.] * length)
+        for read in hitList:
+            currentpos = read[0] - start
+            if read[1] == "+":
+                currentpos += testShift
+            else:
+                currentpos -= testShift
+
+            if (currentpos < 1) or (currentpos >= length):
+                continue
+
+            if doWeight:
+                weight = read[2]
+            else:
+                weight = 1.0
+
+            if read[1] == "+":
+                shiftArray[currentpos] += weight
+            else:
+                shiftArray[currentpos] -= weight
+
+        currentScore = 0
+        for score in shiftArray:
+            currentScore += abs(score)
+
+        if currentScore < lowestScore:
+            bestShift = testShift
+            lowestScore = currentScore
+
+    return bestShift
+
+
+def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
+                           restrictList=[], regionComplement=False, maxStop=250000000):
+    """ return a dictionary of cistematic gene features. Requires
+    cistematic, obviously. Can filter out pseudogenes. Will use
+    additional regions dict to supplement gene models, if available.
+    Can restrict output to a list of GIDs.
+    If regionComplement is set to true, returns the regions *outside* of the
+    calculated boundaries, which is useful for retrieving intronic and
+    intergenic regions. maxStop is simply used to define the uppermost
+    boundary of the complement region.
+    """ 
+    featuresDict = genomeObject.getallGeneFeatures()
+    restrictGID = False
+    if len(restrictList) > 0:
+        restrictGID = True
+
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for (label, start, stop, length) in additionalRegionsDict[chrom]:
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, start, stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    featuresByChromDict = {}
+    for gid in featuresDict:
+        if restrictGID and gid not in restrictList:
+            continue
+
+        featureList = featuresDict[gid]
+        newFeatureList = []
+        isPseudo = False
+        for (ftype, chrom, start, stop, sense) in featureList:
+            if ftype == "PSEUDO":
+                isPseudo = True
+
+            if (start, stop, ftype) not in newFeatureList:
+                notContained = True
+                containedList = []
+                for (fstart, fstop, ftype2) in newFeatureList:
+                    if start >= fstart and stop <= fstop:
+                        notContained = False
+
+                    if start < fstart and stop > fstop:
+                        containedList.append((fstart, fstop))
+
+                if len(containedList) > 0:
+                    newFList = []
+                    notContained = True
+                    for (fstart, fstop, ftype2) in newFeatureList:
+                        if (fstart, fstop) not in containedList:
+                            newFList.append((fstart, fstop, ftype2))
+                            if start >= fstart and stop <= fstop:
+                                notContained = False
+
+                    newFeatureList = newFList
+                if notContained:
+                    newFeatureList.append((start, stop, ftype))
+
+        if ignorePseudo and isPseudo:
+            continue
+
+        if chrom not in featuresByChromDict:
+            featuresByChromDict[chrom] = []
+
+        for (start, stop, ftype) in newFeatureList:
+            featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
+
+    for chrom in featuresByChromDict:
+        featuresByChromDict[chrom].sort()
+
+    if regionComplement:
+        complementByChromDict = {}
+        complementIndex = 0
+        for chrom in featuresByChromDict:
+            complementByChromDict[chrom] = []
+            listLength = len(featuresByChromDict[chrom])
+            if listLength > 0:
+                currentStart = 0
+                for index in range(listLength):
+                    currentStop = featuresByChromDict[chrom][index][0]
+                    complementIndex += 1
+                    if currentStart < currentStop:
+                        complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
+
+                    currentStart = featuresByChromDict[chrom][index][1]
+
+                currentStop = maxStop
+                complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
+
+        return (featuresByChromDict, complementByChromDict)
+    else:
+        return featuresByChromDict
+
+
+def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+                        additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
+                        lengthCDS=0, keepSense=False, adjustToNeighbor=True):
+    """ return a dictionary of gene loci. Can be used to retrieve additional
+    sequence upstream or downstream of gene, up to the next gene. Requires
+    cistematic, obviously.
+    Can filter out pseudogenes and use additional regions outside of existing
+    gene models. Use upstreamSpanTSS to overlap half of the upstream region
+    over the TSS.
+    If lengthCDS is a positive value X, return only the first X bp of the CDS;
+    if lengthCDS is negative, return only the last abs(lengthCDS) bp of the CDS.
+    """ 
+    locusByChromDict = {}
+    if upstream == 0 and downstream == 0 and not useCDS:
+        print "getLocusByChromDict: asked for no sequence - returning empty dict"
+        return locusByChromDict
+    elif upstream > 0 and downstream > 0 and not useCDS:
+        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+        return locusByChromDict
+    elif lengthCDS != 0 and not useCDS:
+        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+        return locusByChromDict
+    elif upstreamSpanTSS and lengthCDS != 0:
+        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+        return locusByChromDict
+    elif lengthCDS > 0 and downstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+        return locusByChromDict
+    elif lengthCDS < 0 and upstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+        return locusByChromDict
+
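+    # Illustrative call (hypothetical values): upstream=5000, downstream=0, useCDS=True
+    # extends each locus 5 kb upstream of its 5' end; with adjustToNeighbor=True the
+    # extension is trimmed to half the distance to the neighboring gene whenever that
+    # gene lies closer than 10 kb. upstreamSpanTSS instead splits the upstream window
+    # across the TSS, half on each side.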
+    genome = genomeObject.genome
+    featuresDict = genomeObject.getallGeneFeatures()
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for (label, start, stop, length) in additionalRegionsDict[chrom]:
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, start, stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    for gid in featuresDict:
+        featureList = featuresDict[gid]
+        newFeatureList = []
+        isPseudo = False
+        for (ftype, chrom, start, stop, sense) in featureList:
+            if ftype == "PSEUDO":
+                isPseudo = True
+
+            newFeatureList.append((start, stop))
+
+        if ignorePseudo and isPseudo:
+            continue
+
+        newFeatureList.sort()
+
+        sense = featureList[0][-1]
+        gstart = newFeatureList[0][0]
+        gstop = newFeatureList[-1][1]
+        glen = abs(gstart - gstop)
+        if sense == "F":
+            if not useCDS and upstream > 0:
+                if upstreamSpanTSS:
+                    if gstop > (gstart + upstream / 2):
+                        gstop = gstart + upstream / 2
+                else:
+                    gstop = gstart
+            elif not useCDS and downstream > 0:
+                gstart = gstop
+
+            if upstream > 0:
+                if upstreamSpanTSS:
+                    distance = upstream / 2
+                else:
+                    distance = upstream
+
+                if adjustToNeighbor:
+                    nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+                    if nextGene < distance * 2:
+                        distance = nextGene / 2
+
+                if distance < 1:
+                    distance = 1
+
+                gstart -= distance
+
+            if downstream > 0:
+                distance = downstream
+                if adjustToNeighbor:
+                    nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+                    if nextGene < downstream * 2:
+                        distance = nextGene / 2
+
+                if distance < 1:
+                    distance = 1
+
+                gstop += distance
+
+            if lengthCDS > 0:
+                if lengthCDS < glen:
+                    gstop = newFeatureList[0][0] + lengthCDS
+
+            if lengthCDS < 0:
+                if abs(lengthCDS) < glen:
+                    gstart = newFeatureList[-1][1] + lengthCDS
+        else:
+            if not useCDS and upstream > 0:
+                if upstreamSpanTSS:
+                    if gstart < (gstop - upstream / 2):
+                        gstart = gstop - upstream / 2
+                else:
+                    gstart = gstop
+            elif not useCDS and downstream > 0:
+                gstop = gstart
+
+            if upstream > 0:
+                if upstreamSpanTSS:
+                    distance = upstream / 2
+                else:
+                    distance = upstream
+
+                if adjustToNeighbor:
+                    nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+                    if nextGene < distance * 2:
+                        distance = nextGene / 2
+
+                if distance < 1:
+                    distance = 1
+
+                gstop += distance
+
+            if downstream > 0:
+                distance = downstream
+                if adjustToNeighbor:
+                    nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+                    if nextGene < downstream * 2:
+                        distance = nextGene / 2
+
+                if distance < 1:
+                    distance = 1
+
+                gstart -= distance
+
+            if lengthCDS > 0:
+                if lengthCDS < glen:
+                    gstart = newFeatureList[-1][-1] - lengthCDS
+
+            if lengthCDS < 0:
+                if abs(lengthCDS) < glen:
+                    gstop = newFeatureList[0][0] - lengthCDS
+
+        glen = abs(gstop - gstart)
+        if chrom not in locusByChromDict:
+            locusByChromDict[chrom] = []
+
+        if keepSense:
+            locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
+        else:
+            locusByChromDict[chrom].append((gstart, gstop, gid, glen))
+
+    for chrom in locusByChromDict:
+        locusByChromDict[chrom].sort()
+
+    return locusByChromDict
+
+
+def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
+                      normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
+                      binLength=-1):
+    """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
+        a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
+        or a different weight / tag.
+    """
+    index = 0
+    regionsBins = {}
+    regionsLen = {}
+
+    if defaultRegionFormat:
+        regionIDField = 0
+        startField = 1
+        stopField = 2
+        lengthField = 3
+    else:
+        startField = 0
+        stopField = 1
+        regionIDField = 2
+        lengthField = 3
+
+    senseField = 4
+
+    print "entering computeRegionBins"
+    if len(regionList) > 0:
+        for readID in regionList:
+            regionsBins[readID] = [0.] * bins
+    else:
+        for chrom in regionsByChromDict:
+            for regionTuple in regionsByChromDict[chrom]:
+                regionID = regionTuple[regionIDField]
+                regionsBins[regionID] = [0.] * bins
+
+    for chrom in hitDict:
+        if chrom not in regionsByChromDict:
+            continue
+
+        for regionTuple in regionsByChromDict[chrom]:
+            regionID = regionTuple[regionIDField]
+            regionsLen[regionID] = regionTuple[lengthField]
+
+        print "%s\n" % chrom
+        startRegion = 0
+        for (tagStart, sense, weight) in hitDict[chrom]:
+            index += 1
+            if index % 100000 == 0:
+                print "read %d " % index,
+
+            stopPoint = tagStart + readlen
+            if startRegion < 0:
+                startRegion = 0
+
+            for regionTuple in regionsByChromDict[chrom][startRegion:]:
+                start = regionTuple[startField]
+                stop = regionTuple[stopField]
+                regionID = regionTuple[regionIDField]
+                rlen = regionTuple[lengthField]
+                try:
+                    rsense = regionTuple[senseField]
+                except IndexError:
+                    rsense = "F"
+
+                if tagStart > stop:
+                    startRegion += 1
+                    continue
+
+                if start > stopPoint:
+                    startRegion -= 10
+                    break
+
+                if start <= tagStart <= stop:
+                    if binLength < 1:
+                        regionBinLength = rlen / bins
+                    else:
+                        regionBinLength = binLength
+
+                    startdist = tagStart - start
+                    if rsense == "F":
+                        # we are relying on python's integer division quirk
+                        binID = startdist / regionBinLength
+                        if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
+                            binID = 0
+                        elif fixedFirstBin > 0:
+                            binID = 1
+
+                        if binID >= bins:
+                            binID = bins - 1
+
+                        try:
+                            regionsBins[regionID][binID] += normalizedTag * weight
+                        except KeyError:
+                            print "%s %s" % (regionID, str(binID))
+                    else:
+                        rdist = rlen - startdist
+                        binID = rdist / regionBinLength
+                        if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
+                            binID = 0
+                        elif fixedFirstBin > 0:
+                            binID = 1
+
+                        if binID >= bins:
+                            binID = bins - 1
+
+                        try:
+                            regionsBins[regionID][binID] += normalizedTag * weight
+                        except KeyError:
+                            print "%s %s" % (regionID, str(binID))
+
+                    stopPoint = stop
+
+    return (regionsBins, regionsLen)
+
+
+# TODO: The readDataset class is going to be replaced by Erange.ReadDataset but this will
+# require going through all the code to make the changes needed.  Major project for another
+# day, but it really needs to be done
+class readDataset:
+    """ Class for storing reads from experiments. Assumes that custom scripts
+    will translate incoming data into a format that can be inserted into the
+    class using the insert* methods. Default class subtype ('DNA') includes
+    tables for unique and multireads, whereas 'RNA' subtype also includes a
+    splices table.
+    """
+
+    def __init__(self, datafile, initialize=False, datasetType='', verbose=False, 
+                 cache=False, reportCount=True):
+        """ creates an rds datafile if initialize is set to true, otherwise
+        will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
+        """
+        self.dbcon = ""
+        self.memcon = ""
+        self.dataType = ""
+        self.rdsVersion = "1.1"
+        self.memBacked = False
+        self.memChrom = ""
+        self.memCursor = ""
+        self.cachedDBFile = ""
+
+        if cache:
+            if verbose:
+                print "caching ...."
+
+            self.cacheDB(datafile)
+            dbfile = self.cachedDBFile
+        else:
+            dbfile = datafile
+
+        self.dbcon = sqlite.connect(dbfile)
+        self.dbcon.row_factory = sqlite.Row
+        self.dbcon.execute("PRAGMA temp_store = MEMORY")
+        if initialize:
+            if datasetType == "":
+                self.dataType = "DNA"
+            else:
+                self.dataType = datasetType
+
+            self.initializeTables(self.dbcon)
+        else:
+            metadata = self.getMetadata("dataType")
+            self.dataType = metadata["dataType"]
+
+        try:
+            metadata = self.getMetadata("rdsVersion")
+            self.rdsVersion = metadata["rdsVersion"]
+        except:
+            try:
+                self.insertMetadata([("rdsVersion", currentRDSversion)])
+            except:
+                print "could not add rdsVersion - read-only ?"
+                self.rdsVersion = "pre-1.0"
+
+        if verbose:
+            if initialize:
+                print "INITIALIZED dataset %s" % datafile
+            else:
+                print "dataset %s" % datafile
+
+            metadata = self.getMetadata()
+            print "metadata:"
+            pnameList = metadata.keys()
+            pnameList.sort()
+            for pname in pnameList:
+                print "\t" + pname + "\t" + metadata[pname]
+
+            if reportCount:
+                ucount = self.getUniqsCount()
+                mcount = self.getMultiCount()
+                if self.dataType == "DNA" and not initialize:
+                    try:
+                        print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
+                    except:
+                        print "\n%s unique reads and %s multireads" % (ucount, mcount)
+                elif self.dataType == 'RNA' and not initialize:
+                    scount = self.getSplicesCount()
+                    try:
+                        print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
+                    except:
+                        print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
+
+            print "default cache size is %d pages" % self.getDefaultCacheSize()
+            if self.hasIndex():
+                print "found index"
+            else:
+                print "not indexed"
+
+
+    def __len__(self):
+        """ return the number of usable reads in the dataset.
+        """
+        try:
+            total = self.getUniqsCount()
+        except:
+            total = 0
+
+        try:
+            total += self.getMultiCount()
+        except:
+            pass
+
+        if self.dataType == "RNA":
+            try:
+                total += self.getSplicesCount()
+            except:
+                pass
+
+        try:
+            total = int(total)
+        except:
+            total = 0
+
+        return total
+
+
+    def __del__(self):
+        """ cleanup copy in local cache, if present.
+        """
+        if self.cachedDBFile != "":
+            self.uncacheDB()
+
+
+    def cacheDB(self, filename):
+        """ copy geneinfoDB to a local cache.
+        """
+        self.cachedDBFile = tempfile.mktemp() + ".db"
+        shutil.copyfile(filename, self.cachedDBFile)
+
+
+    def saveCacheDB(self, filename):
+        """ copy geneinfoDB to a local cache.
+        """
+        shutil.copyfile(self.cachedDBFile, filename)
+
+
+    def uncacheDB(self):
+        """ delete the cached copy of the RDS file, if any.
+        """
+        if self.cachedDBFile != "":
+            try:
+                os.remove(self.cachedDBFile)
+            except OSError:
+                print "could not delete %s" % self.cachedDBFile
+
+            self.cachedDBFile = ""
+
+
+    def attachDB(self, filename, asname):
+        """ attach another database file to the readDataset.
+        """
+        stmt = "attach '%s' as %s" % (filename, asname)
+        self.execute(stmt)
+
+
+    def detachDB(self, asname):
+        """ detach a database file to the readDataset.
+        """
+        stmt = "detach %s" % (asname)
+        self.execute(stmt)
+
+
+    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+        """ import into current RDS the table (with columns destcolumns,
+            with default all columns) from the database file asname,
+            using the column specification of ascolumns (default all).
+        """
+        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+        if flagged != "":
+            stmt += " where flag = '%s' " % flagged
+
+        self.execute(stmt, forceCommit=True)
+
+
+    def getTables(self, asname=""):
+        """ get a list of table names in a particular database file.
+        """
+        resultList = []
+
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        if asname != "":
+            asname += "."
+
+        stmt = "select name from %ssqlite_master where type='table'" % asname
+        sql.execute(stmt)
+        results = sql.fetchall()
+
+        for row in results:
+            resultList.append(row["name"])
+
+        return resultList
+
+
+    def hasIndex(self):
+        """ check whether the RDS file has at least one index.
+        """
+        stmt = "select count(*) from sqlite_master where type='index'"
+        count = int(self.execute(stmt, returnResults=True)[0][0])
+        if count > 0:
+            return True
+
+        return False
+
+
+    def initializeTables(self, acon, cache=100000):
+        """ creates table schema in database connection acon, which is
+        typically a database file or an in-memory database.
+        """
+        acon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+        acon.execute("create table metadata (name varchar, value varchar)")
+        acon.execute("insert into metadata values('dataType','%s')" % self.dataType)
+        acon.execute("create table uniqs (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
+        acon.execute("create table multi (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
+        if self.dataType == "RNA":
+            acon.execute("create table splices (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, startL int, stopL int, startR int, stopR int, sense varchar, weight real, flag varchar, mismatch varchar)")
+
+        acon.commit()
+
+
+    def getFileCursor(self):
+        """ returns a cursor to file database for low-level (SQL)
+        access to the data.
+        """
+        return self.dbcon.cursor()
+
+
+    def getMemCursor(self):
+        """ returns a cursor to memory database for low-level (SQL)
+        access to the data.
+        """
+        return self.memcon.cursor()
+
+
+    def getMetadata(self, valueName=""):
+        """ returns a dictionary of metadata.
+        """
+        whereClause = ""
+        resultsDict = {}
+
+        if valueName != "":
+            whereClause = " where name = '%s' " % valueName
+
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        sql.execute("select name, value from metadata" + whereClause)
+        results = sql.fetchall()
+
+        for row in results:
+            pname = row["name"]
+            pvalue = row["value"]
+            if pname not in resultsDict:
+                resultsDict[pname] = pvalue
+            else:
+                trying = True
+                index = 2
+                while trying:
+                    newName = pname + ":" + str(index)
+                    if newName not in resultsDict:
+                        resultsDict[newName] = pvalue
+                        trying = False
+
+                    index += 1
+
+        return resultsDict
+
+
+    def getReadSize(self):
+        """ returns readsize if defined in metadata.
+        """
+        metadata = self.getMetadata()
+        if "readsize" not in metadata:
+            print "no readsize parameter defined - returning 0"
+            return 0
+        else:
+            mysize = metadata["readsize"]
+            if "import" in mysize:
+                mysize = mysize.split()[0]
+
+            return int(mysize)
+
+
+    def getDefaultCacheSize(self):
+        """ returns the default cache size.
+        """
+        return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
+
+
+    def getChromosomes(self, table="uniqs", fullChrom=True):
+        """ returns a list of distinct chromosomes in table.
+        """
+        statement = "select distinct chrom from %s" % table
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        sql.execute(statement)
+        results = []
+        for row in sql:
+            if fullChrom:
+                if row["chrom"] not in results:
+                    results.append(row["chrom"])
+            else:
+                if  len(row["chrom"][3:].strip()) < 1:
+                    continue
+
+                if row["chrom"][3:] not in results:
+                    results.append(row["chrom"][3:])
+
+        results.sort()
+
+        return results
+
+
+    def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+                         doMulti=False, doSplices=False):
+        """ returns the maximum coordinate for reads on a given chromosome.
+        """
+        maxCoord = 0
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        if doUniqs:
+            try:
+                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
+                maxCoord = int(sql.fetchall()[0][0])
+            except:
+                print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+        if doSplices:
+            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
+            try:
+                spliceMax = int(sql.fetchall()[0][0])
+                if spliceMax > maxCoord:
+                    maxCoord = spliceMax
+            except:
+                pass
+
+        if doMulti:
+            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
+            try:
+                multiMax = int(sql.fetchall()[0][0])
+                if multiMax > maxCoord:
+                    maxCoord = multiMax
+            except:
+                pass
+
+        if verbose:
+            print "%s maxCoord: %d" % (chrom, maxCoord)
+
+        return maxCoord
+
+
+    def getReadsDict(self, verbose=False, bothEnds=False, noSense=False, fullChrom=False, chrom="",
+                     flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
+                     withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
+                     readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
+                     flagLike=False, strand="", entryDict=False, combine5p=False):
+        """ returns a dictionary of reads in a variety of formats
+        and which can be restricted by chromosome or custom-flag.
+        Returns unique reads by default, but can return multireads
+        with doMulti set to True.
+        """
+        whereClause = []
+        resultsDict = {}
+
+        if chrom != "" and chrom != self.memChrom:
+            whereClause.append("chrom = '%s'" % chrom)
+
+        if flag != "":
+            if flagLike:
+                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+                whereClause.append(flagLikeClause)
+            else:
+                whereClause.append("flag = '%s'" % flag)
+
+        if start > -1:
+            whereClause.append("start > %d" % start)
+
+        if stop > -1:
+            whereClause.append("stop < %d" % stop)
+
+        if len(readLike) > 0:
+            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
+            whereClause.append(readIDClause)
+
+        if hasMismatch:
+            whereClause.append("mismatch != ''")
+
+        if strand in ["+", "-"]:
+            whereClause.append("sense = '%s'" % strand)
+
+        if len(whereClause) > 0:
+            whereStatement = string.join(whereClause, " and ")
+            whereQuery = "where %s" % whereStatement
+        else:
+            whereQuery = ""
+
+        groupBy = []
+        if findallOptimize:
+            selectClause = ["select start, sense, sum(weight)"]
+            groupBy = ["GROUP BY start, sense"]
+        else:
+            selectClause = ["select ID, chrom, start, readID"]
+            if bothEnds:
+                selectClause.append("stop")
+
+            if not noSense:
+                selectClause.append("sense")
+
+            if withWeight:
+                selectClause.append("weight")
+
+            if withFlag:
+                selectClause.append("flag")
+
+            if withMismatch:
+                selectClause.append("mismatch")
+
+        if limit > 0 and not combine5p:
+            groupBy.append("LIMIT %d" % limit)
+
+        selectQuery = string.join(selectClause, ",")
+        groupQuery = string.join(groupBy)
+        if doUniqs:
+            stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
+            if doMulti:
+                stmt.append("UNION ALL")
+                stmt.append(selectQuery)
+                stmt.append("from multi")
+                stmt.append(whereQuery)
+                stmt.append(groupQuery)
+        else:
+            stmt = [selectQuery, "from multi", whereQuery]
+
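+        # combine5p: wrap the select in a subquery grouped by (chrom, start) so
+        # that reads sharing the same 5' start position are collapsed (and their
+        # weights summed) before being returned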
+        if combine5p:
+            if findallOptimize:
+                selectQuery = "select start, sense, weight, chrom"
+
+            if doUniqs:
+                subSelect = [selectQuery, "from uniqs", whereQuery]
+                if doMulti:
+                    subSelect.append("union all")
+                    subSelect.append(selectQuery)
+                    subSelect.append("from multi")
+                    subSelect.append(whereQuery)
+            else:
+                subSelect = [selectQuery, "from multi", whereQuery]
+
+            sqlStmt = string.join(subSelect)
+            if findallOptimize:
+                selectQuery = "select start, sense, sum(weight)"
+
+            stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
+                    selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
+
+        if findallOptimize:
+            if self.memBacked:
+                self.memcon.row_factory = None
+                sql = self.memcon.cursor()
+            else:
+                self.dbcon.row_factory = None
+                sql = self.dbcon.cursor()
+
+            stmt.append("order by start")
+        elif readIDDict:
+            if self.memBacked:
+                sql = self.memcon.cursor()
+            else:
+                sql = self.dbcon.cursor()
+
+            stmt.append("order by readID, start")
+        else:
+            if self.memBacked:
+                sql = self.memcon.cursor()
+            else:
+                sql = self.dbcon.cursor()
+
+            stmt.append("order by chrom, start")
+
+        sqlQuery = string.join(stmt)
+        sql.execute(sqlQuery)
+
+        if findallOptimize:
+            resultsDict[chrom] = [[int(row[0]), row[1], float(row[2])] for row in sql]
+            if self.memBacked:
+                self.memcon.row_factory = sqlite.Row
+            else:
+                self.dbcon.row_factory = sqlite.Row
+        else:
+            currentChrom = ""
+            currentReadID = ""
+            pairID = 0
+            for row in sql:
+                readID = row["readID"]
+                if fullChrom:
+                    chrom = row["chrom"]
+                else:
+                    chrom = row["chrom"][3:]
+
+                if not readIDDict and chrom != currentChrom:
+                    resultsDict[chrom] = []
+                    currentChrom = chrom
+                    dictKey = chrom
+                elif readIDDict:
+                    theReadID = readID
+                    if "::" in readID:
+                        (theReadID, multiplicity) = readID.split("::")
+
+                    if "/" in theReadID and withPairID:
+                        (theReadID, pairID) = readID.split("/")
+
+                    if theReadID != currentReadID:
+                        resultsDict[theReadID] = []
+                        currentReadID = theReadID
+                        dictKey = theReadID
+
+                if entryDict:
+                    newrow = {"start": int(row["start"])}
+                    if bothEnds:
+                        newrow["stop"] = int(row["stop"])
+
+                    if not noSense:
+                        newrow["sense"] = row["sense"]
+
+                    if withWeight:
+                        newrow["weight"] = float(row["weight"])
+
+                    if withFlag:
+                        newrow["flag"] = row["flag"]
+
+                    if withMismatch:
+                        newrow["mismatch"] = row["mismatch"]
+
+                    if withID:
+                        newrow["readID"] = readID
+
+                    if withChrom:
+                        newrow["chrom"] = chrom
+
+                    if withPairID:
+                        newrow["pairID"] = pairID
+                else:
+                    newrow = [int(row["start"])]
+                    if bothEnds:
+                        newrow.append(int(row["stop"]))
+
+                    if not noSense:
+                        newrow.append(row["sense"])
+
+                    if withWeight:
+                        newrow.append(float(row["weight"]))
+
+                    if withFlag:
+                        newrow.append(row["flag"])
+
+                    if withMismatch:
+                        newrow.append(row["mismatch"])
+
+                    if withID:
+                        newrow.append(readID)
+
+                    if withChrom:
+                        newrow.append(chrom)
+
+                    if withPairID:
+                        newrow.append(pairID)
+
+                resultsDict[dictKey].append(newrow)
+
+        return resultsDict
+
+
+    def getSplicesDict(self, verbose=False, noSense=False, fullChrom=False, chrom="",
+                       flag="", withWeight=False, withFlag=False, withMismatch=False,
+                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
+                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+                       stop=-1, strand="", entryDict=False):
+        """ returns a dictionary of spliced reads in a variety of
+        formats and which can be restricted by chromosome or custom-flag.
+        Returns unique spliced reads for now.
+        """
+        whereClause = []
+        resultsDict = {}
+
+        if chrom != "" and chrom != self.memChrom:
+            whereClause = ["chrom = '%s'" % chrom]
+
+        if flag != "":
+            if flagLike:
+                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
+                whereClause.append(flagLikeClause)
+            else:
+                whereClause.append("flag = '%s'" % flag)
+
+        if hasMismatch:
+            whereClause.append("mismatch != ''")
+
+        if strand != "":
+            whereClause.append("sense = '%s'" % strand)
+
+        if start > -1:
+            whereClause.append("startL > %d" % start)
+
+        if stop > -1:
+            whereClause.append("stopR < %d" % stop)
+
+        if len(whereClause) > 0:
+            whereStatement = string.join(whereClause, " and ")
+            whereQuery = "where %s" % whereStatement
+        else:
+            whereQuery = ""
+
+        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+        if not noSense:
+            selectClause.append("sense")
+
+        if withWeight:
+            selectClause.append("weight")
+
+        if withFlag:
+            selectClause.append("flag")
+
+        if withMismatch:
+            selectClause.append("mismatch")
+
+        selectQuery = string.join(selectClause, " ,")
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        if chrom == "" and not readIDDict:
+            stmt = "select distinct chrom from splices %s" % whereQuery
+            sql.execute(stmt)
+            for row in sql:
+                if fullChrom:
+                    chrom = row["chrom"]
+                else:
+                    chrom = row["chrom"][3:]
+
+                resultsDict[chrom] = []
+        elif chrom != "" and not readIDDict:
+            resultsDict[chrom] = []
+
+        stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
+        sql.execute(stmt)
+        currentReadID = ""
+        for row in sql:
+            pairID = 0
+            readID = row["readID"]
+            if fullChrom:
+                chrom = row["chrom"]
+            else:
+                chrom = row["chrom"][3:]
+
+            if readIDDict:
+                if "/" in readID:
+                    (theReadID, pairID) = readID.split("/")
+                else:
+                    theReadID = readID
+
+                if theReadID != currentReadID:
+                    resultsDict[theReadID] = []
+                    currentReadID = theReadID
+                    dictKey = theReadID
+            else:
+                dictKey = chrom
+
+            if entryDict:
+                newrow = {"startL": int(row["startL"])}
+                newrow["stopL"] = int(row["stopL"])
+                newrow["startR"] = int(row["startR"])
+                newrow["stopR"] = int(row["stopR"])
+                if not noSense:
+                    newrow["sense"] = row["sense"]
+
+                if withWeight:
+                    newrow["weight"] = float(row["weight"])
+
+                if withFlag:
+                    newrow["flag"] = row["flag"]
+
+                if withMismatch:
+                    newrow["mismatch"] = row["mismatch"]
+
+                if withID:
+                    newrow["readID"] = readID
+
+                if withChrom:
+                    newrow["chrom"] = chrom
+
+                if withPairID:
+                    newrow["pairID"] = pairID
+
+                if splitRead:
+                    leftDict = dict(newrow)
+                    del leftDict["startR"]
+                    del leftDict["stopR"]
+                    rightDict = dict(newrow)
+                    del rightDict["startL"]
+                    del rightDict["stopL"]
+                    resultsDict[dictKey].append(leftDict)
+                    resultsDict[dictKey].append(rightDict)
+                else:
+                    resultsDict[dictKey].append(newrow)
+            else:
+                newrow = [int(row["startL"])]
+                newrow.append(int(row["stopL"]))
+                newrow.append(int(row["startR"]))
+                newrow.append(int(row["stopR"]))
+                if not noSense:
+                    newrow.append(row["sense"])
+
+                if withWeight:
+                    newrow.append(float(row["weight"]))
+
+                if withFlag:
+                    newrow.append(row["flag"])
+
+                if withMismatch:
+                    newrow.append(row["mismatch"])
+
+                if withID:
+                    newrow.append(readID)
+
+                if withChrom:
+                    newrow.append(chrom)
+
+                if withPairID:
+                    newrow.append(pairID)
+
+                if splitRead:
+                    resultsDict[dictKey].append(newrow[:2] + newrow[4:])
+                    resultsDict[dictKey].append(newrow[2:])
+                else:
+                    resultsDict[dictKey].append(newrow)
+
+        return resultsDict
+
+
+    def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
+                  splices=False, reportCombined=True, sense="both"):
+        """ return read counts for a given region.
+        """
+        ucount = 0
+        mcount = 0
+        scount = 0
+        restrict = ""
+        if sense in ["+", "-"]:
+            restrict = " sense ='%s' " % sense
+
+        if uniqs:
+            try:
+                ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
+            except:
+                ucount = 0
+
+        if multi:
+            try:
+                mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
+            except:
+                mcount = 0
+
+        if splices:
+            try:
+                scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
+            except:
+                scount = 0
+
+        if reportCombined:
+            total = ucount + mcount + scount
+            return total
+        else:
+            return (ucount, mcount, scount)
+
+
+    def getTotalCounts(self, chrom="", rmin="", rmax=""):
+        return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
+
+
+    def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
+        """ returns the number of row in the uniqs table.
+        """
+        whereClause = []
+        count = 0
+
+        if chrom !=""  and chrom != self.memChrom:
+            whereClause = ["chrom='%s'" % chrom]
+
+        if rmin != "":
+            whereClause.append("%s >= %s" % (startField, str(rmin)))
+
+        if rmax != "":
+            whereClause.append("%s <= %s" % (startField, str(rmax)))
+
+        if restrict != "":
+            whereClause.append(restrict)
+
+        if len(whereClause) > 0:
+            whereStatement = string.join(whereClause, " and ")
+            whereQuery = "where %s" % whereStatement
+        else:
+            whereQuery = ""
+
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        if distinct:
+            sql.execute("select count(distinct chrom+start+sense) from %s %s" % (table, whereQuery))
+        else:
+            sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
+
+        result = sql.fetchone()
+
+        try:
+            count = int(result[0])
+        except:
+            count = 0
+
+        return count
+
+
+    def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+        """ returns the number of row in the splices table.
+        """
+        return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
+
+
+    def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+        """ returns the number of distinct readIDs in the uniqs table.
+        """
+        return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
+
+
+    def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+        """ returns the total weight of readIDs in the multi table.
+        """
+        return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
+
+
+    def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
+        """ get readID's.
+        """
+        stmt = []
+        limitPart = ""
+        if limit > 0:
+            limitPart = "LIMIT %d" % limit
+
+        if uniqs:
+            stmt.append("select readID from uniqs")
+
+        if multi:
+            stmt.append("select readID from multi")
+
+        if splices:
+            stmt.append("select readID from splices")
+
+        if len(stmt) > 0:
+            selectPart = string.join(stmt, " union ")
+        else:
+            selectPart = ""
+
+        sqlQuery = "%s group by readID %s" (selectPart, limitPart)
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        sql.execute(sqlQuery)
+        result = sql.fetchall()
+
+        if paired:
+            return [x[0].split("/")[0] for x in result]
+        else:
+            return [x[0] for x in result]
+
+
+    def getMismatches(self, mischrom = None, verbose=False, useSplices=True):
+        """ returns the uniq and spliced mismatches in a dictionary.
+        """
+        revcomp = {"A": "T",
+                   "T": "A",
+                   "G": "C",
+                   "C": "G",
+                   "N": "N"
+        }
+
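+        # mismatch strings are assumed to be comma-separated tokens of the form
+        # <fromBase><position><toBase> (e.g. "A15G"); on the minus strand the
+        # bases are reverse-complemented and the position is counted from the
+        # other end of the read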
+        readlen = self.getReadSize()
+        if mischrom:
+            hitChromList = [mischrom]
+        else:
+            hitChromList = self.getChromosomes()
+            hitChromList.sort()
+
+        snpDict = {}
+        for achrom in hitChromList:
+            if verbose:
+                print "getting mismatches from chromosome %s" % (achrom)
+
+            snpDict[achrom] = []
+            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, findallOptimize=False, hasMismatch=True)
+            if useSplices and self.dataType == "RNA":
+                spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
+                spliceIDList = spliceDict.keys()
+                for k in spliceIDList:
+                    (startpos, lefthalf, rightstart, endspos, sense, mismatches) = spliceDict[k][0]
+                    spMismatchList = mismatches.split(",")
+                    for mismatch in spMismatchList:
+                        if "N" in mismatch:
+                            continue
+
+                        change_len = len(mismatch)
+                        if sense == "+":
+                            change_from = mismatch[0]
+                            change_base = mismatch[change_len-1]
+                            change_pos = int(mismatch[1:change_len-1])
+                        elif sense == "-":
+                            change_from = revcomp[mismatch[0]]
+                            change_base = revcomp[mismatch[change_len-1]]
+                            change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+
+                        firsthalf = int(lefthalf)-int(startpos)+1
+                        secondhalf = 0
+                        if int(change_pos) <= int(firsthalf):
+                            change_at = startpos + change_pos - 1
+                        else:
+                            secondhalf = change_pos - firsthalf
+                            change_at = rightstart + secondhalf
+
+                        snpDict[achrom].append([startpos, change_at, change_base, change_from])
+
+            if achrom not in hitDict:
+                continue
+
+            for (start, sense, mismatches) in hitDict[achrom]:
+                mismatchList = mismatches.split(",")
+                for mismatch in mismatchList:
+                    if "N" in mismatch:
+                        continue
+
+                    change_len = len(mismatch)
+                    if sense == "+":
+                        change_from = mismatch[0]
+                        change_base = mismatch[change_len-1]
+                        change_pos = int(mismatch[1:change_len-1])
+                    elif sense == "-":
+                        change_from = revcomp[mismatch[0]]
+                        change_base = revcomp[mismatch[change_len-1]]
+                        change_pos = readlen - int(mismatch[1:change_len-1]) + 1
+
+                    change_at = start + change_pos - 1
+                    snpDict[achrom].append([start, change_at, change_base, change_from])
+
+        return snpDict
+
+
+    def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
+                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+                        keepStrand="both", shiftValue=0):
+        """return a profile of the chromosome as an array of per-base read coverage....
+            keepStrand = 'both', 'plusOnly', or 'minusOnly'.
+            Will also shift the positions of uniqs and multireads (but not splices) by shiftValue, strand-aware.
+        """
+        metadata = self.getMetadata()
+        readlen = int(metadata["readsize"])
+        dataType = metadata["dataType"]
+        scale = 1. / normalizationFactor
+        shift = {}
+        shift["+"] = int(shiftValue)
+        shift["-"] = -1 * int(shiftValue)
+
+        if cstop > 0:
+            lastNT = cstop - cstart + readlen + shift["+"]
+        else:
+            lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
+
+        chromModel = array("f", [0.] * lastNT)
+        hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
+        if cstart < 0:
+            cstart = 0
+
+        for (hstart, sense, weight) in hitDict[chromosome]:
+            hstart = hstart - cstart + shift[sense]
+            for currentpos in range(hstart,hstart+readlen):
+                try:
+                    if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
+                        chromModel[currentpos] += scale * weight
+                    elif sense == '-' and keepStrand != "plusOnly":
+                        chromModel[currentpos] -= scale * weight
+                except:
+                    continue
+
+        del hitDict
+        if useSplices and dataType == "RNA":
+            if cstop > 0:
+                spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
+            else:
+                spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
+   
+            if chromosome in spliceDict:
+                for (Lstart, Lstop, Rstart, Rstop, rsense, readName) in spliceDict[chromosome]:
+                    if (Rstop - cstart) < lastNT:
+                        for index in range(abs(Lstop - Lstart)):
+                            currentpos = Lstart - cstart + index
+                            # we only track unique splices
+                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+                                chromModel[currentpos] += scale
+                            elif rsense == "-" and keepStrand != "plusOnly":
+                                chromModel[currentpos] -= scale
+
+                        for index in range(abs(Rstop - Rstart)):
+                            currentpos = Rstart - cstart + index
+                            # we only track unique splices
+                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
+                                chromModel[currentpos] += scale
+                            elif rsense == "-" and keepStrand != "plusOnly":
+                                chromModel[currentpos] -= scale
+
+            del spliceDict
+
+        return chromModel
+
+
+    def insertMetadata(self, valuesList):
+        """ inserts a list of (pname, pvalue) into the metadata
+        table.
+        """
+        self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def updateMetadata(self, pname, newValue, originalValue=""):
+        """ update a metadata field given the original value and the new value.
+        """
+        stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
+        if originalValue != "":
+            stmt += " and value='%s' " % str(originalValue)
+
+        self.dbcon.execute(stmt)
+        self.dbcon.commit()
+
+
+    def insertUniqs(self, valuesList):
+        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+        into the uniqs table.
+        """
+        self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def insertMulti(self, valuesList):
+        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
+        into the multi table.
+        """
+        self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def insertSplices(self, valuesList):
+        """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
+        into the splices table.
+        """
+        self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
+        self.dbcon.commit()
+
+
+    def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
+        """ update reads on file database in a list region of regions for a chromosome to have a new flag.
+            regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
+            sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
+        """
+        restrict = ""
+        if sense != "both":
+            restrict = " and sense = ? "
+
+        if uniqs:
+            self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+        if multi:
+            self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
+
+        if self.dataType == "RNA" and splices:
+            self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
+            self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
+
+        self.dbcon.commit()
+
+
+    def setFlags(self, flag, uniqs=True, multi=True, splices=True):
+        """ set the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
+        """
+        if uniqs:
+            self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
+
+        if multi:
+            self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
+
+        if self.dataType == 'RNA' and splices:
+            self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
+
+        self.dbcon.commit()
+
+
+    def resetFlags(self, uniqs=True, multi=True, splices=True):
+        """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
+        """
+        if uniqs:
+            self.dbcon.execute("UPDATE uniqs SET flag = ''")
+
+        if multi:
+            self.dbcon.execute("UPDATE multi SET flag = ''")
+
+        if self.dataType == "RNA" and splices:
+            self.dbcon.execute("UPDATE splices SET flag = ''")
+
+        self.dbcon.commit()
+
+
+    def reweighMultireads(self, readList):
+        self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
+
+
+    def setSynchronousPragma(self, value="ON"):
+        try:
+            self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
+        except:
+            print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
+
+
+    def setDBcache(self, cache, default=False):
+        self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
+        if default:
+            self.dbcon.execute('PRAGMA DEFAULT_CACHE_SIZE = %d' % cache)
+
+
+    def execute(self, statement, returnResults=False, forceCommit=False):
+        if self.memBacked:
+            sql = self.memcon.cursor()
+        else:
+            sql = self.dbcon.cursor()
+
+        sql.execute(statement)
+        if returnResults:
+            result = sql.fetchall()
+            return result
+
+        if forceCommit:
+            if self.memBacked:
+                self.memcon.commit()
+            else:
+                self.dbcon.commit()
+
+
+    def buildIndex(self, cache=100000):
+        """ Builds the file indeces for the main tables.
+            Cache is the number of 1.5 kb pages to keep in memory.
+            100000 pages translates into 150MB of RAM, which is our default.
+        """
+        if cache > self.getDefaultCacheSize():
+            self.setDBcache(cache)
+        self.setSynchronousPragma("OFF")
+        self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+        print "built uPosIndex"
+        self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
+        print "built uChromIndex"
+        self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+        print "built mPosIndex"
+        self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
+        print "built mChromIndex"
+
+        if self.dataType == "RNA":
+            self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
+            print "built sPosIndex"
+            self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
+            print "built sPosIndex2"
+            self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
+            print "built sChromIndex"
+
+        self.dbcon.commit()
+        self.setSynchronousPragma("ON")
+
+
+    def dropIndex(self):
+        """ drops the file indices for the main tables.
+        """
+        try:
+            self.setSynchronousPragma("OFF")
+            self.dbcon.execute("DROP INDEX uPosIndex")
+            self.dbcon.execute("DROP INDEX uChromIndex")
+            self.dbcon.execute("DROP INDEX mPosIndex")
+            self.dbcon.execute("DROP INDEX mChromIndex")
+
+            if self.dataType == "RNA":
+                self.dbcon.execute("DROP INDEX sPosIndex")
+                try:
+                    self.dbcon.execute("DROP INDEX sPosIndex2")
+                except:
+                    pass
+
+                self.dbcon.execute("DROP INDEX sChromIndex")
+
+            self.dbcon.commit()
+        except:
+            print "problem dropping index"
+
+        self.setSynchronousPragma("ON")
+
+
+    def memSync(self, chrom="", index=False):
+        """ makes a copy of the dataset into memory for faster access.
+        Can be restricted to a "full" chromosome. Can also build the
+        memory indices.
+        """
+        self.memcon = ""
+        self.memcon = sqlite.connect(":memory:")
+        self.initializeTables(self.memcon)
+        cursor = self.dbcon.cursor()
+        whereclause = ""
+        if chrom != "":
+            print "memSync %s" % chrom
+            whereclause = " where chrom = '%s' " % chrom
+            self.memChrom = chrom
+        else:
+            self.memChrom = ""
+
+        self.memcon.execute("PRAGMA temp_store = MEMORY")
+        self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
+        # copy metadata to memory
+        self.memcon.execute("delete from metadata")
+        results = cursor.execute("select name, value from metadata")
+        results2 = []
+        for row in results:
+            results2.append((row["name"], row["value"]))
+
+        self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
+        # copy uniqs to memory
+        results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from uniqs" + whereclause)
+        results2 = []
+        for row in results:
+            results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+        self.memcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
+        # copy multi to memory
+        results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from multi" + whereclause)
+        results2 = []
+        for row in results:
+            results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+        self.memcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
+        # copy splices to memory
+        if self.dataType == "RNA":
+            results = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices" + whereclause)
+            results2 = []
+            for row in results:
+                results2.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+
+            self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, weight, sense, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", results2)
+        if index:
+            if chrom != "":
+                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
+                self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
+                if self.dataType == "RNA":
+                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
+                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
+            else:
+                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
+                self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
+                if self.dataType == "RNA":
+                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
+                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
+
+        self.memBacked = True
+        self.memcon.row_factory = sqlite.Row
+        self.memcon.commit()
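+
+# A minimal usage sketch (hypothetical file name; construction follows
+# distalPairs.py below and assumes the reads have already been loaded
+# and indexed):
+#     rds = readDataset("test.rds", verbose=True)
+#     print rds.getMetadata()
+#     print rds.getChromosomes()
+#     print rds.getCounts(chrom="chr1", uniqs=True, multi=True)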
diff --git a/crossmatch.py b/crossmatch.py
new file mode 100755 (executable)
index 0000000..6a36758
--- /dev/null
@@ -0,0 +1,38 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+from cistematic.core.orthomatcher import orthoMatcher
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "version 1.1"
+    if len(argv) < 7:
+        print "usage: python %s prefix directory genome1 genefile1 genome2 genefile2 [genome3 genefile3 .....]" % argv[0]
+        sys.exit(1)
+
+    prefix = argv[1]
+    directory = argv[2]
+    matchFiles = {}
+
+    genomesToMatch = (len(argv) - 3) / 2
+    for index in range(genomesToMatch):
+        genome = argv[3 + index * 2]
+        print genome
+        if genome not in matchFiles:
+            matchFiles[genome] = []
+
+        matchFiles[genome].append(argv[4 + index * 2])
+
+    print matchFiles
+    orthoMatcher(matchFiles, prefix, directory, fileList=True)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/distalPairs.py b/distalPairs.py
new file mode 100755 (executable)
index 0000000..d24781a
--- /dev/null
@@ -0,0 +1,133 @@
+#
+#  distalPairs.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 10/14/08.
+#
+
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+from commoncode import readDataset
+import sys, time, optparse
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "%prog: version 3.3"
+    print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
+    usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    minDist = int(args[0])
+    rdsfile = args[1]
+    outfilename = args[2]
+
+    distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
+
+
+def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
+    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    if not RDS.hasIndex():
+        print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
+        sys.exit(1)
+
+    if cachePages > RDS.getDefaultCacheSize():
+        RDS.setDBcache(cachePages)
+
+    print time.ctime()
+
+    if doSplices:
+        print "getting splices"
+        splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+        print "got splices"
+
+    print "getting uniq reads"    
+    uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
+    print "got uniqs"
+
+    if doSplices:
+        for readID in splicesDict:
+            theRead = splicesDict[readID]
+            read0 = theRead[0]
+            del read0[1]
+            try:
+                uniqDict[readID].append(read0)
+            except:
+                if len(theRead) == 4:
+                    read2 = theRead[2]
+                    del read2[1]
+                    uniqDict[readID] = [read0,read2]
+
+    if doVerbose:
+        print len(uniqDict), time.ctime()
+
+    outfile = open(outfilename,"w")
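+    # output columns (as written below): readID, chrom1, start1, sense1, chrom2,
+    # start2, sense2 for pairs on different chromosomes; readID, chrom, start1,
+    # sense1, start2, sense2, dist for same-chromosome pairs further apart than minDist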
+
+    diffChrom = 0
+    distal = 0
+    total = 0
+    for readID in uniqDict:
+        readList = uniqDict[readID]
+        if len(readList) == 2:
+            total += 1
+            (start1, sense1, chrom1, pair1) = readList[0]
+            (start2, sense2, chrom2, pair2) = readList[1]
+
+            if chrom1 != chrom2:
+                diffChrom += 1
+                if sameChromOnly:
+                    continue
+                else:
+                    outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
+                    outfile.write(outline + "\n")
+                    if doVerbose:
+                        print diffChrom, outline
+            else:
+                dist = abs(start1 - start2)
+
+                if minDist < dist < maxDist:
+                    distal += 1
+                    outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
+                    outfile.write(outline + "\n")
+                    if doVerbose:
+                        print distal, outline
+
+    outfile.write("#distal: %d\tdiffChrom: %d\tpossible: %d\n" % (distal, diffChrom, total))
+    total = float(total)
+    if total < 1:
+        total = 1.
+
+    outfile.write("#distal %2.2f pct\tdiffChrom %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total)))
+    outfile.close()
+    print "distal: %d\tdiffChrom: %d\tpossible: %d" % (distal, diffChrom, int(total))
+    print "distal: %2.2f pct\tdiffChrom: %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total))
+    print time.ctime()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/docs/ERANGE.copyright b/docs/ERANGE.copyright
new file mode 100644 (file)
index 0000000..ac0d4fd
--- /dev/null
@@ -0,0 +1,29 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2007-09 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
diff --git a/docs/README.build-rds b/docs/README.build-rds
new file mode 100644 (file)
index 0000000..ef668d2
--- /dev/null
@@ -0,0 +1,328 @@
+This is a description of the sqlite-based read storage 
+files and of the scripts designed to import read 
+mappings from supported short read mappers.  The code 
+should run on any Unix-like system supporting python 2.5 
+or better. The code is developed on Linux and MacOS X on 
+python 2.5.
+
+This code is made available as open-source, as described 
+in the copyright file ERANGE.COPYRIGHT.
+
+1. REQUIREMENTS
+2. COMMAND LINE OPTIONS
+3. CREATING THE NECESSARY INPUT (RDS) FILES
+4. BUILDING EXPANDED GENOMES
+5. MAPPING READS WITH ELAND
+6. MAPPING READS WITH BOWTIE
+7. MAPPING READS WITH BLAT
+8. IMPORTING BED FILES
+9. COMBINING RDS FILES
+10. MANIPULATING RDS METADATA AND CACHING
+11. VISUALIZING THE DATA IN RDS FILES
+
+
+1. REQUIREMENTS 
+
+See README.chip-seq or README.rna-seq to see the requirements 
+for installing and running ERANGE specific to each 
+application. 
+
+
+2. COMMAND LINE OPTIONS
+
+You can find out more about the settings for each script
+by typing:
+
+python $ERANGEPATH/<scriptname> 
+
+to see the command line options, where ERANGEPATH is the 
+environmental variable set to the path to the directory 
+holding the ERANGE scripts. Note that the command line 
+options are case sensitive and that incorrectly specified 
+options may fail silently. 
+
+
+3. CREATING THE NECESSARY INPUT (RDS) FILES
+
+Before you can use the rest of the ERANGE scripts to do 
+CHiP-seq or RNA-seq analyses, you will need to first 
+convert your read mappings to the native ERANGE read 
+storage format, which is sqlite-based, and which is 
+called RDS (Read DataSet). RDS files consist of four 
+tables:
+- metadata (tracks required and optional metadata)
+- uniqs (stores uniquely mappable-reads)
+- multi (stores reads that map equally well to multiple 
+locations in the genome)
+- splices (stores split reads)
+
+A readDataset python object (in commoncode.py) provides 
+the encapsulation of the read database which is accessed 
+through specific methods. Since an RDS file is a sqlite3 
+database, you can additionally use any of the sqlite-based 
+tools to look at the reads in the tables, if you wish to 
+do so.
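+
+For example, the sqlite3 command-line shell (or python's 
+sqlite3 module) can be used to peek at a hypothetical RDS 
+file named test.rds:
+
+sqlite3 test.rds "select name, value from metadata"
+sqlite3 test.rds "select chrom, start, stop, sense from uniqs limit 5"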
+
+You will need to first map your reads with one of the 
+supported read mappers (see next paragraph) against a copy 
+of the appropriate genome. For ChIP-seq, it will be your 
+genome of interest, whereas for RNA-seq reads should be 
+mapped against an expanded genome, which consists of 
+chromosomes + splice junctions which depend on the read 
+length used. Note that several parts of the code assume 
+that your genomic sequences are labelled with the "chr" 
+chromosome prefix. For more information on creating 
+expanded genomes, see BUILDING EXPANDED GENOMES.
+
+The currently supported read mappers are:
+- Eland (part of the Illumina GA pipeline)
+- Bowtie (bowtie-bio.sourceforge.net)
+- Blat (from UCSC)
+
+These are described in the sections on MAPPING READS WITH 
+ELAND, MAPPING READS WITH BOWTIE, MAPPING READS WITH BLAT. 
+
+For ChIP-seq, you can also import bed files of unique reads 
+only using makerdsfrombed.py .
+
+Also see MANIPULATING RDS METADATA AND CACHING to learn about 
+some important aspects of working with RDS files.
+
+
+4. BUILDING EXPANDED GENOMES
+
+For RNA-seq using ELAND or BOWTIE mappings, you will need to build 
+an expanded genome consisting of genomic sequences, spike sequences, 
+and splice-spanning sequences in order to run ERANGE on your own 
+datasets. This expanded genome is specific to the read size used, 
+i.e. there will be a different expanded genome for mouse when using 
+25bp reads or 32bp reads. For reads longer than 32 bp, we recommend 
+using BOWTIE. If your reads are longer than 50bp, consider using 
+BLAT instead.
+
+Download the chromosomes from UCSC, as well as the knownGene.txt (or 
+equivalent table) and a directory of repeatmask annotations for each 
+chromosome (also from UCSC) for your genome of interest.
+
+You will need to build a splice fasta file using the script 
+getsplicefa.py, which needs Cistematic, the knownGene table, and a 
+parameter for the splice radius, which is 4 bp shorter than the length 
+of the reads (e.g. a splice radius of 32 for 36 bp reads).
+
+Once you have the splice fasta file, drop it into the same directory 
+as well as a fasta file for your spikes. Then use squashGenome 
+(part of Eland) or bowtie-build (part of Bowtie), to build the 
+expanded genome. Please refer to the documentation for each 
+package to run the genome squasher/builder.
+
+You will also build a repeat database using buildrmaskdb.py for use 
+in the candidate exon analysis from UCSC repeatmasker annotations.
+
+
+5. MAPPING READS WITH ELAND
+
+Please refer to the Illumina documentation for the details on 
+running squashGenome and Eland. If you do not have access to the 
+Illumina pipeline, use bowtie as described in the next section.
+
+For ChIP-seq, you could take the output of the Illumina pipeline, 
+e.g. eland_multi.txt or eland_extended.txt and use them as inputs 
+for makerdsfromeland2.py .
+
+Once you have run Eland with the --multi option (which we 
+colloquially call "eland2") for each RNA-seq lane against the 
+expanded genome, combine all of the outputs for one sample into a 
+single file e.g. test.comb.eland2
+
+The makerdsfromeland2.py script is used to import the reads 
+into RDS:
+
+python makerdsfromeland2.py label infilename outrdsfile [-append] [-RNA ucscGeneModels] 
+[propertyName::propertyValue] [-index] [-paired 1 or 2] [-extended] [-verbose] 
+[-olddelimiter] [-maxlines num] [-cache numPages]
+
+The first 3 arguments are required:
+- label is any label that you wish (a combination flowcell+lane# 
+is a good choice) 
+- infilename is the output of eland in eland_multi format 
+(default) or eland_extended format (with the -extended flag)
+- outdbname is the name of the rds file, e.g. test.rds
+
+If the reads are from paired-end runs, enter each eland_multi 
+(or extended) file separately with the "-paired 1" or "-paired 2" 
+flag, as appropriate.
+
+If entering more than one lane, use -append for all subsequent 
+lanes. Upon entering the last lane, use -index to build a read 
+index. Refer to MANIPULATING RDS METADATA AND CACHING for 
+information on the optional property::value pairs and caching.
+
+For RNA-seq, you must in addition specify the path to knownGene.txt 
+using the -RNA flag, e.g.
+
+python $ERANGEPATH/makerdsfromeland2.py myRNAlabel myRNA.eland_multi.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+
+
+6. MAPPING READS WITH BOWTIE
+
+Bowtie (bowtie-bio.sourceforge.net) is a new read-mapper that 
+is very fast and friendly. ERANGE supports version 0.10.X 
+and higher that allow you to control how many multireads 
+are reported. We recommend the following settings:
+
+$BOWTIEDIR/bowtie zzz -v 2 -k 11 -m 10 -t --strata --best -f s1.query32.txt --un s1.unm.fa --max s1.max.fa s1.zzz.bowtie.txt
+
+where zzz is the genome prefix that you gave when building the 
+genome. In particular, we ask bowtie to map all multireads up 
+to 11 ("-k") with up to 2 mismatches ("-v" and "--best"), however 
+we will only import all multireads up to 10x multiplicity ("-m").
+Note that bowtie is multithreaded and can use multiple cpu based 
+on the -p flag (e.g. use "-p 4" to use 4 CPUs). Unmapped reads 
+are saved in unmapped.fa for later analysis.
+
+Once reads are mapped, they can be imported using:
+
+python $ERANGEPATH/makerdsfrombowtie.py testLabel s1.mm9.bowtie.txt bowtietest.rds
+
+The options for the script are:
+
+python makerdsfrombowtie.py label infilename outrdsfile 
+[-RNA ucscGeneModels]  [-append]  [-index] [propertyName::propertyValue] 
+[-rawreadID] [-verbose] [-cache numPages]
+
+Refer to "MAPPING READS WITH ELAND" for a description of label, 
+infilename, outdbname, '-append', '-index', and '-cache'.
+
+****REMEMBER TO USE -index WHEN LOADING THE LAST LANE OF YOUR 
+DATASET.****
+
+The script assumes that the read ID are from Illumina, i.e. that 
+they have multiple fields separated by ':' and that paired-end 
+reads have an additional '/1' or '/2' depending on the end. 
+It will by default strip the first part of the readID (up to the 
+first ':') and replace it with the label. If you want raw readIDs
+because you mapped raw reads that do not have an associated ID or 
+an ID that doesn't follow Illumina's conventions, use -rawreadID.
+
+If not using Illumina readIDs, use any identifier of the format
+
+throw_away:uniqueid if unpaired
+throw_away:uniqueid/1 and throw_away:uniqueid/2 for paired-ends.
+
+For RNA-seq, you must in addition specify the path to knownGene.txt 
+using the -RNA flag, e.g.
+
+python $ERANGEPATH/makerdsfrombowtie.py myRNAlabel myRNA.bowtie.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+
+
+7. MAPPING READS WITH BLAT
+
+BLAT SUPPORT IN ERANGE IS STILL UNDER DEVELOPMENT AND THE 
+SCRIPTS AND SETTINGS BELOW MAY BE OPTIMIZED FURTHER IN 
+FUTURE RELEASES OF ERANGE.
+
+Reads longer than 40-50bp can be fruitfully mapped with BLAT 
+against the reference genome without needing to provide the 
+exon junctions. While BLAT is much slower than BOWTIE, it 
+has the great advantage of seeing novel splices (i.e. 
+splices not present in knownGene models).
+
+We use the following settings to map 75bp reads with BLAT and 
+filter them with pslReps: 
+
+$BLATPATH/blat /tmp/hg18.fa s3_1.query75.txt -out=pslx s3_1.hg18.blat
+$BLATPATH/pslReps -minNearTopSize=70 s3_1.hg18.blat s3_1.hg18.blatbetter s3_1.blatpsr
+
+where the binaries are in $BLATPATH anywhere on your system.
+
+Once the reads have been filtered, the makerdsfromblat.py 
+script is used to import the mapped reads (in the example 
+above s3_1.hg18.blatbetter) into RDS:
+
+python makerdsfromblat.py label infilename outrdsfile [-append] [-index] [propertyName::propertyValue] 
+[-rawreadID] [-forceRNA]  [-flag] [-strict minSpliceLen] [-spliceonly] [-verbose] [-cache numPages]
+
+If you are using BLAT for RNA-seq, please be sure to use
+-forceRNA in order to import spliced reads and consider 
+using -strict to require a minimum length of bases on 
+each side of the splice. 
+
+You can combine BOWTIE and BLAT by mapping reads with BOWTIE 
+first, and then using BLAT to map the unmapped reads. In 
+that case, you may want to only load the spliced reads 
+using the -spliceonly flag. To track those reads in the RDS 
+file, use -flag ; you can then retrieve those reads using 
+the options "-flag blat -flagLike" with the makebedfromrds.py 
+script.
+
+
+8. IMPORTING BED FILES
+
+If you do not have the raw read data, you can import unique 
+reads only using the script makerdsfrombed.py . Note that 
+this is not particularly useful for RNA-seq since you will 
+have neither the multireads nor the spliced reads.
+
+The command line options are similar to those for other 
+scripts described in part 5-7:
+
+python makerdsfrombed.py label bedfile outrdsfile [-append] [-index] [propertyName::propertyValue] [-cache numPages]
+
+
+9.  COMBINING RDS FILES
+
+Previously created RDS files can be combined into a new RDS 
+dataset using the combinerds.py command with the granularity 
+of importing all tables or specific ones (e.g. uniqs, splices).
+
+The combinerds.py command options are:
+
+python combinerds.py destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [-init] [-initrna] [-index] [-cache pages]
+
+
+10. MANIPULATING RDS METADATA AND CACHING
+
+One of the advantages of RDS over bed is the possibility of 
+attaching arbitrary sets of annotations with the data, which 
+are then carried along. Both the makerds* scripts and 
+rdsmetadata.py allow you to enter key::value 
+combinations. Entering a key multiple times will cause the 
+same instance to be recorded multiple times, which is 
+appropriate in some settings (e.g. to enter flowcell info). 
+In addition rdsmetadata.py allows you to inspect various 
+attributes of your RDS files such as # of reads and size 
+of the default cache size.
+
+Sqlite files have a certain amount of RAM set aside as cache 
+for lookups, indexes, etc.... where the amount is measured in 
+1.5kb pages. Each RDS instance comes with a default of 100000 
+pages (150MB) of cache, which is needlessly small in most 
+situations. Whenever appropriate, try using more cache (e.g. 
+750000 pages on a 2GB RAM machine, much more if more RAM is 
+available) for a significant speed increase in indexing and 
+lookups. You can change the default value for each RDS file 
+by using the -defaultcache option of rdsmetadata.py.
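+
+For example (a sketch; the file name is illustrative), to raise the 
+default cache of an existing RDS file to 750000 pages:
+
+python rdsmetadata.py myChip.rds -defaultcache 750000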
+
+Note that sqlite can be very slow over NFS. Wherever 
+possible, copy your RDS file locally before running an I/O 
+intensive script.
+
+
+11. VISUALIZING THE DATA IN RDS FILES
+
+You can output bed-files of the raw reads using 
+makebedfromrds.py. A more practical way to look at the data 
+might be to output it as a bedGraph file using makewiggle.py . 
+
+Note that UCSC has a hard limit on the size of their files 
+and you will likely need to break the wiggles on a per-chromosome 
+basis for mammalian genomes.
+
+RELEASE HISTORY
+
+version 3.2    October  2009 - added combinerds.py
+version 3.01   February 2009 - bug fixes
+version 3.0    January  2009 - added logging to buildrdsfrom*
+version 3.0rc1 December 2008 - added blat support
+
+
diff --git a/docs/README.chip-seq b/docs/README.chip-seq
new file mode 100644 (file)
index 0000000..6529a6f
--- /dev/null
@@ -0,0 +1,232 @@
+This is an updated version of the core of the ChIP-seq 
+analysis code described in Johnson et al (2007).  It 
+should run on any Unix-like system supporting python 2.5 
+or better. The code is developed on Linux and MacOS X on 
+python 2.5.
+
+The scripts in the ChIPSeqMini package are now part of 
+the ERANGE package, but are still available as a 
+standalone package for now.
+
+This code is made available as open-source, as described 
+in the copyright file ERANGE.COPYRIGHT.
+
+
+1. REQUIREMENTS
+2. COMMAND LINE OPTIONS
+3. MAKING THE NECESSARY INPUT (RDS) FILES
+4. WEIGHING MULTIREADS
+5. RUNNING THE PEAK FINDER
+6. DISPLAYING DATA ON THE UCSC GENOME BROWSER
+7. DOWNSTREAM ANALYSES
+
+
+1. REQUIREMENTS
+
+1) Python 2.5 is required because some of the scripts and 
+Cistematic (see below) need pysqlite, which is now bundled in 
+Python.
+
+2) You will also need to use Cistematic 2.3 (available at 
+cistematic.caltech.edu) for all of the scripts that are 
+part of the downstream analyses.
+
+(optional) Use of the psyco module (psyco.sf.net) on 32-bit 
+Linux or Mac Intel machines is highly recommended.
+
+(optional) Three visualization scripts also depend on the 
+additional package pylab (matplotlib). These scripts are:
+- getgosig.py
+- plotbardist.py
+- scatterfields.py 
+You do not need to install pylab if you will be 
+visualizing some of your analysis results differently.
+
+
+2. COMMAND LINE OPTIONS
+
+You can find out more about the settings for each script
+by typing:
+
+python $ERANGEPATH/<scriptname> 
+
+to see the command line options, where ERANGEPATH is the 
+environmental variable set to the path to the directory 
+holding the ERANGE scripts. Note that the command line 
+options are case sensitive and that misspelled options 
+may well fail silently, i.e. they are typically ignored.
+
+
+3. MAKING THE NECESSARY INPUT (RDS) FILES
+
+You will want to first convert your read mappings to the 
+native ERANGE read store. Please see the file 
+README.build-rds for instructions on how to do this.
+
+Build an RDS file for the ChIP and, if available and 
+appropriate, for the control. Note that we *HIGHLY* recommend 
+the use of a matched control sample to account for some 
+of the general background artifacts that can be present 
+in ChIP-seq samples (e.g. DNase hypersensitivity, 
+assembly collapse of some satellite repeats, etc....). 
+
+
+4. WEIGHING MULTIREADS
+
+Version 3.0 of the peak finder can use multireads, i.e. 
+reads that map equally well to more than one location 
+in the genome, to find binding sites that are in low 
+copy-number non-unique regions (typically less than 10).
+
+ERANGE offers 3 ways to analyze these regions:
+(a) default weighing of 1/multiplicity
+(b) ignoring multireads
+(c) weighing of multireads based on unique reads in a 
+given radius 
+
+(a) is the default in the current release of ERANGE. 
+Simply proceed to RUNNING THE PEAK FINDER for (a) and 
+(b). You can ignore multireads (b) by using the -nomulti 
+flag with findall.py. For (c), use weighMultireads.py 
+to weigh multireads based on the unique reads in the 
+respective radius of each potential location. Once run, 
+proceed to the section below.
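+
+For example (a sketch; label and file names are illustrative), option 
+(b) is simply the standard invocation from the next section with 
+-nomulti added:
+
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -listPeak -revbackground -nomulti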
+
+
+5. RUNNING THE PEAK FINDER
+
+To run the peak finder without read shifting, use the 
+following command:
+
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -listPeak -revbackground
+
+which will run the peak finder on chip.rds / control.rds , 
+store the enriched region coordinates in chip.regions.txt, 
+also store the actual local maximum in each region in the 
+same file, and also calculate an FDR by running the 
+finder on control.rds / chip.rds . 
+
+A log file (findall.log by default, change with -log) 
+tracks the settings used to run the program as well as 
+some of the summary statistics, which are also stored 
+at the bottom of the regions.txt output file.
+
+findall.py is tuned to conservative settings for 10-12M 
+mappable read IPs of static, sequence-specific 
+transcription factors in mammals with very short 
+fragment sizes, on the order of 40-60 bp. 
+
+You will *NEED* to change some of the default parameters 
+if working in smaller genomes (e.g. use smaller -spacing), 
+if working with certain types of IPs such as histones and 
+polymerases (test with and without -notrim and 
+-nodirectionality), if working with rather weak IPs
+(e.g. -minimum and -ratio), or if working with larger 
+fragment sizes (see the paragraph below discussing read 
+shifting). 
+
+findall.py returns a per-peak p-value. By default, this 
+is calculated using a Poisson distribution of peak RPMs 
+(or counts, if using -raw) for each chromosome in the IP. 
+P-value calculations can be turned off using 
+'-pvalue none '. Alternatively, the p-value can be 
+calculated from the background using the option 
+'-pvalue back ', which must be combined with the option 
+-revbackground.
+
+By default, findall.py does not try to adjust the location 
+of the reads based on half the size of the expected fragment 
+length (the "shift"). If you believe that you need to shift 
+your peaks, findall.py can try to pick the best shift based 
+on the best shift for strong sites using the parameter 
+'-shift learn '. You can also either manually specify a 
+shift value using '-shift #bp ' or you can calculate a 
+"best shift" for each region using '-autoshift'. If you 
+need to use the shift options, the recommended usage is:
+(i) first run findall.py with '-shift learn ', which will 
+pick a shift if there are at least 30 regions that meet 
+its training criteria.
+(ii) if (i) couldn't pick a shift, run findall.py with 
+-autoshift and -reportshift
+(iii) look at the mode (most common #) for the shift
+(iv) rerun findall.py with -shift #bp where #bp is the mode
+  
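+For example, a hedged sketch of that workflow (label and file names 
+are illustrative):
+
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -shift learn
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -autoshift -reportshift
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -shift 60
+
+where 60 stands in for whatever modal shift the second run reported.
+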
+If you are storing the RDS files on a network-mounted 
+directory, make sure to use '-cache XXXXX' to enable 
+local caching, where XXXXX is as large as appropriate, as 
+described in section 10 of README.build-rds . 
+
+Note that ERANGE will cache by default to /tmp, but this 
+can be redirected to any directory pointed to by the 
+environmental variable CISTEMATIC_TEMP.
+
+To find out the current default settings and options, 
+simply type:
+
+python $ERANGEPATH/findall.py
+
+for more information.
+
+
+6. DISPLAYING DATA ON THE UCSC GENOME BROWSER
+
+You can output bed-files of the raw reads using 
+makebedfromrds.py and a bedGraph file using 
+makewiggle.py, as described in README.build-rds .
+
+You can create bed files of regions and sites (see 
+below) using regiontobed.py and makesitetrack.py .
+
+
+7. DOWNSTREAM ANALYSES
+
+Recall that Cistematic 2.3 is required to do motif 
+and gene-level analyses of the output of findall.py.
+
+Use getallgenes.py to find the nearest gene within a 
+radius of each binding site.
+
+Use analyzego.py to do a Gene Ontology enrichment 
+analysis of a gene list (such as from getallgenes.py). 
+You can look at a heatmap of your GO enrichments using 
+getgosig.py. You can also use getGOgenes.py to look at 
+the genes with particular GO annotations.
+
+To do motif-finding, use getfasta.py to get the sequences 
+centered on the peaks of your regions of interest. For 
+the sake of a pleasant experience, try limiting yourself 
+to less than 100kb of combined sequence (the easiest way 
+being to pick the regions with the strongest signals).
+
+Once you have a fasta file of the regions of interest, you 
+can use findMotifs.py to find motifs using either 
+cisGreedy (bundled with Cistematic 2.2), which is good for 
+shorter motifs, or Meme (must be installed separately - 
+refer to the instructions on cistematic.caltech.edu for 
+more information), which is better for longer motifs. 
+findMotifs.py will return a set of motifs in Cistematic format 
+with a .mot extension. These motifs can then be used with 
+getallsites.py to get the coordinates and instances of each 
+motif in all of the regions found by the peak finder.
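+
+For example (a sketch; the label and fasta file name are illustrative), 
+to run cisGreedy on the sequences produced by getfasta.py:
+
+python $ERANGEPATH/findMotifs.py chipMotifs chip.top.fsa --cisGreedy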
+
+The sites can be checked against repeat-masker annotations 
+(preloaded from UCSC with buildrmaskdb.py) using 
+checkrmask.py. The sites for each motif can also be fed 
+back into getallgenes.py to get genes, redo the GO analyses, 
+etc....
+
+You can use the intersect scripts (intersects.py, 
+gointersects.py, and siteintersects.py) to compare different 
+sets of genes/GO/site results across multiple experiments, 
+for example.
+
+
+RELEASE HISTORY
+
+version 3.1    February 2009 - support for read shifting
+version 3.0    February 2009 - support for UCSC narrowPeak format in regiontobed.py
+version 3.0rc1 December 2008 - added parameter to control peak-trimming
+version 3.0b2  December 2008 - added per-peak p-value
+version 3.0b   November 2008 - initial release of RDS-based code 
+with support for eland and bowtie.
+
diff --git a/docs/README.rna-esnp b/docs/README.rna-esnp
new file mode 100644 (file)
index 0000000..fbb2b96
--- /dev/null
@@ -0,0 +1,75 @@
+This is a description of the pipeline designed to analyze single 
+nucleotide changes found in the mapped reads. The code should run 
+on any Unix-like system supporting python 2.5 or better. The code 
+is developed on MacOS X on python 2.5.
+
+1. COMMAND LINE OPTIONS
+2. BUILDING THE SNP DATABASE
+3. RUNNING THE SNP PIPELINE
+
+
+1. COMMAND LINE OPTIONS
+
+To find out more about the settings for each script, type:
+
+python $ERANGEPATH/<scriptname> 
+
+to see the command line options. Note that all ERANGE command-line 
+options are case-sensitive & that the scripts typically ignore 
+command-line arguments that they do not recognize!
+
+
+2. BUILDING THE SNP DATABASE
+
+In order to check the candidate SNPs versus known SNPs, you will need 
+to first download the corresponding dbSNP database file from UCSC and 
+then build a sqlite version of it using:
+
+python $ERANGEPATH/buildsnpdb.py ucscSNPfile outdb
+
+e.g.
+
+python buildsnpdb.py snp128.txt dbSNP128
+
+
+3. RUNNING THE SNP PIPELINE
+
+The runSNPAnalysis.sh shell script is designed to retrieve SNPs, filter
+them against repeat annotations, cross-check them against known SNPs and
+annotate the novel SNPs. It will automatically run a set of python scripts 
+that are required for the SNP analysis using the RDS (Read DataSet) file. 
+This script assumes the existence of a known SNP database as described in 
+the previous section, as well as a repeatmask database.
+
+Usage: $ERANGEPATH/runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
+
+where ERANGEPATH is the environmental variable set to the path to the directory holding the ERANGE scripts.
+
+Parameters:
+- genome: the name of the organism in the analysis.
+- rdsfile: read DataSet file. See README.build-rds for 
+more information.
+- label: the file name of your choice for the analysis.
+- rmaskdbfile: repeat mask database, a sqlite database file. See 
+README.rna-seq for more information on creating the database.
+- dbsnpfile: dbsnp database, a sqlite database file, built from the 
+dbSNP database text file from UCSC. Please see section 2 above for 
+building the dbSNP sqlite database using buildsnpdb.py .
+- uniqStartMin: the minimum number of unique reads supporting a 
+SNP at base s (i.e. independent reads supporting the base change at s).
+5 is a good number to start with.
+- totalRatio: the ratio of the number of reads supporting an 
+expressed SNP at s and the total read coverage at s . 0.75 should allow 
+you to get the homozygous SNPs.
+- rpkmfile: rpkm file can be generated using the RNA-seq pipeline as 
+described in README.rna-seq.  If you do not have that file, you can 
+set it to NONE.
+- cachepages: cache pages. Make sure to use as much caching as your 
+system will accommodate. See README.build-rds for more information.
+
+Example: $ERANGEPATH/runSNPAnalysis.sh mouse 24T4spike.rds 24Tspike rmask.db dbSNP128.db 5 0.75 c2c12rna.24R.final.rpkm 5000000 
+
+version 3.0    January  2009 - logging
+version 3.0rc1 December 2008 - major rewrite and speed-up of getSNPs.py and chksnp.py
+version 3.0b2  December 2008 - bug fixes & ERANGEPATH variable
+
diff --git a/docs/README.rna-seq b/docs/README.rna-seq
new file mode 100644 (file)
index 0000000..5a866f3
--- /dev/null
@@ -0,0 +1,267 @@
+The latest version of this software is available at 
+
+http://woldlab.caltech.edu/rnaseq
+
+please check the website for updates.
+
+This is the core of the RNA-seq analysis code described in Mortazavi 
+et al (2008). Please make sure that you have read Figure 3 and the 
+methods / supplemental methods of that paper before attempting to 
+use this package for RNA-Seq data analysis. 
+
+ERANGE should run on any Unix-like system supporting python 2.5 or 
+better. The code is developed on Linux and MacOS X on python 2.5. 
+
+Historically, the code for ERANGE grew out of the ChIPSeqMini 
+package from Johnson et al (2007), and some of the key scripts 
+(findallnocontrol.py and getallgenes.py) are shared between the two. 
+This is why ERANGE is "dual-use" and is also why the code for both 
+analyses was kept in common as much as possible. This should be 
+helpful when someone tries to combine ChIP-seq and RNA-seq 
+analyses!
+
+This code is made available as open-source, as described in the 
+copyright file ERANGE.COPYRIGHT.
+
+1. SETTING EXPECTATIONS
+2. REQUIREMENTS
+3. COMMAND LINE OPTIONS
+4. DISPLAYING DATA
+5. ANALYSIS
+6. PIPELINE
+7. CUSTOM CISTEMATIC GENOME ANNOTATIONS
+8. PAIRED-END RNA-SEQ ANALYSIS
+9. EXPRESSED SNP ANALYSIS
+
+1. SETTING EXPECTATIONS
+
+ERANGE is not a point-and-click, turn-key package. 
+
+It is a set of python scripts that, when run in order as a pipeline 
+on the "right" input, will take read data in RDS format and 
+calculate gene expression levels in RPKM (Reads Per kb per Million 
+reads). This pipeline for unpaired reads is embodied in a shell 
+script called runStandardAnalysis.sh, which only takes a few inputs, 
+described in the ANALYSIS and PIPELINE section below.
+
+You should be able to download the data from our website and run the 
+analysis through the pipeline. You will need to map the reads and 
+import them into an RDS dataset as described in README.build-rds.
+
+Because you will likely want to run this package on other genomes 
+(or builds) than the one described in our original paper, you will 
+need to do several additional steps, such as:
+
+- build expanded genomes with splices and spikes
+- check overlap of RNAFAR predictions with repeats
+
+This will require some comfort with running and, if necessary, 
+editing scripts. While the code is sparsely documented, we are 
+making it available so that you can *read it*. We'll be happy to 
+help modifying and updating the code within a reasonable extent 
+and will try to provide more in depth documentation and tutorials 
+on our web site.
+
+While the scripts produce several forms of RPKM, we suggest that 
+the "final" RPKM are the values that most people will be interested 
+in.
+
+*WARNING* A couple of these scripts are pretty memory hungry. If 
+you are going to analyze datasets with > 20M reads or reads with 
+high error rates, you will easily need > 8 GB RAM. We'll rewrite 
+these scripts before releasing 3.0 final to lower the memory 
+footprint. 
+
+2. REQUIREMENTS
+
+1) Python 2.5+ is required because some of the scripts and 
+Cistematic (see below) need pysqlite, which is now bundled in 
+Python.
+
+2) You will also need to use Cistematic 3.0 for some of the scripts 
+marked below that use genes and genomic sequence; in particular, you 
+will also likely need the Cistematic version of the genomes, unless 
+providing your own custom genome and annotations.
+
+Cistematic is available at http://cistematic.caltech.edu 
+
+3) You will need genomic sequences to build the expanded genome, as 
+well as gene models from UCSC. 
+
+(Optional) Python is very slow on large datasets. Use of the psyco 
+module (psyco.sf.net) on 32-bit Linux or Mac Intel machines is 
+highly recommended to significantly speed up runtime.
+
+(Optional) Several of the plotting scripts also rely on Matplotlib, 
+which is available at matplotlib.sf.net.
+
+
+3. COMMAND LINE OPTIONS
+
+You can find out more about the settings for each python script by 
+typing:
+
+python $ERANGEPATH/<scriptname> 
+
+to see the command line options, where ERANGEPATH is the 
+environmental variable set to the path to the directory 
+holding the ERANGE scripts.
+
+
+For example, if you wanted to know the command line options of the 
+script used to generate supplementary datasets 2-4, combineRPKMs.py , 
+you would type:
+
+python $ERANGEPATH/combineRPKMs.py
+
+and get back a version number and all possible command line options:
+
+version 1.0
+usage: python $ERANGEPATH/combineRPKMs.py firstRPKM expandedRPKM finalRPKM combinedOutfile [-withmultifraction]
+
+where fields in brackets are optional.
+
+
+4. DISPLAYING DATA 
+
+You can output bed-files of the raw reads in the RDS file 
+using makebedfromrds.py and  WIG file using makewiggle.py as 
+described in README.build-rds .
+
+
+5. ANALYSIS
+
+The main steps of a typical, unpaired analysis using ERANGE 
+are shown in RNA-seq.analysisSteps.txt, where each script 
+would be run in order, with the caveat that there are two 
+ways to do the candidate exon analysis (RNAFAR), creatively 
+called "alternative 1" and "alternative 2". 
+
+In alternative 1, we use reads that did not match an existing gene 
+model to identify candidate regions:
+
+# Alternative 1: find new regions outside of gene models with reads piled up 
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# Alternative 1: filter out new regions that overlap repeats more than a certain fraction 
+#                use "none" if you don't have a repeatmask database
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+
+In alternative 2, we pool multiple RNA-seq datasets into a single 
+RDS database, run it through the two scripts of alternative 1 above, 
+and then use these precomputed candidates to count reads falling in 
+these regions:
+
+# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
+python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good
+
+Alternative 1 is the one used by the pipeline script described below.
+
+The scripts will generate a set of intermediate files, the most 
+interesting of which are the final RPKM values. These will be in the 
+following files for the test example:
+
+test.firstpass.rpkm (the unique reads only)
+test.expanded.rpkm (the unique reads + spliced reads  + RNAFAR)
+test.final.rpkm (uniques + spliced + RNAFAR + multireads)
+
+
+6. PIPELINE
+
+IF YOU ARE STORING THE RDS FILE ON A NETWORK-MOUNTED DIRECTORY, 
+PLEASE ALSO READ SECTION 7.
+
+Most of the analysis steps described in the section above are 
+automated in a pipeline shell script called runStandardAnalysis.sh .
+Note that the pipeline assumes that it will call its own RNAFAR 
+regions, which is called "alternative 1" in the ANALYSIS section, 
+which is a good starting point. You can modify the pipeline script 
+to use alternative 2, if appropriate.
+
+The pipeline assumes that one RDS database containing the appropriate 
+uniq, multi, and spliced reads exists, as described in README.build-rds.
+
+We assume that Cistematic 2.3 is installed, including a version of 
+the appropriate Cistematic genome. You will need to build your own 
+Cistematic genome for any unsupported genome.
+
+We will also need a radius (e.g. 20000 bp) within which a candidate 
+exon will be consolidated with an existing gene.
+
+For example, for the test.rds dataset from the ANALYSIS section, we 
+would run the pipeline as:
+
+. $ERANGEPATH/runStandardAnalysis.sh mouse test ../mm9repeats/rmask.db 20001
+
+where ERANGEPATH is the environmental variable set to the path to 
+the directory holding the ERANGE scripts. Remember that you can 
+replace '../mm9repeats/rmask.db' with 'none' if you don't have a 
+repeatmask database.
+
+This could run from an hour to a whole day depending on how many 
+reads are involved (1M vs 80M) and how big a consolidation radius 
+is used. 
+
+
+7. CUSTOM CISTEMATIC GENOME ANNOTATIONS
+
+Cistematic 3.0 added support for generic genomes and loadable 
+(or alternative) annotations. While this support is still 
+experimental, the general idea is to take a GTF/GFF3 file, 
+convert it into the format that cistematic expects using 
+
+$ERANGEPATH/gfftocis.py infile.gff outfile.cis
+
+NOTE THAT YOU WILL MOST LIKELY HAVE TO EDIT THIS FILE TO 
+ACCOMMODATE YOUR SPECIFIC GFF FORMAT TO THE CISTEMATIC 
+FORMAT, WHICH IS
+
+geneID<tab>uniqRef<tab>chrom<tab>start<tab>stop<tab>sense<tab>type<return>
+
+where type is one of 'CDS','5UTR','3UTR'.
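+
+For example, a single (hypothetical) line of that format would look like:
+
+AGAP004677<tab>AGAP004677-RA<tab>2L<tab>157348<tab>157623<tab>+<tab>CDS
+
+where the values, including the sense encoding, are purely illustrative.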
+
+You can then run the standard analysis script with the additional 
+flag " -models outfile.cis ", e.g.
+
+. runStandardAnalysis.sh generic asteph none 1000 -models agambiae.base.cis
+
+Custom annotation support will be extended to other PIPELINE 
+scripts as part of 3.2 final.
+
+
+8. PAIRED-END RNA-SEQ ANALYSIS
+
+We are now experimentally supporting paired-end RNA-seq, as 
+implemented in the pipeline script runRNAPairedAnalysis.sh, which 
+is only provided as a "work-in-progress" snapshot.
+
+This is done primarily by marking all of the reads that map in a 
+known exon or a novel RNAFAR region in the RDS database, which 
+is a slow and time-consuming step (and is off by default for 
+single-ended RNA-seq). This mapping step is done without 
+accounting for paired-end information.
+
+The paired-end information is then used to connect RNAFAR 
+regions to known genes or to other RNAFAR regions using 
+reads with one end in a given region and the other end 
+in different (known or novel) region, as implemented in 
+rnafarPairs.py ; note that there is currently a default 
+limit of 500000 bp maximum distance between the two ends of a pair.
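+
+For example, runRNAPairedAnalysis.sh invokes this step as (with the 
+genome and rds prefix substituted, and the optional model flags omitted):
+
+python $ERANGEPATH/rnafarPairs.py genome prefix.newregions.good prefix.rds prefix.candidates.txt -cache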
+
+
+9. EXPRESSED SNP ANALYSIS
+
+ERANGE3 now supports SNP analysis in RNA-seq data as described 
+in README.rna-esnp .
+
+RELEASE HISTORY
+
+version 3.2    December 2009 - support for custom genome annotations with Cistematic 3.0
+version 3.1    April    2009 - modified normalizeFinalExonic.py to remove genome
+version 3.0    January  2009 - added logging to shell pipelines
+version 3.0rc1 December 2008 - added blat support
+version 3.0b2  December 2008 - bug fixes & ERANGEPATH variable
+version 3.0b   November 2008 - Support for paired end analysis
+version 3.0a    October 2008 - Preview release of ERANGE3.0
+version 2.0         May 2008 - First public release of ERANGE
+
diff --git a/docs/README.rnapath b/docs/README.rnapath
new file mode 100644 (file)
index 0000000..c64579b
--- /dev/null
@@ -0,0 +1,49 @@
+This is a description of the pipeline designed to do scaffolding 
+of fragmented genomes using RNA-seq. The code should run 
+on any Unix-like system supporting python 2.6 or better. The code 
+is developed on MacOS X on python 2.6.
+
+Note that RNAPATH is not currently optimized for running on machines with 
+small or medium amounts of RAM. 32 GB minimum is recommended for the current 
+version.
+
+1. COMMAND LINE OPTIONS
+2. MAPPING THE READS AND BUILDING THE RDS FILES
+3. GETTING THE SCAFFOLDING READS
+4. RUNNING RNAPATH.py
+
+
+1. COMMAND LINE OPTIONS
+
+To find out more about the settings for each script, type:
+
+python $ERANGEPATH/<scriptname> 
+
+to see the command line options. Note that all ERANGE command-line 
+options are case-sensitive & that the scripts typically ignore 
+command-line arguments that they do not recognize!
+
+
+2. MAPPING THE READS AND BUILDING THE RDS FILES
+
+Before running the RNAPATH script on a genome (assumed to be in fasta format), 
+you will need to first map the RNA-seq reads using BLAT and import those reads
+into an RDS file, as described in README.build-rds . 
+
+3. GETTING THE SCAFFOLDING READS
+
+Once you have an indexed RDS file, use the script distalPairs.py to output 
+the list of paired reads that do not map to the same contig. This involves 
+specifying a distance to distalPairs.py that is greater than the length of the 
+largest existing genomic contig. For example:
+
+python ../commoncode/distalPairs.py 20000 rna_on_genomic.rds rna_on_genomic.crosspairs -splices -cache 20000000
+
+4. RUNNING RNAPATH.py
+
+You can now run RNAPATH.py. I suggest optionally using the included 
+script processvelvet.py to rename the contigs before running blat and 
+generating the crosspair data.
+
+Example: $ERANGEPATH/rnapath/RNAPATH.py genomic_contigs.fa rna_on_genomic.crosspairs RNAPATH.log genome.RNAPATH.fa
+
+version 3.2    May  2010 - first release
+
diff --git a/docs/RNA-seq.analysisSteps.txt b/docs/RNA-seq.analysisSteps.txt
new file mode 100644 (file)
index 0000000..e9a5213
--- /dev/null
@@ -0,0 +1,87 @@
+# analysis steps for an ERANGE analysis of RNA-seq data
+# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
+
+# preliminary: set PYTHONPATH to point to the parent directory of Cistematic, e.g.
+#              export PYTHONPATH=/my/path/to/cistematic
+#
+# preliminary: set CISTEMATIC_ROOT to the directory that contains the genome directories (such as H_sapiens or M_musculus), e.g.
+#              export CISTEMATIC_ROOT=/my/path/to/cistematic_genomes
+#
+# preliminary: set ERANGEPATH, e.g. 
+#              export ERANGEPATH=/proj/genome/experiments/commoncode
+#
+# preliminary: set CISTEMATIC_TEMP to a local directory with ample space (default is /tmp), e.g. 
+#              export CISTEMATIC_TEMP=/any/local/dir
+#
+# preliminary: create splice file using getsplicefa.py with maxBorder set to 4 bp shorter than the read length, e.g.
+#              python $ERANGEPATH/getsplicefa.py hsapiens /my/path/to/human/knownGene.txt hg18splice32.fa 28
+#
+# preliminary: build expanded genome using Eland's squashGenome or Bowtie's bowtie-build (see README.build-rds)
+#              a slower alternative is to use blat just on the genome.
+#
+# preliminary: build repeatmask database using buildrmaskdb.py, e.g.
+#              python $ERANGEPATH/buildrmaskdb.py /path/to/hg19repeats /path/to/hg19repeats/rmask.db
+#              if you don't have a repeatmask database, just use "none" for the rmask database below
+
+# run bowtie on expanded genome or just blat on the regular genome
+# as described in README.build-rds
+#
+
+# create rds file with one lane's worth of data (add -index if using only one lane)
+# The example below sets the default cache to 1000000 
+# The name::value pairs are optional documentation metadata, and can be set to any desired name or value
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX  
+
+# can change a database cache size using rdsmetadata.py to speed up indexing and index-based lookups
+# rule of thumb for RNA-seq: set the cache size to half of the RAM on the computer
+#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds -defaultcache 2000000 -nocount
+
+# append more data (only add -index when adding last lane)
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 -append -index  
+
+# count the unique reads falling on the gene models ; the nomatch files are 
+# mappable reads that fell outside of the Cistematic gene models and not the 
+# unmappable of Eland (i.e, the "NM" reads)
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count -markGID -cache 1
+
+# count splice reads
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count -splices -noUniqs -cache 1
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm -cache
+
+# recount the unique reads with weights calculated during the first pass
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount -uniq -cache 1
+
+# There is a choice of either identifying new regions from the data alone 
+# (Alternative 1), or using a pre-computed list of new regions (presumably 
+# pooled from multiple nomatch.bed files, or literature) against the nomatch.bed
+# file (Alternative 2)
+
+# Alternative 1: find new regions outside of gene models with reads piled up 
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# Alternative 1: filter out new regions that overlap repeats more than a certain fraction 
+#                use "none" if you don't have a repeatmask database
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+
+# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
+#python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good
+
+# map all candidate regions that are within a 20kb radius of a gene in bp
+# take out -cache if running locally
+python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 -radius 20001 -trackfar -cache
+
+# calculate expanded exonic read density
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm -cache
+
+# create bed file of accepted candidate regions
+python2.5 $ERANGEPATH/regiontobed.py RNAFAR LHCN10213.accepted.rpkm RNAFAR.bed 255,0,0
+
+# weigh multi-reads
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count -accept LHCN10213.accepted.rpkm -multi -cache 1
+
+# calculate final exonic read density
+python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm -multifraction -withGID -cache
+
diff --git a/docs/buildMatrix.sh b/docs/buildMatrix.sh
new file mode 100644 (file)
index 0000000..9575071
--- /dev/null
@@ -0,0 +1,44 @@
+#!/bin/bash
+echo 'buildMatrix.sh: version 1.1'
+
+indexPrev=0
+indexCur=0
+
+truncateRPKM=""
+if [ $# -eq 3 ]; then
+    truncateRPKM="-truncate "$3
+fi
+
+if [ $# -eq 4 ]; then
+    truncateRPKM="-rescale -truncate "$3
+fi
+
+if [ $# -lt 2 ]; then
+       echo 'usage: buildMatrix.sh name datalist.file [truncateRPKM] [-rescale]'
+       echo
+       echo 'where the datalist file is a comma-delimited list of prefix and rds-files'
+       echo
+else
+       python $ERANGEPATH/recordLog.py buildMatrix.log buildMatrix.sh "with parameters: $1 $2 $truncateRPKM"
+       while read line
+       do 
+               prefix=`echo $line | cut -f 1 -d ','`
+               filename=$prefix.partcount
+               if [ -e $filename ]; then
+                       if [ $indexCur -lt 1 ]; then
+                               echo "building $1.step0"
+                               echo -e '\t' > $1.step0
+                               cut -f 1 $filename >> $1.step0
+                               indexCur=1
+                       fi
+                               python $ERANGEPATH/buildMatrix.py $1.step$indexPrev $filename $1.step$indexCur $truncateRPKM
+                               rm $1.step$indexPrev
+                               let indexPrev=indexPrev+1
+                               let indexCur=indexCur+1
+               else
+                       echo "could not find $filename - skipping"
+                       python $ERANGEPATH/recordLog.py buildMatrix.log buildMatrix.sh "could not find $filename - skipping"
+               fi
+       done < $2      
+       mv $1.step$indexPrev $1.matrix.tab
+fi
diff --git a/docs/partition.sh b/docs/partition.sh
new file mode 100644 (file)
index 0000000..1955e99
--- /dev/null
@@ -0,0 +1,34 @@
+# an example shell script to combine multiple region calls into one partition
+#
+
+if [ -z "$1" ]; then
+    PARTNAME=comb
+else
+       PARTNAME=$1
+fi
+
+if [ -z "$2" ]; then 
+       MINSIZE=400
+else
+       MINSIZE=$2
+fi
+
+N=0
+if [ $# -lt 3 ]; then
+       echo 'usage: partition.sh name minSize datalist.file'
+       echo
+       echo 'where the datalist file is a list of region files'
+       echo
+else
+       while read line
+       do
+               if [ $N -lt 1 ]; then
+                       FILELIST=''
+               else
+                       FILELIST=$FILELIST,
+               fi
+               FILELIST=$FILELIST$line
+               let N=N+1
+       done < $3
+       python $ERANGEPATH/partition.py $N.way $FILELIST $PARTNAME$N.part -minFeature $MINSIZE -nomerge -locid -norandom
+fi
diff --git a/docs/regionCounts.sh b/docs/regionCounts.sh
new file mode 100644 (file)
index 0000000..13c60ad
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/bash
+echo 'regionCounts.sh: version 1.0'
+
+cachepages=""
+if [ $# -eq 3 ]; then
+    cachepages="-cache "$3
+fi
+
+if [ $# -lt 2 ]; then
+       echo 'usage: regionCounts.sh partitionfile datalist.file [cachevalue]'
+       echo
+       echo 'where the datalist file is a comma-delimited list of prefix and rds-files'
+       echo
+else
+       arguments=$1' '$2' '$cachepages
+       python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "with parameters: $arguments"
+       while read line
+       do 
+               prefix=`echo $line | cut -f 1 -d ','`
+               rds=`echo $line | cut -f 2 -d ','`
+               if [ -e $rds ]; then
+                       python $ERANGEPATH/regionCounts.py $1 $rds $prefix.partcount -force -nomerge -rpkm $cachepages
+               else
+                       echo "could not find $rds - skipping"
+                       python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "could not find $rds - skipping"
+               fi
+       done < $2      
+fi
diff --git a/docs/runRNAPairedAnalysis.sh b/docs/runRNAPairedAnalysis.sh
new file mode 100755 (executable)
index 0000000..baf7f04
--- /dev/null
@@ -0,0 +1,81 @@
+#!/bin/bash
+#
+# runRNAPairedAnalysis.sh
+# ENRAGE
+#
+# example: . ../commoncode/runRNAPairedAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db
+#       
+#          assuming that we have rds database with the prefix c2c12rna.24R and that an RNAFAR analysis has already been run. 
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+    ERANGEPATH='../commoncode'
+fi
+
+echo 'runRNAPairedAnalysis.sh: version 3.7'
+
+models=""
+if [ $# -eq 4 ]; then
+    models=" -models "$4
+fi
+
+replacemodels=""
+if [ $# -eq 5 ]; then
+    replacemodels=" -models $4 -replacemodels "
+fi
+
+if [ -z "$1" ]
+then
+    echo
+    echo 'usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [-replacemodels]'
+    echo
+    echo 'where rdsprefix is the name of the rds file without the .rds extension'
+    echo 'use "none" for the repeatmaskdb if you do not have one'
+    echo
+else
+
+# log the parameters
+arguments=$1' '$2' '$3' '$models' '$5
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py rna.log runRNAPairedAnalysis.sh "with parameters: $arguments"
+
+# count the unique reads falling on the gene models ; the nomatch files are 
+# mappable reads that fell outside of the Cistematic gene models and not the 
+# unmappable of Eland (i.e, the "NM" reads)
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels
+
+# recount the unique reads with weights calculated during the first pass
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels
+
+# count splice reads
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -markGID -cache 1 $models $replacemodels
+
+# find new regions outside of gene models with reads piled up 
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# filter out new regions that overlap repeats more than a certain fraction
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.checked -startField 1 -log rna.log -cache 1
+
+# calculate the read densities
+python $ERANGEPATH/regionCounts.py $2.newregions.checked $2.rds $2.newregions.good -markRDS -cache -log rna.log
+
+# map all candidate regions that have paired ends overlapping with known genes
+python $ERANGEPATH/rnafarPairs.py $1 $2.newregions.good $2.rds $2.candidates.txt -cache $models $replacemodels
+
+# calculate expanded exonic read density
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels
+
+# weigh multi-reads
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels
+
+# calculate final exonic read density
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache 
+
+fi
diff --git a/docs/runSNPAnalysis.sh b/docs/runSNPAnalysis.sh
new file mode 100755 (executable)
index 0000000..0e4ff92
--- /dev/null
@@ -0,0 +1,62 @@
+#!/bin/bash
+#
+# runSNPAnalysis.sh
+#
+# Usage: $ERANGEPATH/runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
+# Example: $ERANGEPATH/runSNPAnalysis.sh mouse /woldlab/trog/sdc/alim/24T4spike_10212/24T4spike.rds 24Tspike /woldlab/trog/data1/wlee/db/rmask.db /woldlab/trog/data1/wlee/db/dbSNP128.db 5 0.75 ~/proj/c2c12rna24R/c2c12rna.24R.final.rpkm 5000000 
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+    ERANGEPATH='../commoncode'
+fi
+
+echo 'runSNPAnalysis.sh: version 3.1'
+
+cachepages=""
+if [ $# -eq 9 ]; then
+    cachepages="-cache "$9
+fi
+
+nosplices=""
+if [ $# -eq 10 ]; then
+    nosplices=" -nosplices "
+fi
+
+if [ $# -lt 8 ]; then
+    echo 'runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]'
+    echo 'where for each position S:'
+    echo '     uniqStartMin = # independent reads supporting base change at S'
+    echo '     totalRatio = total # reads supporting base change at S / total # reads that pass through S'
+else
+# log the parameters
+arguments=$1' '$2' '$3' '$4' '$5' '$6' '$7' '$8' '$cachepages$nosplices
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py snp.log runSNPAnalysis.sh "with parameters: $arguments"
+
+# get all SNPs by extracting it from the RDS
+python $ERANGEPATH/getSNPs.py $2 $6 $7 $3.snps.txt -enforceChr $cachepages $nosplices
+
+# get SNPs in non-repeat regions only
+python $ERANGEPATH/chkSNPrmask.py $4 $3.snps.txt $3.nr_snps.txt $cachepages
+
+# Check to see if SNPs are found in dbSNP
+# if dbSNP128.db is not built yet, build it by running buildsnpdb.py - build snp database using the dbSNP database file downloaded from UCSC
+# usage: python2.5 buildsnpdb.py snpdbdir snpdbname
+# the database flat file must be in the snpdbdir directory
+# To build dbSNP database file, run the following command 
+# python2.5 buildsnpdb.py snp128.txt dbSNP128
+
+# get dbSNP info for SNPs that are found in the dbSNP database
+python $ERANGEPATH/chksnp.py $5 $3.nr_snps.txt $3.nr_dbsnp.txt $cachepages
+
+# get gene info for the snps found in dbSNP
+python $ERANGEPATH/getSNPGeneInfo.py $1 $3.nr_dbsnp.txt $8 $3.nr_dbsnp_geneinfo.txt $cachepages
+
+# get gene info for snps that are not found in dbSNP
+python $ERANGEPATH/getNovelSNPs.py $1 $3.nr_dbsnp_geneinfo.txt $3.nr.final.txt 
+
+# make bed file for displaying the snps on UCSC genome browser
+python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed
+fi
\ No newline at end of file
diff --git a/docs/runStandardAnalysis.sh b/docs/runStandardAnalysis.sh
new file mode 100755 (executable)
index 0000000..6d83297
--- /dev/null
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# runStandardAnalysis.sh
+# ENRAGE
+#
+# example: . $ERANGEPATH/runStandardAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db 20000
+#       
+#          assuming that we have rds database with the prefix c2c12rna.24R and that an RNAFAR analysis has already been run. 
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+    ERANGEPATH='../commoncode'
+fi
+
+echo 'runStandardAnalysis.sh: version 4.2'
+
+models=""
+if [ $# -eq 5 ]; then
+    models=" -models "$5
+fi
+
+replacemodels=""
+if [ $# -eq 6 ]; then
+    replacemodels=" -models $5 -replacemodels "
+fi
+
+if [ -z "$1" ]
+then
+    echo
+    echo 'usage:runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [-replacemodels]'
+    echo
+    echo 'where rdsprefix is the name of the rds file without the .rds extension'
+    echo 'use "none" for the repeatmaskdb if you do not have one'
+    echo
+else
+
+# log the parameters
+arguments=$1' '$2' '$3' '$4' '$models' '$6
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py rna.log runStandardAnalysis.sh "with parameters: $arguments"
+
+# count the unique reads falling on the gene models ; the nomatch files are 
+# mappable reads that fell outside of the Cistematic gene models and not the 
+# unmappable of Eland (i.e, the "NM" reads)
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache  $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache  $models $replacemodels
+
+# recount the unique reads with weights calculated during the first pass
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1  $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1  $models $replacemodels
+
+# count splice reads
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1  $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1  $models $replacemodels
+
+# Alternative 1: find new regions outside of gene models with reads piled up 
+echo "python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1"
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+
+# Alternative 1: filter out new regions that overlap repeats more than a certain fraction
+echo "python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -log rna.log -startField 1 -cache 1"
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -log rna.log -startField 1 -cache 1
+
+# map all candidate regions that are within a given radius of a gene in bp
+echo "python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache  $models $replacemodels"
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache  $models $replacemodels
+
+# make sure candidates.txt file exists
+echo "touch $2.candidates.txt"
+touch $2.candidates.txt
+
+# calculate expanded exonic read density
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache  $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache  $models $replacemodels
+
+# weigh multi-reads
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1  $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1  $models $replacemodels
+
+# calculate final exonic read density
+echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache"
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+
+fi
\ No newline at end of file
diff --git a/docs/runStrandedAnalysis.sh b/docs/runStrandedAnalysis.sh
new file mode 100755 (executable)
index 0000000..2626ed0
--- /dev/null
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# runStrandedAnalysis.sh
+# ENRAGE
+#
+# example: . ../commoncode/runStrandedAnalysis.sh mouse c2c12rna ../mm9repeats/rmask.db 20000
+#       
+#          assuming that we have rds database with the prefix c2c12rna.24R. 
+
+# set ERANGEPATH to the absolute or relative path to ERANGE, if it's not in the environment
+
+if [ -z "$ERANGEPATH" ]
+then
+    ERANGEPATH='../commoncode'
+fi
+
+echo 'runStrandedAnalysis.sh: version 4.1'
+
+if [ -z "$1" ]
+then
+    echo
+    echo 'usage:runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius'
+    echo
+    echo 'where rdsprefix is the name of the rds file without the .rds extension'
+    echo 'use "none" for the repeatmaskdb if you do not have one'
+    echo
+else
+
+# log the parameters
+arguments=$1' '$2' '$3' '$4
+echo 'running with settings: ' $arguments
+python $ERANGEPATH/recordLog.py rna.log runStrandedAnalysis.sh "with parameters: $arguments"
+
+# count the unique reads falling on the gene models ; the nomatch files are 
+# mappable reads that fell outside of the Cistematic gene models and not the 
+# unmappable of Eland (i.e, the "NM" reads)
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -stranded -markGID -cache 1
+
+# calculate a first-pass RPKM to re-weigh the unique reads,
+# using 'none' for the splice count
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache
+
+# recount the unique reads with weights calculated during the first pass
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -stranded -uniq -cache 1
+
+# count splice reads
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -stranded -splices -noUniqs -cache 1
+
+# find new regions outside of gene models with reads piled up 
+python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter plus -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter minus -log rna.log -cache 1 -append
+
+# filter out new regions that overlap repeats more than a certain fraction
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -log rna.log -cache 1
+
+# Alternative 2: use a precomputed list of "new" regions (outside of gene models)
+#python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
+#python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good 
+
+# map all candidate regions that are within a given radius of a gene in bp
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -stranded -cache
+
+# calculate expanded exonic read density
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache
+
+# weigh multi-reads
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -stranded -multi -cache 1
+
+# calculate final exonic read density
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+
+fi
diff --git a/farPairs.py b/farPairs.py
new file mode 100644 (file)
index 0000000..73dd3ca
--- /dev/null
@@ -0,0 +1,162 @@
+#
+#  farPairs.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 7/13/10.
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, time
+import optparse
+from commoncode import readDataset
+
+print "%prog: version 1.3"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--minDist", type="int", dest="minDist")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--minCount", type="int", dest="minCount")
+    parser.add_option("--label", dest="label")
+    parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None,
+                        minDist=1000, maxDist=500000, minCount=2, label=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        print "\tThis script is both slow and takes up a large amount of RAM"
+        sys.exit(1)
+
+    rdsfile = args[0]
+    outfilename = args[1]
+    outbedname = args[2]
+
+    farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+             options.cachePages, options.minDist, options.maxDist, options.minCount,
+             options.label)
+
+
+def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+             cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
+
+    doCache = False
+    if cachePages is not None:
+        doCache = True
+    else:
+        cachePages = 0
+
+    if label is None:
+        label = rdsfile
+
+    RDS = readDataset(rdsfile, verbose=True, cache=doCache)
+    rdsChromList = RDS.getChromosomes()
+
+    if doVerbose:
+        print time.ctime()
+
+    total = 0
+    outfile = open(outfilename, "w")
+    outbed = open(outbedname, "w")
+    outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+
+    readlen = RDS.getReadSize()
+    flagDict = {}
+    for chromosome in rdsChromList:
+        if doNotProcessChromosome(chromosome):
+            continue
+
+        print chromosome
+        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+        if doVerbose:
+            print len(uniqDict), time.ctime()
+
+        for readID in uniqDict:
+            readList = uniqDict[readID]
+            if len(readList) == 2:
+                total += 1
+                (start1, flag1, pair1) = readList[0]
+                (start2, flag2, pair2) = readList[1]
+
+                if flag1 != flag2:
+                    dist = abs(start1 - start2)
+                    startList = [start1, start2]
+                    stopList = [start1 + readlen, start2 + readlen]
+                    startList.sort()
+                    stopList.sort()
+                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
+                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
+                        outbed.write(outputLine)
+                        if doVerbose:
+                            print flag1, flag2, dist
+
+                        try:
+                            flagDict[flag1].append((flag2, start1, start2))
+                        except KeyError:
+                            flagDict[flag1] = [(flag2, start1, start2)]
+
+                        try:
+                            flagDict[flag2].append((flag1, start1, start2))
+                        except KeyError:
+                            flagDict[flag2] = [(flag1, start1, start2)]
+
+    print "%d connected regions" % len(flagDict)
+
+    for region in flagDict:
+        flagDict[region].sort()
+        regionConnections = {}
+        for (region2, start1, start2) in flagDict[region]:
+            try:
+                regionConnections[region2] += 1
+            except KeyError:
+                regionConnections[region2] = 1
+
+        for region2 in regionConnections:
+            if regionConnections[region2] >= minCount:
+                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
+                if doVerbose:
+                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+
+    outfile.close()
+    outbed.close()
+    if doVerbose:
+        print "finished: ", time.ctime()
+
+
+def doNotProcessChromosome(chrom):
+    return chrom == "chrM"
+
+
+def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
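+    # assemble one BED12-style line covering the given blocks: chrom, start, end,
+    # name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts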
+    readSizes = "%d" % (stopList[0] - startList[0])
+    readCoords = "0"
+    leftStart = startList[0] - 1
+    rightStop = stopList[-1]
+    for index in range(1, numPieces):
+        readSizes += ",%d" % (stopList[index] - startList[index] + 1)
+        readCoords += ",%d" % (startList[index] - startList[0])
+
+    if rsense == "+":
+        senseCode = plusSense
+    else:
+        senseCode = minusSense
+
+    outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+    return outline
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/featureIntersects.py b/featureIntersects.py
new file mode 100755 (executable)
index 0000000..e0b7726
--- /dev/null
@@ -0,0 +1,62 @@
+#
+#  featureIntersects.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from cistematic.core import featuresIntersecting
+
+print "%prog: version 1.0"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog tabfile [--cistype type] [--radius radius]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cistype", dest="cistype")
+    parser.add_option("--radius", type="int", dest="radius")
+    parser.set_defaults(cistype="TFBSCONSSITES", radius=100)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 1:
+        print usage
+        sys.exit(1)
+
+    tabfile = args[0]
+
+    featureIntersects(tabfile, options.cistype, options.radius)
+
+
+def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
+    tabfile = open(tabFileName)
+    previous = ""
+
+    posList = []
+    for line in tabfile:
+        fields = line.split("\t")
+        current = fields[0]
+        if previous == current:
+            continue
+
+        previous = current
+        chrom = fields[1][3:]
+        posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
+
+    feats = featuresIntersecting("human", posList, radius, cistype)
+    featkeys = feats.keys()
+    featkeys.sort()
+    for (chrom, pos) in featkeys:
+        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/findMotifs.py b/findMotifs.py
new file mode 100755 (executable)
index 0000000..e79401b
--- /dev/null
@@ -0,0 +1,112 @@
+#
+#  findMotifs.py
+#  ENRAGE
+#
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, os, optparse
+from cistematic.experiments.fasta import Fasta
+from cistematic.programs.meme import Meme
+from cistematic.programs.cisGreedy import CisGreedy
+#TODO: cisSampler is not supported yet!
+#from cistematic.programs.cisSampler import CisSampler
+
+print "%prog: version 3.4"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog explabel regions.fsa [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--meme", action="store_true", dest="doMeme")
+    parser.add_option("--cisGreedy", action="store_true", dest="doCisGreedy")
+    parser.add_option("--logo", action="store_true", dest="saveLogo")
+    parser.add_option("--threshold", type="float", dest="threshold")
+    parser.add_option("--prefix", dest="motifPrefix")
+    parser.add_option("--numMotifs", dest="numMotifs")
+    parser.add_option("--maxWidth", type="int", dest="maxWidth")
+    parser.add_option("--maskLower", action="store_true", dest="maskLower")
+    parser.set_defaults(doMeme=False, doCisGreedy=False, saveLogo=False,
+                        threshold=75., numMotifs="10", maxWidth=28, maskLower=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        print "\n\twhere at least one of the motif finders (meme or cisGreedy) must be specified\n"
+        sys.exit(1)
+
+    expbase = args[0]
+    fsafile = args[1]
+
+    doCisSampler = False
+    if "--cisSampler" in sys.argv:
+        print "cisSampler is not supported yet! avoid using it for now"
+        doCisSampler = True
+
+    findMotifs(expbase, fsafile, options.doMeme, options.doCisGreedy, options.saveLogo,
+               options.threshold, options.numMotifs, options.maxWidth, options.maskLower,
+               doCisSampler)
+
+
+def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75.,
+               numMotifs="10", maxWidth=28, maskLower=False, doCisSampler=False):
+
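+    # Build a cistematic Fasta experiment for the regions in fsafile, attach the
+    # requested motif finders (Meme with the zoops model and/or cisGreedy), map and
+    # export the discovered motifs (and logos, if requested) at the given threshold,
+    # then remove the temporary experiment database.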
+    motifPrefix = expbase
+
+    #TODO: cisSampler is not supported yet!
+    #if doMeme or doCisGreedy or doCisSampler:
+    if not (doMeme or doCisGreedy):
+        print "error: must specify at least one motif finder - exiting"
+        sys.exit(1)
+
+    exp = Fasta(expbase, "%s.db" % expbase)
+
+    exp.initialize()
+    if maskLower:
+        exp.setMaskLowerCase(True)
+
+    if doMeme:
+        prog4 = Meme()
+        prog4.setMaxWidth(maxWidth)
+        prog4.setNumMotifs(numMotifs)
+        prog4.setModel("zoops")
+        exp.appendProgram(prog4)
+
+    if doCisGreedy:
+        prog5 = CisGreedy()
+        prog5.setGenExpOptions([])
+        prog5.setMaxWidth(maxWidth)
+        prog5.setNumMotifs(numMotifs)
+        exp.appendProgram(prog5)
+
+    #TODO: cisSampler is not supported yet!
+    #if doCisSampler:
+    #    prog6 = CisSampler()
+    #    prog6.setGenExpOptions([])
+    #    prog6.setMaxWidth(maxWidth)
+    #    prog6.setNumMotifs(numMotifs)
+    #    exp.appendProgram(prog6)
+
+    exp.run(fsafile)
+    exp.createAnalysis()
+    exp.loadAnalysis()
+    exp.mapMotifs(threshold, verbose=False)
+    exp.exportMotifs(prefix = motifPrefix)
+    if saveLogo:
+        exp.exportLogos(prefix = motifPrefix)
+
+    exp.draw("%s.png" % expbase, maxOccurences=4000)
+    print "deleting database..."
+    del exp
+    os.remove("%s.db" % expbase)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/findall.py b/findall.py
new file mode 100755 (executable)
index 0000000..10f007b
--- /dev/null
@@ -0,0 +1,699 @@
+"""
+    usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
+           [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+           [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
+           [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
+           [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
+           [--revbackground] [--pvalue self|back|none] [--nodirectionality]
+           [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
+           [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
+
+           where values in brackets are optional and label is an arbitrary string.
+
+           Use --ratio (default 4 fold) to set the minimum fold enrichment
+           over the control, --minimum (default 4) to set the minimum number of
+           reads (RPM) within the region, and --spacing (default readlen) to set
+           the maximum distance between reads in the region. --listPeak lists the
+           peak of the region. Peaks must be higher than --minPeak (default 0.5 RPM).
+           P-values are calculated from the sample (change with --pvalue),
+           unless the --revbackground flag and a control RDS file are provided.
+
+           By default, all numbers and parameters are on a reads per
+           million (RPM) basis. --raw will treat all settings, ratios and reported
+           numbers as raw counts rather than RPM. Use --notrim to turn off region
+           trimming and --trimvalue to control trimming (default 10% of peak signal).
+
+           The peak finder uses minimal directionality information that can
+           be turned off with --nodirectionality; the fraction of + strand reads
+           required to be to the left of the peak (default 0.3) can be set with
+           --leftPlus; --minPlus and --maxPlus change the minimum and maximum
+           fraction of plus reads in a region (defaults 0.25 and 0.75, respectively).
+
+           Use --shift to shift reads by a fixed number of bp, typically half the
+           expected fragment length (default 0 bp), or '--shift learn' to learn the
+           shift from the first chromosome. Alternatively, use --autoshift to
+           calculate a per-region shift value, which can be reported with
+           --reportshift. --strandfilter should only be used when explicitly
+           calling unshifted stranded peaks from non-ChIP-seq data such as
+           directional RNA-seq. regionoutfile is overwritten by default unless
+           the --append flag is given.
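+
+           Example invocation (hypothetical file names):
+           python $ERANGEPATH/findall.py NRSF nrsf.rds nrsf.regions.txt --control input.rds --shift learn --listPeak --revbackground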
+"""
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+import math
+import string
+import optparse
+from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
+
+
+versionString = "%s: version 3.2" % sys.argv[0]
+print versionString
+
+def usage():
+    print __doc__
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = __doc__
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--control", dest="mockfile")
+    parser.add_option("--minimum", type="float", dest="minHits")
+    parser.add_option("--ratio", type="float", dest="minRatio")
+    parser.add_option("--spacing", type="int", dest="maxSpacing")
+    parser.add_option("--listPeak", action="store_true", dest="listPeak")
+    parser.add_option("--shift", dest="shift")
+    parser.add_option("--learnFold", type="float", dest="stringency")
+    parser.add_option("--noshift", action="store_true", dest="noShift")
+    parser.add_option("--autoshift", action="store_true", dest="autoshift")
+    parser.add_option("--reportshift", action="store_true", dest="reportshift")
+    parser.add_option("--nomulti", action="store_true", dest="noMulti")
+    parser.add_option("--minPlus", type="float", dest="minPlusRatio")
+    parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
+    parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
+    parser.add_option("--minPeak", type="float", dest="minPeak")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
+    parser.add_option("--pvalue", dest="ptype")
+    parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
+    parser.add_option("--strandfilter", dest="strandfilter")
+    parser.add_option("--trimvalue", type="float", dest="trimValue")
+    parser.add_option("--notrim", action="store_false", dest="doTrim")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--log", dest="logfilename")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--append", action="store_true", dest="doAppend")
+    parser.add_option("--RNA", action="store_true", dest="rnaSettings")
+    parser.add_option("--combine5p", action="store_true", dest="combine5p")
+    parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
+                        stringency=4.0, noshift=False, autoshift=False, reportshift=False,
+                        minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
+                        normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
+                        trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
+                        cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
+                        strandfilter=None, combine5p=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        usage()
+        sys.exit(2)
+
+    factor = args[0]
+    hitfile = args[1]
+    outfilename = args[2]
+
+    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
+            options.stringency, options.noshift, options.autoshift, options.reportshift,
+            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
+            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
+            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
+            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+            options.strandfilter, options.combine5p)
+
+
+def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
+            stringency=4.0, noshift=False, autoshift=False, reportshift=False,
+            minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
+            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
+            trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
+            cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
+            strandfilter=None, combine5p=False):
+
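+    # Overall flow: load the sample (and optional control) RDS dataset, optionally
+    # learn a read shift from the first chromosome, call locateRegions() for each
+    # chromosome, compute Poisson-based p-values over the region weights, and write
+    # the passing regions followed by summary statistics.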
+    shiftValue = 0
+    if autoshift:
+        shiftValue = "auto"
+
+    if shift is not None:
+        try:
+            shiftValue = int(shift)
+        except ValueError:
+            if shift == "learn":
+                shiftValue = "learn"
+                print "Will try to learn shift"
+
+    if noshift:
+        shiftValue = 0
+
+    if trimValue is not None:
+        trimValue = float(trimValue) / 100.
+        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+    else:
+        trimValue = 0.1
+        trimString = "10%"
+
+    if not doTrim:
+        trimString = "none"
+
+    if doRevBackground:
+        print "Swapping IP and background to calculate FDR"
+        pValueType = "back"
+
+    doControl = False
+    if mockfile is not None:
+        doControl = True
+
+    doPvalue = True
+    if ptype is not None:
+        ptype = ptype.upper()
+        if ptype == "NONE":
+            doPvalue = False
+            pValueType = "none"
+            p = 1
+            poissonmean = 0
+        elif ptype == "SELF":
+            pValueType = "self"
+        elif ptype == "BACK":
+            if doControl and doRevBackground:
+                pValueType = "back"
+            else:
+                print "must have a control dataset and -revbackground for pValue type 'back'"
+        else:
+            print "could not use pValue type : %s" % ptype
+    else:
+        pValueType = "self"
+
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
+    if withFlag != "":
+        print "restrict to flag = %s" % withFlag
+
+    useMulti = True
+    if noMulti:
+        print "using unique reads only"
+        useMulti = False
+
+    if rnaSettings:
+        print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
+        shiftValue = 0
+        doTrim = False
+        doDirectionality = False
+
+    stranded = ""
+    if strandfilter is not None:
+        if strandfilter == "plus":
+            stranded = "+"
+            minPlusRatio = 0.9
+            maxPlusRatio = 1.0
+            print "only analyzing reads on the plus strand"
+        elif strandfilter == "minus":
+            stranded = "-"
+            minPlusRatio = 0.0
+            maxPlusRatio = 0.1
+            print "only analyzing reads on the minus strand"
+
+    stringency = max(stringency, 1.0)
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    if doControl:
+        print "\ncontrol:" 
+        mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
+
+        if cachePages > mockRDS.getDefaultCacheSize():
+            mockRDS.setDBcache(cachePages)
+
+    print "\nsample:" 
+    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    readlen = hitRDS.getReadSize()
+    if rnaSettings:
+        maxSpacing = readlen
+
+    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
+    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
+    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    hitRDSsize = len(hitRDS) / 1000000.
+    if doControl:
+        mockRDSsize = len(mockRDS) / 1000000.
+
+    if normalize:
+        if doControl:
+            mockSampleSize = mockRDSsize
+
+        hitSampleSize = hitRDSsize
+
+    if doAppend:
+        outfile = open(outfilename, "a")
+    else:
+        outfile = open(outfilename, "w")
+
+    outfile.write("#ERANGE %s\n" % versionString)
+    if doControl:
+        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
+    else:
+        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
+
+    if withFlag != "":
+        outfile.write("#restrict to Flag = %s\n" % withFlag)
+
+    outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
+    outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
+    outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
+    if normalize:
+        print "Normalizing to RPM"
+        countLabel = "RPM"
+    else:
+        countLabel = "COUNT"
+
+    headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
+    if doDirectionality:
+        headerList.append("plus%\tleftPlus%")
+
+    if listPeak:
+        headerList.append("peakPos\tpeakHeight")
+
+    if reportshift:
+        headerList.append("readShift")
+
+    if doPvalue:
+        headerList.append("pValue")
+
+    headline = string.join(headerList, "\t")
+    print >> outfile, headline
+
+    statistics = {"index": 0,
+                  "total": 0,
+                  "mIndex": 0,
+                  "mTotal": 0,
+                  "failed": 0
+    }
+
+    if minRatio < minPeak:
+        minPeak = minRatio
+
+    hitChromList = hitRDS.getChromosomes()
+    if doControl:
+        mockChromList = mockRDS.getChromosomes()
+
+    hitChromList.sort()
+
+    for chromosome in hitChromList:
+        if doNotProcessChromosome(chromosome, doControl, mockChromList):
+            continue
+
+        print "chromosome %s" % (chromosome)
+        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
+        maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+        if shiftValue == "learn":
+            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
+                                    stringency, readlen, minHits, logfilename, outfile, outfilename)
+
+        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
+                                                              normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                                                              shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
+                                                              noMulti, doControl, factor, trimValue, outputRegionList=True)
+
+        statistics["index"] += regionStats["index"]
+        statistics["total"] += regionStats["total"]
+        statistics["failed"] += regionStats["failed"]
+        if not doRevBackground:
+            if doPvalue:
+                p, poissonmean = calculatePValue(allRegionWeights)
+
+            print headline
+            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+            continue
+
+        #now do background swapping the two samples around
+        print "calculating background..."
+        backgroundTrimValue = 1/20.
+        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
+                                                              normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                                                              shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
+                                                              noMulti, doControl, factor, backgroundTrimValue)
+
+        statistics["mIndex"] += backgroundRegionStats["index"]
+        statistics["mTotal"] += backgroundRegionStats["total"]
+        statistics["failed"] += backgroundRegionStats["failed"]
+        print statistics["mIndex"], statistics["mTotal"]
+        if doPvalue:
+            if pValueType == "self":
+                p, poissonmean = calculatePValue(allRegionWeights)
+            else:
+                p, poissonmean = calculatePValue(backgroundRegionWeights)
+
+        print headline
+        shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+
+    footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
+    print footer
+    outfile.write(footer)
+    outfile.close()
+
+    writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
+
+
+def doNotProcessChromosome(chromosome, doControl, mockChromList):
+    skipChromosome = False
+    if chromosome == "chrM":
+        skipChromosome = True
+
+    if doControl and (chromosome not in mockChromList):
+        skipChromosome = True
+
+    return skipChromosome
+
+
+def calculatePValue(dataList):
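+    # Fit a Poisson to the region weights: poissonmean is the mean weight and the
+    # returned p is exp(-poissonmean), i.e. P(X=0), which writeRegionsToFile() then
+    # grows into the probability of each region's observed count.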
+    dataList.sort()
+    listSize = float(len(dataList))
+    try:
+        poissonmean = sum(dataList) / listSize
+    except ZeroDivisionError:
+        poissonmean = 0
+
+    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
+    p = math.exp(-poissonmean)
+
+    return p, poissonmean
+
+
+def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
+               stringency, readlen, minHits, logfilename, outfile, outfilename):
+
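+    # Scan the chromosome for candidate regions using stringency-scaled versions of
+    # the minHits/minRatio/length thresholds, compute a per-region shift with
+    # getBestShiftForRegion(), and return the most common shift; if fewer than 30
+    # qualifying regions are found, fall back to a shift of 0.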
+    print "learning shift.... will need at least 30 training sites"
+    previousHit = -1 * maxSpacing
+    hitList = [-1]
+    weightList = [0]
+    readList = []
+    shiftDict = {}
+    count = 0
+    numStarts = 0
+    for (pos, sense, weight) in hitDict[chrom]:
+        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
+            sumAll = sum(weightList)
+            if normalize:
+                sumAll /= hitSampleSize
+
+            regionStart = hitList[0]
+            regionStop = hitList[-1]
+            regionLength = regionStop - regionStart
+            # we're going to require stringent settings
+            if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
+                foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
+
+                if foldRatio >= minRatio:
+                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+                    try:
+                        shiftDict[localshift] += 1
+                    except KeyError:
+                        shiftDict[localshift] = 1
+
+                    count += 1
+
+            hitList = []
+            weightList = []
+            readList = []
+            numStarts = 0
+
+        if pos not in hitList:
+            numStarts += 1
+
+        hitList.append(pos)
+        weightList.append(weight)
+        readList.append((pos, sense, weight))
+        previousHit = pos
+
+    bestShift = 0
+    bestCount = 0
+    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
+    print outline
+    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+    if count < 30:
+        outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
+        print outline
+        writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+        shiftValue = 0
+    else:
+        for shift in sorted(shiftDict):
+            if shiftDict[shift] > bestCount:
+                bestShift = shift
+                bestCount = shiftDict[shift]
+
+        shiftValue = bestShift
+        print shiftDict
+
+    outline = "#picked shiftValue to be %d" % shiftValue
+    print outline
+    print >> outfile, outline
+    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+
+    return shiftValue
+
+
+def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
+    if doControl:
+        foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
+    else:
+        foldRatio = minRatio
+
+    return foldRatio
+
+
+def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
+    numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+    if normalize:
+        numMock /= sampleSize
+
+    foldRatio = sumAll / numMock
+
+    return foldRatio
+
+
+def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
+                normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
+                noMulti, doControl, factor, trimValue, outputRegionList=False):
+
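+    # Group reads separated by at most maxSpacing into candidate regions; a region
+    # needs at least minHits reads, more than minRatio distinct start positions, a
+    # span longer than readlen and at least minRatio fold enrichment over the
+    # reference. The peak is located, the region optionally trimmed to trimValue of
+    # the peak height, the thresholds re-checked, and the plus-strand and
+    # directionality filters applied before the region is recorded.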
+    index = 0
+    total = 0
+    failedCounter = 0
+    previousHit = - 1 * maxSpacing
+    currentHitList = [-1]
+    currentWeightList = [0]
+    currentReadList = []
+    regionWeights = []
+    outregions = []
+    numStarts = 0
+    hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
+    maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
+    for (pos, sense, weight) in hitDict[chrom]:
+        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
+            sumAll = sum(currentWeightList)
+            if normalize:
+                sumAll /= rdsSampleSize
+
+            regionStart = currentHitList[0]
+            regionStop = currentHitList[-1]
+            regionWeights.append(int(sumAll))
+            if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
+                sumMulti = 0.
+                #first pass uses getFoldRatio on mockRDS as there may not be control
+                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+                if foldRatio >= minRatio:
+                    # first pass, with absolute numbers
+                    if doDirectionality:
+                        (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
+                    else:
+                        (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
+
+                    bestPos = topPos[0]
+                    peakScore = smoothArray[bestPos]
+                    if normalize:
+                        peakScore /= rdsSampleSize
+
+                    if doTrim:
+                        minSignalThresh = trimValue * peakScore
+                        start = 0
+                        stop = regionStop - regionStart - 1
+                        startFound = False
+                        while not startFound:
+                            if smoothArray[start] >= minSignalThresh or start == bestPos:
+                                startFound = True
+                            else:
+                                start += 1
+
+                        stopFound = False
+                        while not stopFound:
+                            if smoothArray[stop] >= minSignalThresh or stop == bestPos:
+                                stopFound = True
+                            else:
+                                stop -= 1
+
+                        regionStop = regionStart + stop
+                        regionStart += start
+                        try:
+                            if doDirectionality:
+                                (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
+                            else:
+                                (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
+                        except:
+                            continue
+
+                        if normalize:
+                            sumAll /= rdsSampleSize
+
+                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+                        if outputRegionList:
+                            sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
+                        # just in case it changed, use latest data
+                        try:
+                            bestPos = topPos[0]
+                            peakScore = smoothArray[bestPos]
+                        except:
+                            continue
+
+                        # normalize to RPM
+                        if normalize:
+                            peakScore /= rdsSampleSize
+
+                    elif outputRegionList:
+                        sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
+
+                    if outputRegionList:
+                        # normalize to RPM
+                        if normalize:
+                            sumMulti /= rdsSampleSize
+
+                        try:
+                            multiP = 100. * (sumMulti / sumAll)
+                        except:
+                            break
+
+                        if noMulti:
+                            multiP = 0.
+
+                    # check that we still pass threshold
+                    if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
+                        plusRatio = float(numPlus)/numHits
+                        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
+                            if outputRegionList:
+                                peak = ""
+                                if listPeak:
+                                    peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
+
+                            if doDirectionality:
+                                if leftPlusRatio < numLeft / numPlus:
+                                    index += 1
+                                    if outputRegionList:
+                                        plusP = plusRatio * 100.
+                                        leftP = 100. * numLeft / numPlus
+                                        # we have a region that passes all criteria
+                                        outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
+
+                                    total += sumAll
+                                else:
+                                    failedCounter += 1
+                            else:
+                                # we have a region, but didn't check for directionality
+                                index += 1
+                                total += sumAll
+                                if outputRegionList:
+                                    outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
+
+            currentHitList = []
+            currentWeightList = []
+            currentReadList = []
+            numStarts = 0
+
+        if pos not in currentHitList:
+            numStarts += 1
+
+        currentHitList.append(pos)
+        currentWeightList.append(weight)
+        currentReadList.append((pos, sense, weight))
+        previousHit = pos
+
+    statistics = {"index": index,
+                  "total": total,
+                  "failed": failedCounter
+    }
+
+    if outputRegionList:
+        return statistics, regionWeights, outregions
+    else:
+        return statistics, regionWeights
+
+
+def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
+    bestShift = 0
+    shiftDict = {}
+    for region in outregions:
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
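+        # each pass multiplies the running pValue by poissonmean/(i+1), folding in the
+        # next factor of the Poisson term lambda**k * exp(-lambda) / k! for the
+        # region's sumAll reads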
+        if doPvalue:
+            sumAll = int(region[5])
+            for i in xrange(sumAll):
+                pValue *= poissonmean
+                pValue /= i+1
+
+        if shiftValue == "auto" and reportshift:
+            try:
+                shiftDict[region[-1]] += 1
+            except KeyError:
+                shiftDict[region[-1]] = 1
+
+        try:
+            if reportshift:
+                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
+            else:
+                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
+        except:
+            if reportshift:
+                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
+            else:
+                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
+
+        if doPvalue:
+            outputList.append("%1.2g" % pValue)
+
+        outline = string.join(outputList, "\t")
+        print outline
+        print >> outfile, outline
+
+    if shiftValue == "auto" and reportshift:
+        bestCount = 0
+        for shift in sorted(shiftDict):
+            if shiftDict[shift] > bestCount:
+                bestShift = shift
+                bestCount = shiftDict[shift]
+
+    return bestShift
+
+
+def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
+    footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
+    if doDirectionality:
+        footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
+
+    if doRevBackground:
+        try:
+            percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
+        except (ValueError, ZeroDivisionError):
+            percent = 0.
+
+        footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
+
+    if shiftValue == "auto" and reportshift:
+        footerList.append("#mode of shift values: %d" % shiftModeValue)
+
+    footer = string.join(footerList, "\n")
+
+    return footer
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/fraction.py b/fraction.py
new file mode 100755 (executable)
index 0000000..f955fce
--- /dev/null
@@ -0,0 +1,51 @@
+#
+#  fraction.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+from random import random
+import sys
+
+print "%s: version 1.0" % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(sys.argv) < 4:
+        print "usage: python %s fraction infile outfile" % sys.argv[0]
+        sys.exit(1)
+
+    fraction = float(sys.argv[1])
+    infile = sys.argv[2]
+    outfile = argv[3]
+
+    doFraction(fraction, infile, outfile)
+
+
+def doFraction(fraction, inFileName, outFileName):
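+    # Keep each input line with probability 'fraction' (simple Bernoulli sampling)
+    # and report the realized fraction of lines written.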
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+
+    totalIndex = 0
+    fractionIndex = 0
+    for line in infile:
+        totalIndex += 1
+        if random() <= fraction:
+            outfile.write(line)
+            fractionIndex += 1
+
+    infile.close()
+    outfile.close()
+
+    print "%d / %d = %.2f" % (fractionIndex, totalIndex, float(fractionIndex) / totalIndex)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneDownstreamBins.py b/geneDownstreamBins.py
new file mode 100755 (executable)
index 0000000..058ad82
--- /dev/null
@@ -0,0 +1,149 @@
+#
+#  geneDownstreamBins.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+# originally from version 1.3 of geneDnaDownstreamCounts.py
+import sys, optparse
+from commoncode import readDataset
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 2.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--max", type="int", dest="standardMinDist",
+                      help="maximum region in bp")
+    parser.set_defaults(standardMinDist=3000)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
+
+
+def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
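+    # Count reads in the standardMinDist bp immediately downstream of each gene
+    # (taking strand into account), split into 10 equal-width bins; genes whose
+    # downstream window is truncated by a neighboring gene are skipped.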
+    bins = 10
+    standardMinThresh = standardMinDist / bins
+
+    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    normalizationFactor = 1.0
+    if normalize:
+        hitDictSize = len(hitRDS)
+        normalizationFactor = hitDictSize / 1000000.
+
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+    hg = Genome(genome)
+    idb = geneinfoDB(cache=True)
+
+    geneinfoDict = idb.getallGeneInfo(genome)
+    featuresDict = hg.getallGeneFeatures()
+
+    outfile = open(outfilename, "w")
+
+    gidList = hg.allGIDs()
+    gidList.sort()
+    for gid in gidList:
+        symbol = "LOC" + gid
+        geneinfo = ""
+        featureList = []
+        try:
+            geneinfo = geneinfoDict[gid]
+            featureList = featuresDict[gid]
+            symbol = geneinfo[0][0]
+        except:
+            print gid
+
+        if len(featureList) == 0:
+            continue
+
+        newfeatureList = []
+        for (ftype, chrom, start, stop, fsense) in featureList:
+            if (start, stop) not in newfeatureList:
+                newfeatureList.append((start, stop))
+
+        if chrom not in hitDict:
+            continue
+
+        newfeatureList.sort()
+        if len(newfeatureList) < 1:
+            continue
+
+        glen = standardMinDist
+        if fsense == "F":
+            nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
+            if nextGene < glen * 2:
+                glen = nextGene / 2
+
+            if glen < 1:
+                glen = 1
+
+            gstart = newfeatureList[-1][1]
+        else:
+            nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
+            if nextGene < glen * 2:
+                glen = nextGene / 2
+
+            if glen < 1:
+                glen = 1
+
+            gstart = newfeatureList[0][0] - glen
+            if gstart < 0:
+                gstart = 0
+
+        tagCount = 0
+        if glen < standardMinDist:
+            continue
+
+        binList = [0.] * bins
+        for (tagStart, sense, weight) in hitDict[chrom]:
+            tagStart -= gstart
+            if tagStart >= glen:
+                break
+
+            if tagStart > 0:
+                tagCount += weight
+                if fsense == "F":
+                    # we are relying on python's integer division quirk
+                    binID = tagStart / standardMinThresh 
+                    binList[binID] += weight
+                else:
+                    rdist = glen - tagStart
+                    binID = rdist / standardMinThresh 
+                    binList[binID] += weight
+
+        if tagCount < 2:
+            continue
+
+        tagCount *= normalizationFactor
+        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
+        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
+        for binAmount in binList:
+            outfile.write("\t%.2f" % binAmount)
+
+        outfile.write("\n")
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneLocusBins.py b/geneLocusBins.py
new file mode 100755 (executable)
index 0000000..e6b403f
--- /dev/null
@@ -0,0 +1,136 @@
+#
+#  geneLocusBins.py
+#  ENRAGE
+#
+
+# originally from version 1.3 of geneDownstreamBins.py
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print '%s: version 2.1' % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--bins", type="int", dest="bins",
+                      help="number of bins to use [default: 10]")
+    parser.add_option("--flank", type="int", dest="flankBP",
+                      help="number of flanking BP on both upstream and downstream [default: 0]")
+    parser.add_option("--upstream", type="int", dest="upstreamBP",
+                      help="number of upstream flanking BP [default: 0]")
+    parser.add_option("--downstream", type="int", dest="downstreamBP",
+                      help="number of downstream flanking BP [default: 0]")
+    parser.add_option("--nocds", action="store_false", dest="doCDS",
+                      help="do not use CDS regions")
+    parser.add_option("--raw", action="store_false", dest="normalizeBins",
+                      help="do not normalize results")
+    parser.add_option("--force", action="store_false", dest="limitNeighbor",
+                      help="do not limit flanking regions to the neighboring gene")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--cache", action="store_true", dest="doCache",
+                      help="use cache")
+    parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+   
+    upstreamBp = 0
+    downstreamBp = 0
+    doFlank = False
+    if options.flankBP is not None:
+        upstreamBp = options.flankBP
+        downstreamBp = options.flankBP
+        doFlank = True
+
+    if options.upstreamBP is not None:
+        upstreamBp = options.upstreamBP
+        doFlank = True
+
+    if options.downstreamBP is not None:
+        downstreamBp = options.downstreamBP
+        doFlank = True
+
+    geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
+
+
+def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None):
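+    # Bin reads across each gene locus (optionally extended by upstream/downstream
+    # flanks) using computeRegionBins(); per-bin counts are written either raw or as
+    # a percentage of the locus total when normalization is on.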
+    if acceptfile is None:
+        acceptDict = {}
+    else:
+        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    readlen = hitRDS.getReadSize()
+    normalizationFactor = 1.0
+    if normalizeBins:
+        totalCount = len(hitRDS)
+        normalizationFactor = totalCount / 1000000.
+
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+    hg = Genome(genome)
+    idb = geneinfoDB(cache=doCache)
+
+    geneinfoDict = idb.getallGeneInfo(genome)
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
+
+    gidList = hg.allGIDs()
+    gidList.sort()
+    for chrom in acceptDict:
+        for (label, start, stop, length) in acceptDict[chrom]:
+            if label not in gidList:
+                gidList.append(label)
+
+    (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+
+    outfile = open(outfilename,'w')
+
+    for gid in gidList:
+        if 'FAR' not in gid:
+            symbol = 'LOC' + gid
+            geneinfo = ''
+            try:
+                geneinfo = geneinfoDict[gid]
+                symbol = geneinfo[0][0]
+            except:
+                pass
+        else:
+            symbol = gid
+        if gid in gidBins and gid in gidLen:
+            tagCount = 0.
+            for binAmount in gidBins[gid]:
+                tagCount += binAmount
+        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+        for binAmount in gidBins[gid]:
+            if normalizeBins:
+                if tagCount == 0:
+                    tagCount = 1
+                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
+            else:
+                outfile.write('\t%.1f' % binAmount)
+        outfile.write('\n')
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneLocusCounts.py b/geneLocusCounts.py
new file mode 100755 (executable)
index 0000000..0e8792b
--- /dev/null
@@ -0,0 +1,138 @@
+#
+#  geneLocusCounts.py
+#  ENRAGE
+#
+"""  usage: python geneLocusCounts genome readDB outfilename [upstream] [downstream] [--noCDS] [--spanTSS] [--locusLength bplength] [--regions acceptfile] [--noUniqs] [--multi] [--splices]
+            where upstream and downstream are in bp and are optional.
+            Using --noCDS requires either upstream or downstream (but not both)
+            to be nonzero. Using --locusLength will report the first bplength
+            or the last bplength of the gene region depending on whether it
+            is positive or negative.
+            By default only unique reads are counted (use --noUniqs to turn off),
+            but multi and splice reads can also be counted given the appropriate flags.
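+            example invocation (hypothetical genome label and file names):
+            python geneLocusCounts.py hsapiens reads.rds locuscounts.txt 5000 5000 --regions newregions.txt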
+"""
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print '%s: version 3.0' % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome readDB outfilename [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
+                      help="do not count unique reads")
+    parser.add_option("--multi", action="store_true", dest="doUniqs",
+                      help="count multi reads")
+    parser.add_option("--splices", action="store_true", dest="doUniqs",
+                      help="count splice reads")
+    parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--noCDS", action="store_false", dest="useCDS")
+    parser.add_option("--locusLength", type="int", dest="bplength",
+                      help="number of bases to report")
+    parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print __doc__
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    upstream = 0
+    downstream = 0
+    try:
+        upstream = int(args[3])
+    except ValueError:
+        pass
+    except IndexError:
+        pass
+
+    try:
+        if "-" not in args[3]:
+            downstream = int(args[4])
+    except (ValueError, IndexError):
+        pass
+
+    geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
+
+
+def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
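+    # Count the selected read classes (uniqs/multi/splices) falling in each gene
+    # locus returned by getLocusByChromDict() and report per-gene counts along with
+    # RPM and RPKM (reads per kilobase of locus per million mapped reads).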
+    print 'returning only up to %d bp from gene locus' % bplength
+    print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
+
+    acceptDict = {}
+    if acceptfile:
+        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+    hitRDS = readDataset(hitfile, verbose = True)
+
+    totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+    hg = Genome(genome)
+    idb = geneinfoDB(cache=True)
+
+    gidCount = {}
+    gidList = []
+    gidLen = {}
+    geneinfoDict = idb.getallGeneInfo(genome)
+    locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
+
+    locusChroms = locusByChromDict.keys()
+    chromList = hitRDS.getChromosomes(fullChrom=False)
+    chromList.sort()
+    for chrom in chromList:
+        if chrom == 'M' or chrom not in locusChroms:
+            continue
+
+        print 'chr' + chrom
+        fullchrom = 'chr' + chrom
+        hitRDS.memSync(fullchrom, index=True)
+        for (start, stop, gid, length) in locusByChromDict[chrom]:
+            if gid not in gidList:
+                gidList.append(gid)
+                gidCount[gid] = 0
+                gidLen[gid] = length
+
+            gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+    outfile = open(outfilename,'w')
+
+    totalCount /= 1000000.
+
+    outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
+    gidList.sort()
+    for gid in gidList:
+        if 'FAR' not in gid:
+            symbol = 'LOC' + gid
+            geneinfo = ''
+            try:
+                geneinfo = geneinfoDict[gid]
+                symbol = geneinfo[0][0]
+            except:
+                pass
+        else:
+            symbol = gid
+
+        if gid in gidCount and gid in gidLen:
+            rpm  = gidCount[gid] / totalCount
+            rpkm = 1000. * rpm / gidLen[gid]
+            outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
+
+    outfile.close()
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneLocusPeaks.py b/geneLocusPeaks.py
new file mode 100755 (executable)
index 0000000..fdfddf9
--- /dev/null
@@ -0,0 +1,117 @@
+#
+#  geneLocusPeaks.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+from commoncode import readDataset, getMergedRegions, findPeak, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+import sys, optparse
+
+print "%prog: version 2.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--up", type="int", dest="upstream")
+    parser.add_option("--down", type="int", dest="downstream")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.set_defaults(upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        print "\twhere upstream and downstream are in bp and and optional"
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
+
+
+def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
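+    # For each gene locus (optionally extended upstream/downstream), use findPeak()
+    # to locate the highest smoothed read peak and report its position and height.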
+    acceptDict = {}
+
+    if acceptfile:
+        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+    print "upstream = %d downstream = %d" % (upstream, downstream)
+
+    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    readlen = hitRDS.getReadSize()
+    normalizationFactor = 1.0
+    if normalize:
+        totalCount = len(hitRDS)
+        normalizationFactor = totalCount / 1000000.
+
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+    hg = Genome(genome)
+    idb = geneinfoDB(cache=True)
+
+    gidCount = {}
+    gidPos = {}
+    geneinfoDict = idb.getallGeneInfo(genome)
+    locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
+
+    gidList = hg.allGIDs()
+    gidList.sort()
+    for chrom in acceptDict:
+        for (label, start, stop, length) in acceptDict[chrom]:
+            if label not in gidList:
+                gidList.append(label)
+
+    for gid in gidList:
+        gidCount[gid] = 0
+
+    for chrom in hitDict:
+        if chrom not in locusByChromDict:
+            continue
+
+        print chrom
+        for (start, stop, gid, glen) in locusByChromDict[chrom]:
+            gidCount[gid] = 0.
+            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[chrom], start, glen, readlen)
+            if len(topPos) > 0:
+                gidCount[gid] = smoothArray[topPos[0]]
+                gidPos[gid] = (chrom, start + topPos[0])
+            else:
+                gidPos[gid] = (chrom, start)
+
+    outfile = open(outfilename, "w")
+
+    for gid in gidList:
+        if "FAR" not in gid:
+            symbol = "LOC" + gid
+            geneinfo = ""
+            try:
+                geneinfo = geneinfoDict[gid]
+                symbol = geneinfo[0][0]
+            except:
+                pass
+        else:
+            symbol = gid
+
+        if gid in gidCount and gid in gidPos:
+            (chrom, pos) = gidPos[gid]
+            outfile.write("%s\t%s\tchr%s\t%d\t%.2f\n" % (gid, symbol, chrom, pos, gidCount[gid]))
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneMrnaCounts.py b/geneMrnaCounts.py
new file mode 100755 (executable)
index 0000000..b905cf0
--- /dev/null
@@ -0,0 +1,198 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    print "psyco not running"
+
+import sys
+import optparse
+from commoncode import readDataset, getFeaturesByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%s: version 5.1" % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome rdsfile outfilename [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_true", dest="trackStrand")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
+    parser.add_option("--multi", action="store_true", dest="doMulti")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.add_option("--searchGID", action="store_true", dest="searchGID")
+    parser.add_option("--countfeatures", action="store_true", dest="countFeats")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--markGID", action="store_true", dest="markGID")
+    parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
+                        extendGenome="", replaceModels=False, searchGID=False,
+                        countFeats=False, cachePages=None, markGID=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    genomeName = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
+                   options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
+                   options.searchGID, options.countFeats, options.cachePages, options.markGID)
+
+
+def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+                   doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
+                   searchGID=False, countFeats=False, cachePages=None, markGID=False):
+
+    if trackStrand:
+        print "will track strandedness"
+        doStranded = "track"
+    else:
+        doStranded = "both"
+
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+    else:
+        replaceModels = False
+
+    if cachePages is not None:
+        doCache = True
+    else:
+        cachePages = 100000
+        doCache = False
+
+    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    genome = Genome(genomeName, inRAM=True)
+    if extendGenome != "":
+        genome.extendFeatures(extendGenome, replace=replaceModels)
+
+    print "getting gene features...."
+    featuresByChromDict = getFeaturesByChromDict(genome)
+
+    seenFeaturesByChromDict = {}
+    print "getting geneIDs...."
+    gidList = genome.allGIDs()
+    gidList.sort()
+    gidCount = {}
+    for gid in gidList:
+        gidCount[gid] = 0
+
+    chromList = hitRDS.getChromosomes(fullChrom=False)
+    if len(chromList) == 0 and doSplices:
+        chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
+
+    if markGID:
+        print "Flagging all reads as NM"
+        hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+    for chrom in chromList:
+        if chrom not in featuresByChromDict:
+            continue
+
+        if countFeats:
+            seenFeaturesByChromDict[chrom] = []
+
+        print "\nchr%s" % chrom
+        fullchrom = "chr%s" % chrom
+        regionList = []        
+        print "counting GIDs"
+        for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
+            try:
+                if doStranded == "track":
+                    checkSense = "+"
+                    if featureSense == "R":
+                        checkSense = "-"
+
+                    regionList.append((gid, fullchrom, start, stop, checkSense))
+                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+                else:
+                    regionList.append((gid, fullchrom, start, stop))
+                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                    if count != 0:
+                        print count
+
+                gidCount[gid] += count
+                if countFeats:
+                    if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
+                        seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
+            except:
+                print "problem with %s - skipping" % gid
+
+        if markGID:
+            print "marking GIDs"
+            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
+            print "finished marking"
+
+    print " "
+    if countFeats:
+        numFeatures = countFeatures(seenFeaturesByChromDict)
+        print "saw %d features" % numFeatures
+
+    writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
+    if markGID and doCache:
+        hitRDS.saveCacheDB(hitfile)
+
+
+def countFeatures(seenFeaturesByChromDict):
+    count = 0
+    for chrom in seenFeaturesByChromDict.keys():
+        try:
+            count += len(seenFeaturesByChromDict[chrom])
+        except TypeError:
+            pass
+
+    return count
+
+
+def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+    geneAnnotDict = genome.allAnnotInfo()
+    genomeName = genome.genome
+    outfile = open(outfilename, "w")
+    idb = geneinfoDB(cache=True)
+    geneInfoDict = idb.getallGeneInfo(genomeName)
+    for gid in gidList:
+        symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
+        if gid in gidCount:
+            outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
+        else:
+            outfile.write("%s\t%s\t0\n" % (gid, symbol))
+
+    outfile.close()
+
+
+def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
+    lookupGID = gid
+    if searchGID and gid not in geneInfoDict:
+        actualGeneID = idb.getGeneID(genomeName, gid)
+        if len(actualGeneID) > 0:
+            lookupGID = actualGeneID[1]
+
+    try:
+        geneinfo = geneInfoDict[lookupGID]
+        symbol = geneinfo[0][0]
+    except (KeyError, IndexError):
+        try:
+            symbol = geneAnnotDict[(genomeName, gid)][0]
+        except (KeyError, IndexError):
+            symbol = "LOC%s" % gid
+
+    return symbol
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py
new file mode 100755 (executable)
index 0000000..7acf0b9
--- /dev/null
@@ -0,0 +1,266 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+
+print '%s: version 4.1' % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_false", dest="ignoreSense")
+    parser.add_option("--uniq", action="store_true", dest="withUniqs")
+    parser.add_option("--multi", action="store_true", dest="withMulti")
+    parser.add_option("--record", action="store_true", dest="recording",
+                      help="ignored with uniq reads")
+    parser.add_option("--accept", dest="acceptfile")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
+                        acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
+                        replaceModels=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile =  args[1]
+    countfile = args[2]
+    outfilename = args[3]
+
+    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+                           options.withUniqs, options.withMulti, options.recording,
+                           options.acceptfile, options.cachePages, options.doVerbose,
+                           options.extendGenome, options.replaceModels)
+
+
+def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+                           withUniqs=False, withMulti=False, recording=False, acceptfile=None,
+                           cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
+
+    if (not withUniqs and not withMulti) or (withUniqs and withMulti):
+        print "must have either one of -uniq or -multi set. Exiting"
+        sys.exit(1)
+
+    if cachePages is not None:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        idb = geneinfoDB(cache=True)
+        print "%s cached" % genome
+        doCache = True
+    else:
+        doCache = False
+        cachePages = 0
+        hg = Genome(genome, inRAM=True)
+        idb = geneinfoDB()
+
+    if acceptfile is not None:
+        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+    else:
+        acceptDict = {}
+
+    if recording and withUniqs:
+        recording = False
+
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+    else:
+        replaceModels = False
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace = replaceModels)
+    
+    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    readlen = hitRDS.getReadSize()
+
+    geneinfoDict = idb.getallGeneInfo(genome)
+    geneannotDict = hg.allAnnotInfo()
+    gidCount = {}
+    gidReadDict = {}
+
+    featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
+    gidList = hg.allGIDs()
+
+    gidList.sort()
+    for chrom in acceptDict:
+        for (label, start, stop, length) in acceptDict[chrom]:
+            if label not in gidList:
+                gidList.append(label)
+
+    for gid in gidList:
+        gidCount[gid] = 0
+        gidReadDict[gid] = []
+
+    uniqueCountDict = {}
+    read2GidDict = {}
+
+    uniquecounts = open(countfile)
+    for line in uniquecounts:
+        fields = line.strip().split()
+        # add a pseudo-count here to ease calculations below
+        uniqueCountDict[fields[0]] = float(fields[-1]) + 1
+
+    uniquecounts.close()
+
+    outfile = open(outfilename, "w")
+
+    index = 0
+    if withMulti and not withUniqs:
+        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
+    else:
+        chromList = hitRDS.getChromosomes(fullChrom=False)
+
+    for achrom in chromList:
+        if achrom not in featuresByChromDict:
+            continue
+
+        print "\n" + achrom + " ",
+        startFeature = 0
+        fullchrom = "chr" + achrom
+        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
+        featList = featuresByChromDict[achrom]
+        if ignoreSense:
+            for (tagStart, tagReadID) in hitDict[fullchrom]:
+                index += 1
+                if index % 100000 == 0:
+                    print "read %d" % index,
+
+                stopPoint = tagStart + readlen
+                if startFeature < 0:
+                    startFeature = 0
+
+                for (start, stop, gid, sense, ftype) in featList[startFeature:]:
+                    if tagStart > stop:
+                        startFeature += 1
+                        continue
+
+                    if start > stopPoint:
+                        startFeature -= 100
+                        break
+
+                    if start <= tagStart <= stop:
+                        try:
+                            gidReadDict[gid].append(tagReadID)
+                            if tagReadID in read2GidDict:
+                                if gid not in read2GidDict[tagReadID]:
+                                    read2GidDict[tagReadID].append(gid)
+                            else:
+                                read2GidDict[tagReadID] = [gid]
+
+                            gidCount[gid] += 1
+                        except:
+                            print "gid %s not in gidReadDict" % gid
+
+                        stopPoint = stop
+        else:
+            for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
+                index += 1
+                if index % 100000 == 0:
+                    print "read %d" % index,
+
+                stopPoint = tagStart + readlen
+                if startFeature < 0:
+                    startFeature = 0
+
+                for (start, stop, gid, sense, ftype) in featList[startFeature:]:
+                    if tagStart > stop:
+                        startFeature += 1
+                        continue
+
+                    if start > stopPoint:
+                        startFeature -= 100
+                        break
+
+                    if sense == "R":
+                        sense = "-"
+                    else:
+                        sense = "+"
+
+                    if start <= tagStart <= stop and sense == tSense:
+                        try:
+                            gidReadDict[gid].append(tagReadID)
+                            if tagReadID in read2GidDict:
+                                if gid not in read2GidDict[tagReadID]:
+                                    read2GidDict[tagReadID].append(gid)
+                            else:
+                                read2GidDict[tagReadID] = [gid]
+
+                            gidCount[gid] += 1
+                        except:
+                            print "gid %s not in gidReadDict" % gid
+
+                        stopPoint = stop
+
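+    # Weighted counting: each read is split among the genes it hit, in proportion to each
+    # gene's unique-read count (pseudo-counted above), i.e. gene gid receives
+    # uniqueCountDict[gid] / sum(uniqueCountDict[g] for g in read2GidDict[readID]) per read.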
+    for gid in gidList:
+        if "FAR" not in gid:
+            symbol = "LOC" + gid
+            geneinfo = ""
+            try:
+                geneinfo = geneinfoDict[gid]
+                if genome == "celegans":
+                    symbol = geneinfo[0][1]
+                else:
+                    symbol = geneinfo[0][0]
+            except:
+                try:
+                    symbol = geneannotDict[(genome, gid)][0]
+                except:
+                    symbol = "LOC" + gid
+        else:
+            symbol = gid
+
+        tagCount = 0.
+        for readID in gidReadDict[gid]:
+            try:
+                tagValue = uniqueCountDict[gid]
+            except:
+                tagValue = 1
+
+            tagDenom = 0.
+            for aGid in read2GidDict[readID]:
+                try:
+                    tagDenom += uniqueCountDict[aGid]
+                except:
+                    tagDenom += 1
+
+            try:
+                tagCount += tagValue / tagDenom
+            except ZeroDivisionError:
+                pass
+
+        if doVerbose:
+            print "%s %s %f" % (gid, symbol, tagCount)
+
+        outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
+
+    outfile.close()
+
+    if doCache:
+        uncacheGeneDB(genome)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneNeighbors.py b/geneNeighbors.py
new file mode 100755 (executable)
index 0000000..8ec363c
--- /dev/null
@@ -0,0 +1,152 @@
+#
+#  geneNeighbors.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import getMergedRegions, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 2.4"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome outfilename [--regions acceptfile] [--downstream bp] [--upstream bp] [--mindist bp] [--minlocus bp] [--maxlocus bp] [--samesense]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--regions", dest="acceptFile")
+    parser.add_option("--downstream", type="int", dest="downMax")
+    parser.add_option("--upstream", type="int", dest="upMax")
+    parser.add_option("--mindist", type="int", dest="minDist")
+    parser.add_option("--minlocus", type="int", dest="minLocus")
+    parser.add_option("--maxlocus", type="int", dest="maxLocus")
+    parser.add_option("--samesense", action="store_true", dest="checkSense")
+    parser.set_defaults(acceptFile="", checkSense=False, downMax=10000000,
+                        upMax=10000000, minDist=0, minLocus=-1, maxLocus=10000000)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    outfilename = args[1]
+
+    index = geneNeighbors(genome, outfilename, options.acceptFile, options.checkSense,
+                          options.downMax, options.upMax, options.minDist, options.minLocus,
+                          options.maxLocus)
+
+    print "\n%d genes matched" % index
+
+
+def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False,
+                  downMax=10000000, upMax=10000000, minDist=0, minLocus=-1,
+                  maxLocus=10000000):
+
+    acceptDict = {}
+    if acceptfile:
+        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+    hg = Genome(genome)
+    idb = geneinfoDB(cache=True)
+
+    geneinfoDict = idb.getallGeneInfo(genome)
+    locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
+
+    gidList = hg.allGIDs()
+    gidList.sort()
+    for chrom in acceptDict:
+        for (label, start, stop, length) in acceptDict[chrom]:
+            if label not in gidList:
+                gidList.append(label)
+
+    index = 0
+    outfile = open(outfilename,"w")
+    chromList = locusByChromDict.keys()
+    chromList.sort()
+    for chrom in chromList:
+        if len(locusByChromDict[chrom]) < 3 or "NT" in chrom or "MT" in chrom:
+            continue
+
+        print chrom + " ",
+    
+        prevStop = locusByChromDict[chrom][0][1]
+        prevGID = locusByChromDict[chrom][0][2]
+        if "FAR" not in prevGID:
+            symbol = "LOC" + prevGID
+            geneinfo = ""
+            try:
+                geneinfo = geneinfoDict[prevGID]
+                symbol = geneinfo[0][0]
+            except:
+                pass
+        else:
+            symbol = prevGID
+
+        prevGID = symbol
+        prevSense = locusByChromDict[chrom][0][4]
+
+        currentStart = locusByChromDict[chrom][1][0]
+        currentStop = locusByChromDict[chrom][1][1]
+        currentGID = locusByChromDict[chrom][1][2]
+        if "FAR" not in currentGID:
+            symbol = "LOC" + currentGID
+            geneinfo = ""
+            try:
+                geneinfo = geneinfoDict[currentGID]
+                symbol = geneinfo[0][0]
+            except:
+                pass
+        else:
+            symbol = currentGID
+
+        currentGID = symbol
+        currentGlen = locusByChromDict[chrom][1][3]
+        currentSense = locusByChromDict[chrom][1][4] 
+
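+        # Walk a three-locus window (previous, current, next) down the chromosome and report
+        # each current locus with the distances to its neighbors, applying upMax/downMax
+        # according to the current locus' strand.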
+        for (nextStart, nextStop, nextGID, nextGlen, nextSense) in locusByChromDict[chrom][2:]:
+            if "FAR" not in nextGID:
+                symbol = "LOC" + nextGID
+                geneinfo = ""
+                try:
+                    geneinfo = geneinfoDict[nextGID]
+                    symbol = geneinfo[0][0]
+                except:
+                    pass
+            else:
+                symbol = nextGID
+
+            nextGID = symbol
+            leftDist = currentStart - prevStop
+            rightDist = nextStart - currentStop
+            if (currentSense == "F" and minDist < leftDist < upMax and minDist < rightDist < downMax) or (currentSense == "R" and minDist < rightDist < upMax and minDist < leftDist < downMax):
+                if not checkSense or currentSense == nextSense:
+                    if minLocus <= currentGlen <= maxLocus:
+                        outfile.write("%s\t%s\t%s\t%s\t%d\t%s\t%s\t%d\n" % (currentGID, currentSense, prevGID, prevSense, leftDist, nextGID, nextSense, rightDist))
+                        index += 1
+
+            prevStop = currentStop
+            prevGID = currentGID
+            prevSense = currentSense
+            currentStart = nextStart
+            currentStop = nextStop
+            currentGID = nextGID
+            currentGlen = nextGlen
+            currentSense = nextSense
+
+    outfile.close()
+    return index
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneStallingBins.py b/geneStallingBins.py
new file mode 100755 (executable)
index 0000000..f08abe6
--- /dev/null
@@ -0,0 +1,156 @@
+#
+#
+#  geneStallingBins.py
+#  ENRAGE
+#
+
+# originally from geneLocusBins.py
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 1.3"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--upstream", type="int", dest="upstreamBp")
+    parser.add_option("--downstream", type="int", dest="downstreamBp")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--normalize", action="store_true", dest="normalize")
+    parser.add_option("--tagCount", action="store_true", dest="doTagCount")
+    parser.add_option("--bins", type="int", dest="bins")
+    parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="",
+                        doCache=False, normalize=False, doTagCount=False, bins=4)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile =  args[1]
+    controlfile = args[2]
+    outfilename = args[3]
+
+    geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp,
+                     options.downstreamBp, options.acceptfile, options.doCache,
+                     options.normalize, options.doTagCount, options.bins)
+
+
+def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
+                     downstreamBp=0, acceptfile="", doCache=False, normalize=False,
+                     doTagCount=False, bins=4):
+
+    acceptDict = {}
+    if acceptfile:
+        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+
+    doCDS = True
+    limitNeighbor = False
+
+    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    readlen = hitRDS.getReadSize()
+    hitNormalizationFactor = 1.0
+    if normalize:
+        hitDictSize = len(hitRDS)
+        hitNormalizationFactor = hitDictSize / 1000000.
+
+    controlRDS = readDataset(controlfile, verbose=True, cache=doCache)
+    controlNormalizationFactor = 1.0
+    if normalize:
+        controlDictSize = len(controlRDS)
+        controlNormalizationFactor = controlDictSize / 1000000.
+
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+    controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+    hg = Genome(genome)
+    idb = geneinfoDB(cache=doCache)
+
+    geneinfoDict = idb.getallGeneInfo(genome)
+    locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+
+    gidList = hg.allGIDs()
+    gidList.sort()
+    for chrom in acceptDict:
+        for (label, start, stop, length) in acceptDict[chrom]:
+            if label not in gidList:
+                gidList.append(label)
+
+    (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
+    (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
+
+    outfile = open(outfilename, "w")
+
+    for gid in gidList:
+        if "FAR" not in gid:
+            symbol = "LOC" + gid
+            geneinfo = ""
+            try:
+                geneinfo = geneinfoDict[gid]
+                symbol = geneinfo[0][0]
+            except:
+                pass
+        else:
+            symbol = gid
+
+        if gid in gidBins and gid in gidLen:
+            tagCount = 0.
+            controlCount = 0.
+            for binAmount in gidBins[gid]:
+                tagCount += binAmount
+
+            for binAmount in controlBins[gid]:
+                controlCount += abs(binAmount)
+
+            diffCount = tagCount + controlCount
+            if diffCount < 0:
+                diffCount = 0
+
+            outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid]))
+            if (gidLen[gid] - 3 * upstreamBp) < upstreamBp:
+                outfile.write("\tshort\n")
+                continue
+
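+            # Stalling-style metric: promoter-proximal signal (first two bins, per 50 bp of
+            # upstream window) over the final gene-body bin (per 100 bp of the remaining locus).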
+            TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.)
+            finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.)
+            if finalbin <= 0.:
+                finalbin = 0.01
+
+            if TSSbins < 0:
+                TSSbins = 0
+
+            ratio =  float(TSSbins)/float(finalbin)
+            for binAmount in gidBins[gid]:
+                if doTagCount:
+                    binAmount = binAmount * tagCount / 100.
+
+                if normalize:
+                    if tagCount == 0:
+                        tagCount = 1
+
+                    outfile.write("\t%.1f" % (100. * binAmount / tagCount))
+                else:
+                    outfile.write("\t%.1f" % binAmount)
+
+        outfile.write("\t%.2f\n" % ratio)
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/geneStartBins.py b/geneStartBins.py
new file mode 100755 (executable)
index 0000000..cbb3c4a
--- /dev/null
@@ -0,0 +1,134 @@
+#
+#  geneStartBins.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+# originally from version 1.3 of geneDownstreamBins.py
+from commoncode import *
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+import sys
+
+print '%s: version 2.0' % sys.argv[0]
+if len(sys.argv) < 4:
+    print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
+    print '\n\twhere regionSize is the optional maximum region in bp\n'
+    sys.exit(1)
+
+genome = sys.argv[1]
+hitfile =  sys.argv[2]
+outfilename = sys.argv[3]
+
+standardMinDist = 3000
+if '-max' in sys.argv:
+    standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
+
+if '-raw' in sys.argv:
+    normalize = False
+    normalizeBins = False
+else:
+    normalize = True
+    normalizeBins = True    
+
+doCache = False
+if '-cache' in sys.argv:
+    doCache = True
+
+bins = 10
+standardMinThresh = standardMinDist / bins
+
+hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+readlen = hitRDS.getReadSize()
+normalizationFactor = 1.0
+if normalize:
+    totalCount = len(hitRDS)
+    normalizationFactor = totalCount / 1000000.
+
+hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+hg = Genome(genome)
+idb = geneinfoDB(cache=True)
+
+gidDict = {}
+geneinfoDict = idb.getallGeneInfo(genome)
+featuresDict = hg.getallGeneFeatures()
+
+#infile = open(infilename)
+outfile = open(outfilename,'w')
+
+gidList = hg.allGIDs()
+gidList.sort()
+for gid in gidList:
+    symbol = 'LOC' + gid
+    geneinfo = ''
+    featureList = []
+    try:
+        geneinfo = geneinfoDict[gid]
+        featureList = featuresDict[gid]
+        symbol = geneinfo[0][0]
+    except:
+        print geneinfo
+    newfeatureList = []
+    if len(featureList) == 0:
+        continue
+    for (ftype, chrom, start, stop, fsense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+    if chrom not in hitDict:
+        continue
+    newfeatureList.sort()
+    if len(newfeatureList) < 1:
+        #print '%s %s %d' % (gid, symbol, -1)
+        #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1))
+        continue
+    glen = standardMinDist / 2
+    if fsense == 'F':
+        nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
+        if nextGene < glen * 2:
+            glen = nextGene / 2
+        if glen < 1:
+            glen = 1
+        gstart = newfeatureList[0][0] - glen
+        if gstart < 0:
+            gstart = 0
+        gstop = newfeatureList[0][0] + glen
+    else:
+        nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
+        if nextGene < glen * 2:
+            glen = nextGene / 2
+        if glen < 1:
+            glen = 1
+        gstart = newfeatureList[-1][1] - glen
+        gstop = newfeatureList[-1][1] + glen
+    tagCount = 0
+    if glen < standardMinDist / 2:
+        continue
+    binList = [0] * bins
+    for (tagStart, sense, weight) in hitDict[chrom]:
+        tagStart -= gstart 
+        if tagStart >= 2 * glen:
+            break
+        if tagStart > 0:
+            tagCount += weight
+            if fsense == 'R':
+                # we are relying on python's integer division quirk
+                binID = tagStart / standardMinThresh 
+                binList[binID] += weight
+            else:
+                rdist = 2 * glen - tagStart
+                binID = rdist / standardMinThresh 
+                binList[binID] += weight
+    if tagCount < 2:
+        continue
+    print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
+    outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
+    for binAmount in binList:
+        outfile.write('\t%d' % binAmount)
+    outfile.write('\n')
+#infile.close()
+outfile.close()
+
diff --git a/geneUpstreamBins.py b/geneUpstreamBins.py
new file mode 100755 (executable)
index 0000000..e855416
--- /dev/null
@@ -0,0 +1,149 @@
+#
+#  geneUpstreamBins.py
+#  ENRAGE
+#
+# originally from version 1.3 of geneDownstreamBins.py
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 2.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--raw", action="store_false", dest="normalize",
+                       help="maximum region in bp")
+    parser.add_option("--max", type="int", dest="standardMinDist")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
+
+
+def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
+    bins = 10
+    standardMinThresh = standardMinDist / bins
+
+    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    normalizationFactor = 1.0
+    if normalize:
+        totalCount = len(hitRDS)
+        normalizationFactor = totalCount / 1000000.
+
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+    hg = Genome(genome)
+    idb = geneinfoDB(cache=True)
+
+    geneinfoDict = idb.getallGeneInfo(genome)
+    featuresDict = hg.getallGeneFeatures()
+
+    outfile = open(outfilename,"w")
+
+    gidList = hg.allGIDs()
+    gidList.sort()
+    for gid in gidList:
+        symbol = "LOC" + gid
+        geneinfo = ""
+        featureList = []
+        try:
+            geneinfo = geneinfoDict[gid]
+            featureList = featuresDict[gid]
+            symbol = geneinfo[0][0]
+        except:
+            print geneinfo
+
+        newfeatureList = []
+        if len(featureList) == 0:
+            continue
+
+        for (ftype, chrom, start, stop, fsense) in featureList:
+            if (start, stop) not in newfeatureList:
+                newfeatureList.append((start, stop))
+
+        if chrom not in hitDict:
+            continue
+
+        newfeatureList.sort()
+        if len(newfeatureList) < 1:
+            continue
+
+        glen = standardMinDist
+        if fsense == "F":
+            nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
+            if nextGene < glen * 2:
+                glen = nextGene / 2
+
+            if glen < 1:
+                glen = 1
+
+            gstart = newfeatureList[0][0] - glen
+            if gstart < 0:
+                gstart = 0
+
+        else:
+            nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
+            if nextGene < glen * 2:
+                glen = nextGene / 2
+
+            if glen < 1:
+                glen = 1
+
+            gstart = newfeatureList[-1][1]
+
+        tagCount = 0
+        if glen < standardMinDist:
+            continue
+
+        binList = [0] * bins
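+        # Assign each read to one of 10 equal upstream bins by integer division of its
+        # offset from gstart; the offset is flipped for forward-strand genes so the bin
+        # order stays consistent relative to the gene start.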
+        for (tagStart, sense, weight) in hitDict[chrom]:
+            tagStart -= gstart
+            if tagStart >= glen:
+                break
+
+            if tagStart > 0:
+                tagCount += weight
+                if fsense == "R":
+                    # we are relying on python's integer division quirk
+                    binID = tagStart / standardMinThresh 
+                    binList[binID] += weight
+                else:
+                    rdist = glen - tagStart
+                    binID = rdist / standardMinThresh 
+                    binList[binID] += weight
+
+        if tagCount < 2:
+            continue
+
+        print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
+        outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
+        for binAmount in binList:
+            outfile.write("\t%d" % binAmount)
+        outfile.write("\n")
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getGOgenes.py b/getGOgenes.py
new file mode 100755 (executable)
index 0000000..0a320ee
--- /dev/null
@@ -0,0 +1,113 @@
+import sys, optparse
+from cistematic.genomes import Genome
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 3.1"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s genome GOID1 [GOID2 ....] [--outfile outfilename] [--append] [--restrict genefile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--outfile", dest="outfilename")
+    parser.add_option("--append", action="store_true", dest="append")
+    parser.add_option("--restrict", dest="restrictfilename")
+    parser.set_defaults(outfilename=None, restrictfilename=None, append=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+
+    GOIDlist = []
+    for arg in args:
+        if "GO:" in arg:
+            GOIDlist.append(arg)
+
+    getGOgenes(genome, GOIDlist, options.outfilename, options.restrictfilename, options.append)
+
+
+def getGOgenes(genome, GOIDlist, outfilename=None, restrictfilename=None, append=False):
+    writeOut = False
+    if outfilename is not None:
+        writeOut = True
+
+    restrict = False
+    if restrictfilename is not None:
+        restrict = True
+    
+    hg = Genome(genome)
+    idb = geneinfoDB()
+
+    print sys.argv
+    print GOIDlist
+
+    firstGeneList = []
+    for GOID in GOIDlist:
+        testList = hg.allGIDsbyGOID(GOID)
+        print "GOID: %s (%d)" % (GOID, len(testList))
+        firstGeneList += testList
+
+    geneDict = {}
+    for gid in firstGeneList:
+        geneDict[gid] = 1
+
+    geneList = geneDict.keys()
+    print len(geneList)
+    geneInfoList = idb.getallGeneInfo(genome)
+
+    if writeOut:
+        if append:
+            outfile = open(outfilename, "a")
+        else:
+            outfile = open(outfilename, "w")
+
+        for GOID in GOIDlist:
+            outfile.write("#%s\n" % GOID) 
+
+    restrictList = []
+    restrictDict = {}
+    if restrict:
+        restrictFile = open(restrictfilename)
+        for line in restrictFile:
+            fields = line.strip().split()
+            restrictList.append(fields[0])
+            restrictDict[fields[0]] = line
+
+    outList = []
+    symbolDict = {}
+    for gid in geneList:
+        symbol = "LOC" + gid
+        if restrict and gid not in restrictList:
+            continue
+
+        try:
+            symbol = geneInfoList[gid][0][0]
+        except:
+            pass
+
+        if restrict:
+            symbolDict[symbol] = restrictDict[gid]
+
+        outList.append(symbol)
+
+    outList.sort()
+    for symbol in outList:
+        if writeOut:
+            if restrict:
+                outfile.write(symbolDict[symbol])
+            else:
+                outfile.write(symbol + "\n")
+        else:
+            print symbol
+
+    if writeOut:
+        outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getNovelSNPs.py b/getNovelSNPs.py
new file mode 100755 (executable)
index 0000000..0936a7f
--- /dev/null
@@ -0,0 +1,96 @@
+#
+#  getNovelSNPs.py
+#  ENRAGE
+#
+# This script attempts to annotate the novel SNCs/SNPs from the SNP summary file.
+# Written by: Wendy Lee
+# Written on: Aug 7th, 2008
+
+import sys
+import string
+from cistematic.genomes import Genome 
+from commoncode import writeLog
+
+print "%prog: version 1.5"
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s genome snpsfile nondbsnp_geneinfo_outfile" % argv[0]
+
+    if len(argv) < 4:
+        print usage
+        sys.exit(2)
+
+    genome = argv[1]
+    snpfile = argv[2]
+    outfilename = argv[3]
+
+    getNovelSNPsFromFile(genome, snpfile, outfilename)
+
+
+def getNovelSNPsFromFile(genome, snpfile, outfilename): 
+    infile = file(snpfile, "r")
+    writeNovelSNPFile(genome, infile, outfilename)
+    writeLog("snp.log", sys.argv[0], "outputfile: %s" % outfilename)
+    infile.close()
+
+
+def writeNovelSNPFile(genome, snpPropertiesList, outfilename):
+    hg = Genome(genome)
+    outString = ""
+    outfile  = open(outfilename, "w")
+    outfile.write("#Sl\tCl\tchrom\tmis pos\t\tmatch\tuniq_mis\ttot_mis\tbase_chg\tknown_snp\tfunction\tgene\tgeneId\trpkm\n") 
+    for line in snpPropertiesList:
+        if doNotProcessLine(line):
+            continue
+
+        outString = getNovelSNPInfo(genome, line, hg)
+        if outString == line:
+            outfile.write(outString)
+        else:
+            outfile.write("%s\n" % outString)
+
+    outfile.close()
+
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def getNovelSNPInfo(genome, snpEntry, hg):
+    fields = snpEntry.split()
+    #TODO: refactor naming. is fields[8] rpkm?
+    if fields[8].find("N\A") == -1: 
+        return snpEntry
+    else:
+        snpInfo = ""
+        gid = fields[11]
+        snc_start = int(fields[3])
+        featuresList = hg.getGeneFeatures((genome, gid))
+        func = "N\A"
+        for (ftype, chromosome, start, stop, orientation) in featuresList:
+            if int(start) <= snc_start <= int(stop):
+                func = ftype
+                break 
+        for i in range (0, 9):
+            snpInfo = string.join([snpInfo, fields[i]], "\t")
+
+        snpInfo = string.join([snpInfo, func], "\t")
+        for i in range (10, 13):
+            snpInfo = string.join([snpInfo, fields[i]], "\t")
+
+    return snpInfo
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getSNPGeneInfo.py b/getSNPGeneInfo.py
new file mode 100755 (executable)
index 0000000..307413b
--- /dev/null
@@ -0,0 +1,177 @@
+#
+#  getSNPGeneInfo.py
+#  ENRAGE
+#
+# This script looks up the gene info and expression level for the SNPs.
+# Written by: Wendy Lee
+# Written on: August 7th, 2008
+
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+import sys
+import optparse
+import string
+from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
+from cistematic.core.geneinfo import geneinfoDB
+
+print "%prog: version 4.5"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", action="store_true", dest="cachePages")
+    parser.add_option("--withoutsense", action="store_false", dest="withSense")
+    parser.add_option("--flank", type="int", dest="flankBP")
+    parser.set_defaults(doCache=False, withSense=True, flankBP=0)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(2)
+
+    genome = args[0]
+    infilename = args[1]
+    rpkmfilename = args[2]
+    outfilename = args[3]
+
+    writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
+
+
+def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
+
+    outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
+    outfile = open(outfilename, "w")
+
+    for outputLine in outList:
+        outfile.write("%s\n" % outputLine)
+
+    outfile.close()
+
+
+def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=True, flankBP=0):
+
+    rpkmDict = {}
+    rpkmField = 3
+    if rpkmfilename != "NONE":
+        rpkmfile = open(rpkmfilename, "r")
+        for line in rpkmfile:
+            lineFields = line.split()
+            rpkmDict[lineFields[0]] = lineFields[rpkmField]
+
+        rpkmfile.close()
+
+    infile = open(infilename)
+    snpPositionList = []
+    snpDict = {}
+
+    for line in infile:
+        if doNotProcessLine(line):
+            continue
+
+        fields = line.split("\t")
+        chrom = fields[2][3:]
+        start = int(fields[3])
+        chromosomePosition = (chrom, start)
+        snpPositionList.append(chromosomePosition)
+        snpDict[chromosomePosition] = line
+
+    if doCache:
+        cacheGeneDB(genome)
+        idb = geneinfoDB(cache=True)
+        print "cached %s" % genome
+    else:
+        idb = geneinfoDB()
+
+    geneinfoDict = idb.getallGeneInfo(genome)
+    geneDict = {}
+
+    if flankBP > 0:
+        matchingGenesDict = genesIntersecting(genome, snpPositionList, flank=flankBP)
+    else:
+        matchingGenesDict = genesIntersecting(genome, snpPositionList)
+
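+    # Collapse the per-position hits into one entry per (symbol, geneID), keeping every SNP
+    # position inside the gene plus the gene's orientation for the optional sense column.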
+    for pos in matchingGenesDict:
+        geneID = matchingGenesDict[pos][0][0]
+        try:
+            symbol = geneinfoDict[geneID][0][0]
+        except:
+            symbol = "LOC%s" % geneID
+
+        geneDescriptor = (symbol, geneID)
+        if geneDict.has_key(geneDescriptor):
+            geneDict[geneDescriptor]["position"].append(pos)
+        else:
+            geneDict[geneDescriptor] = {"position": [pos],
+                                        "sense": matchingGenesDict[pos][0][-1]}
+
+    if doCache:
+        uncacheGeneDB(genome)
+
+    return getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense)
+
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def getSNPGeneOutputList(geneDict, snpDict, rpkmDict, withSense):
+    snpGeneOutputList = []
+    snpGeneInfoList = getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense)
+
+    for snpEntry in snpGeneInfoList:
+        outputItems = [snpEntry["snpDescription"], snpEntry["symbol"], snpEntry["geneID"], snpEntry["rpkm"]]
+        if withSense:
+            outputItems.append(snpEntry["sense"])
+
+        line = string.join(outputItems, "\t")
+        snpGeneOutputList.append(line)
+
+    snpGeneOutputList.sort(reverse=True)
+
+    return snpGeneOutputList
+
+
+def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense):
+
+    snpGeneInfoList = []
+
+    for geneDescriptor in geneDict.keys():
+        alreadyDoneList = []
+        (symbol, geneID) = geneDescriptor
+        genePositionList = geneDict[geneDescriptor]["position"]
+        genePositionList.sort()
+
+        for position in genePositionList:
+            if snpDict[position] in alreadyDoneList:
+                continue
+
+            snpGeneInfoDict = {"symbol": symbol,
+                               "geneID": geneID}
+
+            rpkm = "N\A"
+            if rpkmDict.has_key(geneID):
+                rpkm = str(rpkmDict[geneID])
+
+            snpGeneInfoDict["rpkm"] = rpkm
+            snpGeneInfoDict["snpDescription"] = snpDict[position][:-1]
+            if withSense:
+                snpGeneInfoDict["sense"] = geneDict[geneDescriptor]["sense"]
+
+            alreadyDoneList.append(snpDict[position])
+            snpGeneInfoList.append(snpGeneInfoDict)
+
+    snpGeneInfoList.sort(reverse=True)
+
+    return snpGeneInfoList
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getSNPs.py b/getSNPs.py
new file mode 100755 (executable)
index 0000000..0adde42
--- /dev/null
@@ -0,0 +1,216 @@
+#
+#  getSNPs.py
+#  ENRAGE
+#
+# Originally written by: Wendy Lee
+# Last modified: May 11th, 2009 by Ali Mortazavi
+
+"""
+    Get the matches and mismatches from the RDS file, and calculate the SNP thresholds uniqStartMin (Sl * readlength) and totalRatio (Cl).
+    For each mismatch, choose the base change that occurs most frequently (i.e. has the highest number
+    of independent reads).
+    Thresholds for Sl and Cl are taken from user input.
+    Sl = # of independent reads supporting a base change at position S
+    Cl = total # of reads supporting a base change at position S / total # of reads that pass through position S
+
+    usage: python getSNPs.py samplerdsfile uniqStartMin totalRatioMin outfile [--nosplices] [--enforceChr] [--cache pages] where
+
+    uniqStartMin = # of independent reads supporting a base change at position S
+    totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
+"""
+
+import sys, optparse
+from commoncode import readDataset, writeLog
+
+print "%prog: version 3.5"
+
+try:
+    import psyco
+    psyco.full()
+except:
+    print "psyco is not running"
+    pass
+
+def usage():
+    print __doc__
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = __doc__
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nosplices", action="store_false", dest="doSplices")
+    parser.add_option("--enforceChr", action="store_true", dest="forceChr")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        usage()
+        sys.exit(2)
+
+    hitfile = args[0]
+    uniqStartMin = float(args[1])
+    totalRatioMin = float(args[2])
+    outfilename = args[3]
+
+    if options.cachePages > 0:
+        doCache = True
+    else:
+        doCache = False
+
+    writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
+
+
+def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
+    writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
+
+    outfile  = open(outfilename, "w")
+    header = "#Sl\tCl\tchrom\tpos\tmatch\tuniqMis\t\ttotalMis\tchange" 
+    outfile.write(header + "\n")
+
+    snpPropertiesList = getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages, doSplices, forceChr)
+    for snpEntry in snpPropertiesList:
+        outline = "%1.2f\t%1.2f\t%s\t%d\t%d\t%d\t\t%d\t%s\n" % snpEntry
+        print outline
+        outfile.write(outline + "\n")
+        outfile.flush() 
+
+    outfile.close()
+
+    writeLog("snp.log", sys.argv[0], "%d candidate SNPs\n" % len(snpPropertiesList))
+
+
+def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
+
+    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    if cachePages > 20000:
+        hitRDS.setDBcache(cachePages)
+
+    snpPropertiesList = []
+    readLength = hitRDS.getReadSize() 
+    chromList = hitRDS.getChromosomes()
+
+    for chrom in chromList:
+        if doNotProcessChromosome(forceChr, chrom):
+            continue
+
+        matchDict = getMatchDict(hitRDS, chrom, doSplices)
+        print "got match dict for %s " % chrom
+        mismatchDict = getMismatchDict(hitRDS, chrom, doSplices)
+        print "got mismatch dict for %s " % chrom
+        mismatchPositions = mismatchDict.keys()
+        mismatchPositions.sort()
+        for position in mismatchPositions:
+            totalCount = mismatchDict[position]["totalCount"]
+            uniqBaseDict = mismatchDict[position]["uniqBaseDict"]
+            totalBaseDict = mismatchDict[position]["totalBaseDict"]
+            highestCount = 0
+            highestBaseChange = "N-N"
+            highestTotalCount = 0
+            for baseChange in uniqBaseDict:
+                if totalBaseDict[baseChange] > highestTotalCount:
+                    highestBaseChange = baseChange
+                    highestCount = uniqBaseDict[baseChange]
+                    highestTotalCount = totalBaseDict[baseChange]
+
+            Cl = 0.
+            matchCount = 0
+            if highestCount >= uniqStartMin:
+                for matchpos in xrange(position - readLength + 1, position + 1):
+                    try:
+                        matchCount += len([mstop for mstop in matchDict[matchpos] if position <= mstop])
+                    except:
+                        pass
+
+                matchCount -= totalCount
+                if matchCount < 0:
+                    matchCount = 0
+
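+                # Sl is the number of independent read starts supporting the winning base
+                # change divided by the read length; Cl is the reads supporting the change
+                # over all reads (change plus reference matches) spanning the position.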
+                Sl = highestCount/float(readLength)
+                Cl = highestTotalCount/float(highestTotalCount + matchCount)
+                if Cl >= totalRatioMin:
+                    snpProperties = (Sl, Cl, chrom, position, matchCount, highestCount, highestTotalCount, highestBaseChange)
+                    snpPropertiesList.append(snpProperties)
+
+    return snpPropertiesList
+
+
+def doNotProcessChromosome(forceChr, chromosome):
+    if forceChr and chromosome[:3] != "chr":
+        return True
+
+    return False
+
+
+def getMatchDict(rds, chrom, withSplices=True):
+    spliceDict = {}
+    readDict = {}
+    finalDict = {}
+
+    try:
+        readDict = rds.getReadsDict(fullChrom=True, bothEnds=True, noSense=True, chrom=chrom)
+    except:
+        readDict[chrom] = []
+
+    for (start, stop) in readDict[chrom]:
+        if finalDict.has_key(start):
+            finalDict[start].append(stop)
+        else:
+            finalDict[start] = [stop]
+
+    if withSplices:
+        try:
+            spliceDict = rds.getSplicesDict(noSense=True, fullChrom=True, chrom=chrom, splitRead=True)
+        except:
+            spliceDict[chrom] = []
+
+        for (start, stop) in spliceDict[chrom]:
+            if finalDict.has_key(start):
+                finalDict[start].append(stop)
+            else:
+                finalDict[start] = [stop]
+
+    return finalDict
+
+
+def getMismatchDict(rds, chrom, withSplices=True):
+    mismatchDict = {}
+    spliceDict = rds.getMismatches(mischrom=chrom, useSplices=withSplices)
+    for (start, change_at, change_base, change_from) in spliceDict[chrom]:
+        change = "%s-%s" % (change_base, change_from)
+        uniqueReadCount = 1
+        totalCount = 1
+        back = "%s:%s" % (str(start), change)
+        uniqBaseDict = {change: 1}
+        totalBaseDict = {change: 1}
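+        # uniqBaseDict counts distinct read starts per base change, totalBaseDict counts every
+        # supporting read, and "back" records which (start, change) pairs were already seen.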
+        if mismatchDict.has_key(change_at):
+            entry = mismatchDict[change_at]
+            uniqueReadCount = entry["uniqueReadCount"]
+            totalCount = entry["totalCount"]
+            back = entry["back"]
+            uniqBaseDict = entry["uniqBaseDict"]
+            totalBaseDict = entry["totalBaseDict"]
+            pos = "%s:%s" % (str(start), change)
+            totalCount += 1
+            if totalBaseDict.has_key(change): 
+                totalBaseDict[change] += 1
+
+            if pos not in back:
+                uniqueReadCount += 1
+                if uniqBaseDict.has_key(change):
+                    uniqBaseDict[change] += 1 # dict contains total unique read counts
+
+                back = "%s,%s" % (back, pos)
+
+        mismatchDict[change_at] = {"uniqueReadCount": uniqueReadCount,
+                                   "totalCount": totalCount, 
+                                   "back": back,
+                                   "uniqBaseDict": uniqBaseDict,
+                                   "totalBaseDict": totalBaseDict
+        }
+
+    return mismatchDict
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getallNRSE.py b/getallNRSE.py
new file mode 100755 (executable)
index 0000000..c2e639f
--- /dev/null
@@ -0,0 +1,354 @@
+import sys, optparse
+
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+from cistematic.core import complement
+from cistematic.core.motif import Motif
+from cistematic.genomes import Genome
+from commoncode import readDataset, getMergedRegions, findPeak
+from pylab import *
+import matplotlib
+
+print '%s: version 3.4' % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s genome regionfile siteOutfile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--dataset", dest="chipfilename")
+    parser.add_option("--min", type="float", dest="minHeight")
+    parser.add_option("--minfraction", type="float", dest="minFraction")
+    parser.add_option("--plot", dest="plotname")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--markov1", action="store_true", dest="doMarkov1")
+    parser.add_option("--peakdist", type="int", dest="maxpeakdist")
+    parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
+    parser.add_option("--motifdir", dest="motifDir")
+    parser.set_defaults(chipfilename="", minHeight=-2., minFraction=-2., plotname="",
+                        doCache=False, normalize=True, doVerbose=False, doMarkov1=False,
+                        maxpeakdist=None, fullOnly=False, motifDir="./")
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    infilename = args[1]
+    outfilename = args[2]
+
+    getallNRSE(genome, infilename, outfilename, options.chipfilename,
+               options.minHeight, options.minFraction, options.plotname,
+               options.doCache, options.normalize, options.doVerbose,
+               options.doMarkov1, options.maxpeakdist, options.fullOnly,
+               options.motifDir)
+
+
+def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
+               minFraction=-2., plotname="", doCache=False, normalize=True,
+               doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
+               motifDir="./"):
+
+    doPlot = False
+    if plotname:
+        matplotlib.use("Agg")
+        doPlot = True
+
+    if motifDir[-1] != "/":
+        motifDir += "/"
+
+    doDataset = False
+    normalizeBy = 1
+    if chipfilename:
+        hitRDS = readDataset(chipfilename, verbose=doVerbose, cache=doCache)
+        doDataset = True
+        if normalize:
+            normalizeBy = len(hitRDS) / 1000000.
+
+    if minFraction > 1.:
+        minFraction /= 100.
+        print "scaling minFraction to %.2f" % minFraction
+
+    if maxpeakdist is not None:
+        enforcePeakDist = True
+    else:
+        enforcePeakDist = False
+        maxpeakdist = 101
+
+    mot = Motif("", motifFile="%sNRSE3.mot" % motifDir)
+    motL = Motif("", motifFile="%sNRSE3left.mot" % motifDir)
+    motR = Motif("", motifFile="%sNRSE3right.mot" % motifDir)
+    bestScore = mot.bestConsensusScore()
+    bestLeft = motL.bestConsensusScore()
+    bestRight = motR.bestConsensusScore()
+
+    hg = Genome(genome)
+
+    regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=doVerbose, doMerge=False)
+
+    outfile = open(outfilename, "w")
+    outfile.write("#dataset: %s\tregions:%s\tnormalize: %s\tmarkov1: %s\n" % (chipfilename, infilename, normalize, doMarkov1))
+    outfile.write("#enforcePeakDist: %s\tpeakdist: %d bp\tfullOnly: %d bp\n" % (enforcePeakDist, maxpeakdist, fullOnly))
+    outfile.write("#site\tscore\tleftscore\trightscore\tRPM\tpeakDist\ttype\theight\tfractionHeight\tregion\tsense\tseq\n")
+
+    index = 0
+    regionList = []
+
+    for rchrom in regions:
+        if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
+            continue
+
+        for (start, stop, length) in regions[rchrom]:
+            regionList.append((rchrom, start, length))
+
+    notFoundIndex = 0
+    currentChrom = ""
+    for (rchrom, start, length) in regionList:
+        seq = hg.sequence(rchrom, start, length)
+        if doDataset:
+            if rchrom != currentChrom:
+                fullchrom = "chr" + rchrom
+                hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
+                currentChrom = rchrom
+
+            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True)
+            if len(topPos) == 0:
+                print "topPos error"
+
+            peakpos = topPos[0]
+            peakscore = smoothArray[peakpos]
+            if peakscore == 0.:
+                peakscore = -1.
+
+            if normalize:
+                numHits /= normalizeBy
+                peakscore /= normalizeBy
+        else:
+            peakpos = length
+            peakscore = -1
+            numHits = 0
+            smoothArray = [0.] * length
+
+        found = []
+        if doMarkov1:
+            lefts = motL.locateMarkov1(seq, 3.)
+            rights = motR.locateMarkov1(seq, 3.)
+        else:
+            lefts = motL.locateMotif(seq, 70)
+            rights = motR.locateMotif(seq, 70)
+
+        allhalfs = [(v0, v1, "L") for (v0, v1) in lefts] + [(v0, v1, "R") for (v0, v1) in rights]
+        allhalfs.sort()
+
+        # look for canonicals and non-canonicals
+        if len(allhalfs) > 1:
+            (firstpos, firstsense, firsttype) = allhalfs[0]
+            for (secondpos, secondsense, secondtype) in allhalfs[1:]:
+                if enforcePeakDist:
+                    withinDistance = False
+                    for aPos in topPos:
+                        if abs(firstpos - aPos) < maxpeakdist or abs(secondpos - aPos) < maxpeakdist:
+                            withinDistance = True
+                    if not withinDistance:
+                        firstpos = secondpos
+                        firstsense = secondsense
+                        firsttype = secondtype
+                        continue
+
+                if firsttype == "L":
+                    dist = secondpos - firstpos + 2
+                else:
+                    dist = secondpos - firstpos -1
+
+                if firstsense == secondsense and dist in [9, 10, 11, 16, 17, 18, 19]:
+                    if (firsttype == "L" and secondtype == "R" and secondsense == "F"):
+                        found.append((start + firstpos, firstpos - peakpos + (dist + 10)/2, dist))
+
+                    if (firsttype == "R" and secondtype == "L" and secondsense == "R"):
+                        found.append((start + firstpos, firstpos  - peakpos + (dist + 10)/2, dist))
+
+                firstpos = secondpos
+                firstsense = secondsense
+                firsttype = secondtype
+
+        # did we miss any 70%+ matches ?
+        if doMarkov1:
+            matches = mot.locateMarkov1(seq, 3.5)
+        else:
+            matches = mot.locateMotif(seq, 70)
+
+        for (pos, sense) in matches:
+            alreadyFound = False
+            for (fpos, fpeakdist, fdist) in found:
+                if pos + start == fpos:
+                    alreadyFound = True
+
+            if not alreadyFound:
+                if enforcePeakDist:
+                    withinDistance = False
+                    for aPos in topPos:
+                        if abs(pos - aPos) < maxpeakdist:
+                            withinDistance = True
+                            thePos = aPos
+
+                    if withinDistance:
+                        found.append((start + pos, pos - thePos + 10, 11))
+
+                else:
+                    found.append((start + pos, pos - peakpos + 10, 11))
+
+        # we'll now accept half-sites within maxpeakdist bp of peak if using a dataset, else all
+        if len(found) == 0 and not fullOnly:
+            bestone = -1
+            if not doDataset:
+                bestdist = maxpeakdist
+            else:
+                bestdist = length
+
+            index = 0
+            for (pos, sense, type) in allhalfs:
+                if doDataset:
+                    for aPos in topPos:
+                        if abs(pos - aPos) < bestdist:
+                            bestdist = abs(pos - aPos)
+                            bestone = index
+                            peakpos = aPos
+                else:
+                    found.append((start + allhalfs[index][0], allhalfs[index][0] + 5 - peakpos, 0))
+
+                index += 1
+
+            if (doDataset and bestdist < 101):
+                try:
+                    found.append((start + allhalfs[bestone][0], allhalfs[bestone][0] + 5 - peakpos, 0))
+                except:
+                    continue
+
+        # see if we found an acceptable match
+        foundValue = False
+        for (foundpos, posdist, dist) in found:
+            # get a score for 21-mer, report
+            seq = hg.sequence(rchrom, foundpos, 21)
+            # height will be measured from the center of the motif
+            height = -2.
+            for pos in range(10 + dist):
+                try:
+                    currentHeight = smoothArray[int(peakpos + posdist + pos)]
+                except:
+                    continue
+
+                if currentHeight > height:
+                    height = currentHeight
+
+            if normalize:
+                height /= normalizeBy
+
+            fractionHeight = height / peakscore
+            if height < minHeight or fractionHeight < minFraction:
+                continue
+
+            foundValue = True
+            (front, back) = mot.scoreMotif(seq)
+            sense = "+"
+            if front > back:
+                score = int(100 * front / bestScore)
+                theseq = hg.sequence(rchrom, foundpos, 10 + dist)
+            else:
+                score = int(100 * back / bestScore)
+                theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
+                sense = "-"
+                foundpos += 1
+
+            leftScore = -1.
+            rightScore = -1.
+            leftseq = ""
+            rightseq = ""
+            if dist > 0:
+                testseq = hg.sequence(rchrom, foundpos, 10 + dist)
+                if sense == "-":
+                    testseq = complement(testseq)
+
+                leftseq = testseq[:9]
+                rightseq = testseq[dist-2:]
+            elif dist == 0:
+                testseq = hg.sequence(rchrom, foundpos, 12)
+                if sense == "-":
+                    testseq = complement(testseq)
+                    leftseq = testseq[3:]
+                else:
+                    leftseq = testseq[:9]
+
+                rightseq = testseq
+
+            (lfront, lback) = motL.scoreMotif(leftseq)
+            (rfront, rback) = motR.scoreMotif(rightseq)
+            if lfront > lback:
+                leftScore = int(100 * lfront) / bestLeft
+                leftSense = "+"
+            else:
+                leftScore = int(100 * lback) / bestLeft
+                leftSense = "-"
+
+            if rfront > rback:
+                rightScore = int(100 * rfront) / bestRight
+                rightSense = "+"
+            else:
+                rightScore = int(100 * rback) / bestRight
+                rightSense = "-"
+
+            if dist != 11:
+                if rightScore > leftScore:
+                    sense = rightSense
+                else:
+                    sense = leftSense
+
+                if sense == "-" and dist > 0:
+                    theseq = complement(hg.sequence(rchrom, foundpos, 10 + dist))
+
+            outline = "chr%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2f\t%.2f\tchr%s:%d-%d\t%s\t%s" % (rchrom, foundpos, foundpos + 9 + dist, score, leftScore, rightScore, numHits, posdist, dist, height, fractionHeight, rchrom, start, start + length, sense, theseq)
+            if doVerbose:
+                print outline
+
+            outfile.write(outline + "/n")
+
+        # we didn't find a site - draw region
+        if not foundValue and doVerbose:
+            outline = "#no predictions for %s:%d-%d %d %.2f" % (rchrom, start, start + length, numHits, peakscore)
+            print outline
+            outfile.write(outline + "\n")
+
+        if not foundValue and doPlot:
+            drawarray = [val + notFoundIndex for val in smoothArray]
+            drawpos = [drawarray[val] for val in topPos]
+            plot(drawarray, "b")
+            plot(topPos, drawpos, "r.")
+            goodmatches = mot.locateMotif(seq, 75)
+            if len(goodmatches) > 0:
+                print topPos
+                print goodmatches
+                drawgood = []
+                drawgoody = []
+                for (mstart, sense) in goodmatches:
+                    drawgood.append(mstart)
+                    drawgoody.append(drawarray[mstart])
+
+                plot(drawgood, drawgoody, "g.")
+
+            notFoundIndex -= 30
+
+    outfile.close()
+    if doPlot:
+        savefig(plotname)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getallgenes.py b/getallgenes.py
new file mode 100755 (executable)
index 0000000..addba36
--- /dev/null
@@ -0,0 +1,301 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+import sys, optparse
+from cistematic.core import genesIntersecting, featuresIntersecting, cacheGeneDB, uncacheGeneDB
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+
+print "%prog: version 5.5"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--radius", type="int", dest="maxRadius")
+    parser.add_option("--nomatch", dest="nomatchfilename")
+    parser.add_option("--trackfar", action="store_true", dest="trackFar")
+    parser.add_option("--stranded", action="store_true", dest="trackStrand")
+    parser.add_option("--cache", action="store_true", dest="cachePages")
+    parser.add_option("--compact", action="store_true", dest="compact")
+    parser.add_option("--step", type="int", dest="step")
+    parser.add_option("--startField", type="int", dest="colID")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.set_defaults(maxRadius=20002, nomatchfilename="", step=None, trackFar=False,
+                        trackStrand=False, compact=False, colID=1, doCache=False,
+                        extendGenome="", replaceModels=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(2)
+
+    genome = args[0]
+    infilename = args[1]
+    outfilename = args[2]
+
+    getallgenes(genome, infilename, outfilename, options.maxRadius,
+                options.nomatchfilename, options.step, options.trackFar,
+                options.trackStrand, options.compact, options.colID,
+                options.doCache, options.extendGenome, options.replaceModels)
+
+
+def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
+                step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
+                doCache=False, extendGenome="", replaceModels=False):
+
+    if doCache:
+        idb = geneinfoDB(cache=True)
+    else:
+        idb = geneinfoDB()
+
+    if not step:
+        step = maxRadius - 2
+
+    if extendGenome and replaceModels:
+        replaceModels = True
+    else:
+        replaceModels = False
+
+    infile = open(infilename)
+    outfile = open(outfilename,"w")
+
+    if genome == "dmelanogaster":
+        geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
+    else:
+        geneinfoDict = idb.getallGeneInfo(genome)
+
+    posList = []
+    altPosDict = {}
+    altPosRevDict = {}
+    posLine = {}
+    posStrand = {}
+    altPosList = []
+
+    for line in infile:
+        if line[0] == "#":
+            continue
+
+        fields = line.split("\t")
+        if compact:
+            (chrom, pos) = fields[colID].split(":")
+            chrom = chrom[3:]
+            (start, stop) = pos.split("-")
+            pos = (chrom, int(start))
+            altPos = (chrom, int(stop))
+        else:
+            try:
+                chrom = fields[colID][3:]
+            except:
+                print line
+                continue
+
+            pos = (chrom, int(fields[colID + 1]))
+            altPos = (chrom, int(fields[colID + 2]))
+
+        altPosDict[pos] = altPos
+        altPosRevDict[altPos] = pos
+        posList.append(pos)
+        posList.append(altPos)
+        altPosList.append(altPos)
+        posLine[pos] = line
+        if trackStrand:
+            if "RNAFARP" in line:
+                posStrand[pos] = "+"
+                posStrand[altPos] = "+"
+            else:
+                posStrand[pos] = "-"
+                posStrand[altPos] = "-"
+
+    geneList = []
+    geneDict = {}
+    if maxRadius < step:
+        step = maxRadius - 2
+
+    hg = Genome(genome, inRAM=True)
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace = replaceModels)
+
+    geneannotDict = hg.allAnnotInfo()
+
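+    # at radius 1 look for genes directly overlapping each position; at larger radii look for nearby CDS and UTR features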
+    for radius in range(1, maxRadius, step):
+        print "radius %d" % radius
+        print len(posList)
+        if radius == 1:
+            posDict = genesIntersecting(genome, posList, extendGen=extendGenome, replaceMod=replaceModels)
+        else:
+            posDict = featuresIntersecting(genome, posList, radius, "CDS", extendGen=extendGenome, replaceMod=replaceModels) 
+            posDict2 = featuresIntersecting(genome, posList, radius, "UTR", extendGen=extendGenome, replaceMod=replaceModels)
+            for apos in posDict2:
+                try: 
+                    posDict[apos] += posDict2[apos]
+                    posDict[apos].sort()
+                except:
+                    posDict[apos] = posDict2[apos]
+
+        for pos in posDict:
+            geneID  = ""
+            if len(posDict[pos]) == 1:
+                if trackStrand:
+                    if posStrand[pos] == posDict[pos][0][-1]:
+                        geneID = posDict[pos][0][0]
+                else:
+                    geneID = posDict[pos][0][0]
+            elif len(posDict[pos]) > 1 and not trackStrand:
+                (chrom, loc) = pos
+                bestres = posDict[pos][0]
+                dist1 = abs(bestres[3] - loc)
+                dist2 = abs(bestres[4] - loc)
+                if dist1 < dist2:
+                    bestdist = dist1
+                else:
+                    bestdist = dist2
+
+                for testres in posDict[pos]:
+                    testdist1 = abs(testres[3] - loc)
+                    testdist2 = abs(testres[4] - loc)
+                    if testdist1 < testdist2:
+                        testdist = testdist1
+                    else:
+                        testdist = testdist2
+
+                    if testdist < bestdist:
+                        bestdist = testdist
+                        bestres = testres
+
+                geneID = bestres[0]
+            elif len(posDict[pos]) > 1:
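+                # stranded case: only assign the region to a gene whose strand matches the region's strand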
+                (chrom, loc) = pos
+                bestres = posDict[pos][0]
+                dist1 = abs(bestres[3] - loc)
+                dist2 = abs(bestres[4] - loc)
+                bestStrand = bestres[-1]
+                if dist1 < dist2:
+                    bestdist = dist1
+                else:
+                    bestdist = dist2
+
+                for testres in posDict[pos]:
+                    testdist1 = abs(testres[3] - loc)
+                    testdist2 = abs(testres[4] - loc)
+                    testStrand = testres[-1]
+                    if testdist1 < testdist2:
+                        testdist = testdist1
+                    else:
+                        testdist = testdist2
+
+                    if bestStrand != posStrand[pos] and testStrand == posStrand[pos]:
+                        bestdist = testdist
+                        bestres = testres
+                        bestStrand = testStrand
+                    elif testdist < bestdist:
+                        bestdist = testdist
+                        bestres = testres
+
+                if bestStrand == posStrand[pos]:
+                    geneID = bestres[0]
+
+            if geneID != "":
+                try:
+                    if genome == "dmelanogaster":
+                        symbol = geneinfoDict["Dmel_" + geneID][0][0]
+                    else:
+                        symbol = geneinfoDict[geneID][0][0]
+                except:
+                    try:
+                        symbol = geneannotDict[(genome, geneID)][0]
+                    except:
+                        symbol = "LOC" + geneID
+            else:
+                continue
+
+            if pos in altPosList and pos in posList:
+                posList.remove(pos)
+                if pos not in altPosRevDict:
+                    continue
+
+                if altPosRevDict[pos] in posList:
+                    posList.remove(altPosRevDict[pos])
+
+                pos = altPosRevDict[pos]
+            elif pos in posList:
+                posList.remove(pos)
+                if pos not in altPosDict:
+                    print pos
+                    continue
+
+                if altPosDict[pos] in posList:
+                    posList.remove(altPosDict[pos])
+            else:
+                continue
+
+            if (symbol, geneID) not in geneList:
+                geneList.append((symbol, geneID))
+                geneDict[(symbol, geneID)] = []
+
+            if pos not in geneDict[(symbol, geneID)]:
+                geneDict[(symbol, geneID)].append(pos)
+
+    for (symbol, geneID) in geneList:
+        geneDict[(symbol, geneID)].sort()
+        seenLine = []
+        for pos in geneDict[(symbol, geneID)]:
+            if pos in altPosRevDict:
+                pos = altPosRevDict[pos]
+
+            if posLine[pos] in seenLine:
+                continue
+
+            if "\t" in symbol:
+                symbol = symbol.replace("\t","|")
+
+            if " " in symbol:
+                symbol = symbol.replace(" ","_")
+
+            line = "%s %s %s" % (symbol, geneID, posLine[pos])
+            seenLine.append(posLine[pos])
+            outfile.write(line)
+
+    matchIndex = 0
+    if nomatchfilename != "":
+        nomatchfile = open(nomatchfilename, "w")
+
+    prevStart = 0
+    prevChrom = ""
+    farIndex = 0
+    start = 0
+    for pos in posList:
+        if pos not in altPosList:
+            if nomatchfilename != "":
+                nomatchfile.write(posLine[pos])
+
+            matchIndex += 1
+            # need to add strand tracking here.....
+            if trackFar:
+                (chrom, start) = pos
+                if chrom != prevChrom:
+                    farIndex += 1
+                    prevChrom = chrom
+                elif abs(int(start) - prevStart) > maxRadius:
+                    farIndex += 1
+
+                line = "FAR%d %d %s" % (farIndex, -1 * farIndex, posLine[pos])
+                outfile.write(line)
+            prevStart = int(start)
+
+    if nomatchfilename != "":
+        nomatchfile.close()
+
+    print "%d sites without a gene within radius of %d" % (matchIndex, radius)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getallsites.py b/getallsites.py
new file mode 100755 (executable)
index 0000000..39335e4
--- /dev/null
@@ -0,0 +1,209 @@
+import sys, optparse
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+from cistematic.core.motif import Motif, hasMotifExtension
+from cistematic.core import complement
+from cistematic.genomes import Genome
+from commoncode import readDataset, getMergedRegions, findPeak
+
+print "%prog: version 2.4"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--dataset", dest="chipfilename")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--best", action="store_true", dest="bestOnly",
+                      help="only report the best position for each region")
+    parser.add_option("--usepeak", action="store_true", dest="usePeak",
+                      help="use peak position and height from regions file")
+    parser.add_option("--printseq", action="store_true", dest="printSeq")
+    parser.add_option("--nomerge", action="store_true", dest="noMerge")
+    parser.add_option("--markov1", action="store_true", dest="doMarkov1")
+    parser.add_option("--rank", type="int", dest="useRank",
+                      help="return region ranking based on peak height ranking [requires --usepeak]")
+    parser.set_defaults(chipfilename="", doCache=False, bestOnly=False, usePeak=False,
+                        printSeq=False, doMarkov1=False, useRank=False, noMerge=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 5:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    motfilename = args[1]
+    motThreshold = float(args[2])
+    infilename = args[3]
+    outfilename = args[4]
+
+    getallsites(genome, motfilename, motThreshold, infilename, outfilename, options.chipfilename,
+                options.doCache, options.bestOnly, options.usePeak, options.printSeq, options.doMarkov1,
+                options.useRank, options.noMerge)
+
+
+def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
+                doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
+                useRank=False, noMerge=False):
+
+    if motThreshold < 1.0 and doMarkov1:
+        print "motThreshold should be between 1.0 and 10.0 for markov1"
+        sys.exit(1)
+    elif motThreshold < 55.0 and not doMarkov1:
+        print "motThreshold should be between 55 and 99 for a regular PSFM"
+        sys.exit(1)
+
+    if hasMotifExtension:
+        print "will use cistematic.core.motif C-extension to speed up motif search"
+
+    if useRank and usePeak:
+        print "will return region ranking based on peak height ranking"
+    elif useRank:
+        print "ignoring '--rank': can only use ranking when using a region file with peak position and height"
+        useRank = False
+
+    mot = Motif("", motifFile=motfilename)
+    motLen = len(mot)
+    bestScore = mot.bestConsensusScore()
+
+    hg = Genome(genome)
+
+    # minHits=-1 will force regions to be used regardless
+    # maxDist= 0 prevents merging of non-overlapping regions
+    if noMerge:
+        regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, doMerge=False, keepPeak=usePeak)
+    else:
+        regions = getMergedRegions(infilename, maxDist=0, minHits=-1, verbose=True, keepPeak=usePeak)
+
+    doRDS = False
+    if chipfilename:
+        doRDS = True
+
+    if doRDS:
+        hitRDS = readDataset(chipfilename, verbose = True, cache=doCache)
+
+    outfile = open(outfilename, "w")
+
+    regionList = []
+
+    for chrom in regions:
+        if "rand" in chrom or "M" in chrom:
+            continue
+
+        if usePeak:
+            for (start, stop, length, peakPos, peakHeight) in regions[chrom]:
+                regionList.append((peakHeight, chrom, start, length, peakPos))
+        else:
+            for (start, stop, length) in regions[chrom]:
+                regionList.append((chrom, start, length))
+
+    if usePeak:
+        regionList.sort()
+        regionList.reverse()
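+        # process regions from highest to lowest peak so that --rank numbering follows peak height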
+
+    notFoundIndex = 0
+    currentChrom = ""
+    count = 0
+    for regionTuple in regionList:
+        if usePeak:
+            (rpeakheight, rchrom, start, length, rpeakpos) = regionTuple
+        else:
+            (rchrom, start, length) = regionTuple
+
+        try:
+            seq = hg.sequence(rchrom, start, length)
+        except:
+            print "couldn't retrieve %s %d %d - skipping" % (rchrom, start, length)
+            continue
+
+        count += 1
+        numHits = -1
+        if usePeak:
+            peakpos = rpeakpos
+            if useRank:
+                numHits = count
+            else:
+                numHits = rpeakheight
+        elif doRDS:
+            if rchrom != currentChrom:
+                fullchrom = "chr" + rchrom
+                hitDict = hitRDS.getReadsDict(chrom=fullchrom)
+                currentChrom = rchrom
+
+            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length)
+            if len(topPos) == 0:
+                print "topPos error"
+
+            peakpos = topPos[0]
+
+        found = []
+        if doMarkov1:
+            matches = mot.locateMarkov1(seq, motThreshold)
+        else:
+            matches = mot.locateMotif(seq, motThreshold)
+
+        for (pos, sense) in matches:
+            alreadyFound = False
+            for (fpos, fdist) in found:
+                if pos + start == fpos:
+                    alreadyFound = True
+
+            if not alreadyFound:
+                if usePeak:
+                    found.append((start + pos, start + pos  + motLen/2 - peakpos))
+                elif doRDS:
+                    found.append((start + pos, pos  + motLen/2 - peakpos))
+                else:
+                    found.append((start + pos, -1))
+
+        foundValue = False
+        bestList = []
+        for (foundpos, peakdist) in found:
+            seq = hg.sequence(rchrom, foundpos, motLen)
+            foundValue = True
+            (front, back) = mot.scoreMotif(seq)
+            sense = "+"
+            if front >= back:
+                score = int(100 * front / bestScore)
+            else:
+                score = int(100 * back / bestScore)
+                sense = "-"
+                seq = complement(seq)
+
+            if printSeq:
+                print seq
+
+            outline = "chr%s:%d-%d\t%d\t%d\t%d\tchr%s:%d-%d\t%s\n" % (rchrom, foundpos, foundpos + motLen - 1, score, numHits, peakdist, rchrom, start, start + length, sense)
+            if bestOnly:
+                bestList.append((abs(peakdist), outline))
+            else:
+                outfile.write(outline)
+
+        if bestOnly and foundValue:
+            bestList.sort()
+            outfile.write(bestList[0][1])
+
+        if not foundValue:
+            if printSeq:
+                print "could not find a %s site for %s:%d-%d" % (mot.tagID, rchrom, start, start+ length)
+
+            notFoundIndex += 1
+        if (count % 10000) == 0 and not printSeq:
+            print count
+
+    outfile.close()
+    print "did not find motif in %d regions" % notFoundIndex
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/getfasta.py b/getfasta.py
new file mode 100755 (executable)
index 0000000..0b2faf9
--- /dev/null
@@ -0,0 +1,183 @@
+#
+#  getfasta.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, findPeak
+from cistematic.genomes import Genome
+
+print "%s: version 3.4" % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s genome regionfile outfilename [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--seqradius", type="int", dest="seqsize")
+    parser.add_option("--minreads", type="int", dest="minHitThresh")
+    parser.add_option("--returnTop", type="int", dest="topRegions")
+    parser.add_option("--maxsize", type="int", dest="maxsize")
+    parser.add_option("--usepeak", action="store_true", dest="usePeaks")
+    parser.add_option("--dataset", dest="hitfile")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--compact", action="store_true", dest="doCompact")
+    parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
+                        usePeaks=False, hitfile=None, doCache=False, doCompact=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    regionfile = args[1]
+    outfilename = args[2]
+
+    getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
+             options.topRegions, options.maxsize, options.usePeaks, options.hitfile,
+             options.doCache, options.doCompact)
+
+
+def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
+             maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
+    doDataset = False
+    if hitfile is not None:
+        if usePeaks:
+            print "ignoring dataset and relying on peak data"
+        else:
+            doDataset = True
+
+    if doCompact:
+        mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
+                                      chromField=0, compact=True, keepPeak=usePeaks,
+                                      returnTop=topRegions)
+    else:
+        mergedRegions = getMergedRegions(regionfile, minHits=minHitThresh, verbose=True,
+                                      keepPeak=usePeaks, returnTop=topRegions)
+
+    if usePeaks:
+        ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
+    elif doDataset:
+        hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+        ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
+    else:
+        ncregions = getDefaultRegion(mergedRegions, maxsize)
+
+    writeFastaFile(ncregions, genome, outfilename, seqsize)
+
+
+def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
+    hg = Genome(genome)
+    outfile = open(outfilename, "w")
+    for chrom in ncregions:
+        for regionDict in ncregions[chrom]:
+            rstart = regionDict["start"]
+            rlen = regionDict["length"]
+            topPos = regionDict["topPos"]
+            if topPos[0] >= 0:
+                newrstart = rstart + topPos[0] - seqsize
+                newrlen = 2 * seqsize + 1
+            else:
+                newrstart = rstart
+                newrlen = rlen
+
+            seq2 = hg.sequence(chrom, newrstart, newrlen)
+            outfile.write(">chr%s:%d-%d\n%s\n" % (chrom, newrstart, newrstart + newrlen, seq2))
+
+    outfile.close()
+
+
+def getDefaultRegion(regionDict, maxsize):
+    ncregions = {}
+    for chrom in regionDict:
+        ncregions[chrom] = []
+
+    for achrom in regionDict:
+        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
+        for region in regionDict[achrom]:
+            (rstart, rstop, rlen) = region
+
+            if rlen > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+                continue
+
+            resultDict = {"start": rstart,
+                          "length": rlen,
+                          "topPos": [-1]
+            }
+            ncregions[achrom].append(resultDict)
+
+    return ncregions
+
+
+def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
+
+    ncregions = {}
+    for chrom in regionDict:
+        ncregions[chrom] = []
+
+    for achrom in regionDict:
+        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
+        for region in regionDict[achrom]:
+            (rstart, rstop, rlen, peakPos, peakHeight) = region
+
+            if rlen > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+                continue
+
+            topPos = peakPos - rstart
+            if peakHeight > minHitThresh:
+                resultDict = {"start": rstart,
+                              "length": rlen,
+                              "topPos": [topPos]
+                }
+                ncregions[achrom].append(resultDict)
+
+    return ncregions
+
+
+def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
+
+    readlen = hitRDS.getReadSize()
+
+    ncregions = {}
+    for chrom in regionDict:
+        ncregions[chrom] = []
+
+    for achrom in regionDict:
+        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
+        for region in regionDict[achrom]:
+            (rstart, rstop, rlen) = region
+
+            if rlen > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+                continue
+
+            thechrom = "chr%s" % achrom
+            print "."
+            hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
+            print "hitDict length: %d", len(hitDict[thechrom])
+            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
+            if numHits > minHitThresh:
+                resultDict = {"start": rstart,
+                              "length": rlen,
+                              "topPos": topPos
+                }
+                ncregions[achrom].append(resultDict)
+
+    return ncregions
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getgosig.py b/getgosig.py
new file mode 100755 (executable)
index 0000000..b04dca6
--- /dev/null
@@ -0,0 +1,241 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+from cistematic.genomes import Genome
+from math import log
+import os.path
+import sys
+import optparse
+import matplotlib
+matplotlib.use("Agg")
+from pylab import *
+
+print "%prog: version 2.1"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome outimage gofileroot1 title1 cohortsize1 [gofileroot2 title2 cohortsize2 ...] [--fontsize pts] [--length in] [--width in]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--fontsize", type="int", dest="fontSize")
+    parser.add_option("--length", type="int", dest="length")
+    parser.add_option("--width", type="int", dest="width")
+    parser.set_defaults(fontSize=5, length=10, width=7)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 5:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    imagename =  args[1]
+
+    conditionList = args[2:]
+    conditions = len(conditionList) / 3
+    fileroots = []
+    titles = []
+    for index in range(conditions):
+        conditionIndex = index * 3
+        fileroots.append(conditionList[conditionIndex])
+        titles.append((conditionList[conditionIndex + 1], "(%s)" % conditionList[conditionIndex + 2]))
+
+    getgosig(genome, imagename, fileroots, titles, options.fontSize, options.length, options.width)
+
+
+def getgosig(genome, imagename, fileroots=[], titles=[], fontSize=5, length=10, width=7):
+    hg = Genome(genome)
+    allgodesc = hg.allGOterms()
+    godesc = []
+
+    matplotlib.use("Agg")
+
+    doGray = False
+
+    rootdir = "./"
+    htmlname = imagename[:-4] + ".html"
+
+    ceiling = 40.0
+    goterms = []
+    goscores = {}
+    numgenes = {}
+    possiblegenes = {}
+    flatArray = []
+
+    highestPval = 0.0
+    lowestPval = 1.0
+    for sigfile in fileroots:
+        infile = open(rootdir + sigfile + ".gosig", "r")
+        for line in infile:
+            if "depleted" in line:
+                continue
+
+            fields = line.split("\t")
+            if fields[0] not in goterms:
+                goterms.append(fields[0])
+                goscores[fields[0]] = []
+                numgenes[fields[0]] = []
+                possiblegenes[fields[0]] = 0
+
+            if float(fields[3]) > highestPval:
+                highestPval = float(fields[3])
+
+            if float(fields[3]) < lowestPval:
+                lowestPval = float(fields[3])
+
+    print highestPval
+    print lowestPval
+
+    boundaryScore = score = -1 * log(highestPval) /  (2.0 * ceiling) + 0.49
+    print boundaryScore
+
+    cdict = {"red": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.1), (1.0, 1.0, 1.0)),
+             "green": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.1), (1.0, 1.0, 1.0)),
+             "blue": ((0.0, 1.0, 1.0), (boundaryScore, 1.0, 0.75), (1.0, 0.0, 0.0))
+    }
+
+    mymap = matplotlib.colors.LinearSegmentedColormap("my_colormap", cdict, 1024)
+
+    goindex = 0
+    for zfile in fileroots:
+        infile = open(rootdir + zfile + ".gozscore", "r")
+        for line in infile:
+            fields = line.split()
+            goindex += 1
+            if fields[0] not in goterms:
+                continue
+
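+            # map the column-8 value (a p-value, judging by the -log transform) to a color score: clip -log(p)/(2*ceiling) to [-0.5, 0.5], then shift to [0, 1]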
+            score = -1 * log(float(fields[7])) /  (2.0 * ceiling)
+            if score < -0.5:
+                score = -0.5
+
+            if score > 0.5:
+                score = 0.5
+
+            score += 0.5
+            if doGray:
+                score = 1 - score
+
+            goscores[fields[0]].append(score)
+            numgenes[fields[0]].append(fields[1])
+            possiblegenes[fields[0]] = int(fields[4])
+
+    goindex /= len(fileroots)
+
+    gokeys = goscores.keys()
+    gosortarray = []
+    for term in gokeys:
+        gosortarray.append(goscores[term] + [term])
+
+    gosortarray.sort()
+
+    htmlfile = open(htmlname, "w")
+    htmlfile.write('<html><head><title>GO Analysis</title></head><body><table border="1">')
+    htmlfile.write("<tr><th>Description</th><th>possible</th>")
+    for entry in titles:
+        htmlfile.write("<th>%s<br>%s</th>" % entry)
+
+    htmlfile.write("</tr>\n")
+    tableLines = []
+
+    for entry in gosortarray:
+        term = entry[-1]
+        outline = "%s:\t" % term
+        for entry in goscores[term]:
+            outline += str(round(entry, 4)) + "\t"
+
+        print outline
+        htmlLine = "<tr><th>%s</th><th>%d</th>" % (allgodesc[term], possiblegenes[term])
+        index = 0
+        for fileroot in fileroots:
+            gofile = fileroot + "." + term[3:]
+            ngene = numgenes[term][index]
+            if os.path.exists(gofile):
+                htmlLine += '<td><a href="%s">%s</a></td>' % (gofile, ngene)
+            else:
+                htmlLine += "<td>%s</td>" % (ngene)
+
+            index += 1
+
+        tableLines.append(htmlLine + "</tr>\n")
+        flatArray.append(goscores[term])
+        godesc.append(allgodesc[term])
+
+    tableLines.reverse()
+    for line in tableLines:
+        htmlfile.write(line)
+
+    htmlfile.write("<tr><th>Cohort Size:</th>")
+    htmlfile.write("</tr>\n")
+    htmlfile.write("</table></body></html>")
+
+    figure(figsize=(length, width))
+    myaxe = axes([0.3, 0.1, 0.55, 0.75])
+
+    Z = array(flatArray)
+    print Z.shape
+    if doGray:
+        c = pcolor(Z, cmap=cm.gray, vmin=0.0, vmax=1.0)
+    else:
+        c = pcolor(Z, cmap=mymap, vmin=0.0, vmax=1.0)
+
+    c.set_linewidth(0.1)
+    clim(0.0, 1.0)
+
+    ind = arange(len(fileroots))
+    width = 0.5
+
+    coordy = 0.1
+    deltaX = 1.0    
+    deltaY = 1.0
+
+    pcolorAxes = c.get_axes()
+    for entry in gosortarray:
+        term = entry[-1]
+        coordx = 0.4
+        for genenum in numgenes[term]:
+            if len(genenum) == 1:
+                genenum = "    " + genenum
+            elif len(genenum) == 2:
+                genenum = "  " + genenum
+
+            pcolorAxes.text(coordx, coordy, genenum, fontsize=fontSize)
+            coordx += deltaX
+
+        coordy += deltaY
+
+    coordx = 0
+    for (line1,line2) in titles:
+        pcolorAxes.text(coordx + 0.1, coordy + 3 * deltaY + 0.5, line1, fontsize=int(fontSize*1.5))
+        pcolorAxes.text(coordx + 0.1, coordy + deltaY, line2, fontsize=int(fontSize*1.5))
+        coordx += deltaX 
+
+    setp(gca(), "xticks", [])
+    setp(gca(), "xticklabels", [])
+    setp(gca(), "yticks", arange(len(godesc)))
+    setp(gca(), "yticklabels", godesc)
+    locs, labels = yticks()
+    setp(labels, fontsize=fontSize)
+    setp(labels, verticalalignment="bottom")
+    setp(gca(), "ylim", [0, len(godesc)])
+
+    figtext(0.3,0.02, str(goindex - len(gokeys)) + " additional GO Terms below threshold of significance", fontsize=fontSize*2)
+
+    d = colorbar(orientation="vertical", drawedges=False)
+    for t in d.ax.get_yticklabels():
+        t.set_fontsize(0)
+
+    locs, labels = yticks()
+    setp(labels, fontsize=5)
+    pcolorAxes.text(conditions + 1,len(godesc), str(lowestPval), fontsize=fontSize*2)
+    pcolorAxes.text(conditions + 1,boundaryScore * len(godesc), str(highestPval), fontsize=fontSize*2)
+
+    savefig(imagename, dpi=250)
+    show()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getmers.py b/getmers.py
new file mode 100755 (executable)
index 0000000..c7c35ff
--- /dev/null
@@ -0,0 +1,52 @@
+#
+#  getmers.py
+#  ENRAGE
+#
+
+import sys
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+from cistematic.genomes import Genome
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print '%s: version 1.1' % argv[0]
+
+    if len(argv) < 5:
+        print 'usage: python %s genome merlen chrAny:start-stop outfile' % argv[0]
+        sys.exit(1)
+
+    genome = argv[1]
+    merlen = int(argv[2])
+    location = argv[3]
+    outfilename = argv[4]
+
+    getmers(genome, merlen, location, outfilename)
+
+
+def getmers(genome, merlen, location, outfilename):
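+    # write every overlapping merlen-mer in the requested region, one uppercase sequence per line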
+    (chrom, pos) = location.split(':')
+    chrom = chrom[3:]
+    (start, stop) = pos.split('-')
+    start = int(start)
+    regionlength = int(stop) - start + 1
+
+    hg = Genome(genome)
+
+    seq = hg.sequence(chrom, start, regionlength)
+
+    outfile = open(outfilename,'w')
+    print 'writing %d %d-mers' % (regionlength - merlen, merlen)
+    for index in range(regionlength - merlen):
+        outfile.write(seq[index:index + merlen].upper() + '\n')
+
+    outfile.close()
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/getsplicefa.py b/getsplicefa.py
new file mode 100755 (executable)
index 0000000..db8e204
--- /dev/null
@@ -0,0 +1,158 @@
+import sys
+import optparse
+import string
+try:
+    import psyco
+    psyco.full()
+except:
+    print "psyco not running"
+from cistematic.core import complement
+from cistematic.genomes import Genome
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    verstring = "%prog: version 1.0"
+    print verstring
+    delimiter = "|"
+
+    usage = "usage: python %prog genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+            \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
+            \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
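+    # for example (hypothetical read length): with 32 bp reads and the default spacer of 2, maxBorder would be 32 - (2 * 2) = 28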
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--verbose", action="store_true", dest="doVerbose",
+                      help="show verbose messages [default: False]")
+    parser.add_option("--spacer", type="int", dest="spacer",
+                      help="number of spacer NTs to use [default: 2")
+    parser.set_defaults(doVerbose=False, spacer=2)
+    (options, args) = parser.parse_args(argv[1:])
+
+    try:
+        genome = args[0]
+        datafilename = args[1]
+        outfilename = args[2]
+        maxBorder = int(args[3])
+    except IndexError:
+        print usage
+        sys.exit(1)
+
+    getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)
+
+
+def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
+    spacerseq = "N" * spacer
+
+    datafile = open(datafilename)
+    hg = Genome(genome)
+
+    spliceCountDict = {}
+    exonStartDict = {}
+    exonStopDict = {}
+    exonLengthDict = {}
+    nameToChromDict = {}
+    nameToComplementDict = {}
+    alreadySeen = {}
+    counter = 0
+
+    for line in datafile:
+        fields = line.split()
+        name = fields[0]
+        spliceCount = int(fields[7]) - 1
+        if spliceCount < 1:
+            continue
+
+        counter += spliceCount
+        spliceCountDict[name] = spliceCount
+        chrom = fields[1][3:]
+        if chrom == "chrM":
+            continue
+
+        nameToChromDict[name] = chrom
+        if chrom not in alreadySeen:
+            alreadySeen[chrom] = []
+
+        nameToComplementDict[name] = fields[2]
+        exonStarts = []
+        exonStops = []
+        for val in fields[8].split(",")[:-1]:
+            exonStarts.append(int(val))
+
+        for val in fields[9].split(",")[:-1]:
+            exonStops.append(int(val))
+
+        exonStartDict[name] = exonStarts
+        exonStopDict[name] = exonStops
+        exonLengths = []
+        for index in range(spliceCount + 1):
+            exonLengths.append(exonStops[index] - exonStarts[index])
+
+        exonLengthDict[name] = exonLengths
+
+    print len(spliceCountDict)
+    print counter
+
+    missedCount = 0
+    depressedCount = 0
+    splicefileindex = 1
+    spliceCounter = 0
+    outfile = open(outfilename, "w")
+    for name in nameToChromDict:
+        try:
+            spliceCount = spliceCountDict[name]
+        except:
+            continue
+
+        exonStarts = exonStartDict[name]
+        exonStops = exonStopDict[name]
+        exonLengths = exonLengthDict[name]
+        chrom = nameToChromDict[name]
+        for index in range(spliceCount):
+            if (exonStops[index], exonStarts[index + 1]) in alreadySeen[chrom]:
+                continue
+
+            regionstart = exonStops[index] - maxBorder
+            alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1]))
+            beforeLen = exonLengths[index]
+            afterLen = exonLengths[index + 1]
+            if (beforeLen + afterLen) < maxBorder + spacer:
+                missedCount += 1
+                continue
+
+            if (beforeLen + afterLen) < 2 * maxBorder:
+                depressedCount += 1
+
+            if beforeLen > maxBorder:
+                beforeLen = maxBorder
+
+            if afterLen > maxBorder:
+                afterLen = maxBorder
+
+            try:
+                beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder, maxBorder)
+                afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder)
+            except:
+                if doVerbose:
+                    print "could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1])
+                continue
+
+            sequenceHeader = string.join([name, delimiter, str(index), delimiter, str(regionstart)], "")
+            spliceJunctionSequence = string.join([spacerseq, beforeSplice.upper(), afterSplice.upper(), spacerseq], "")
+            outstring = ">%s\n%s\n" % (sequenceHeader, spliceJunctionSequence)
+            outfile.write(outstring)
+
+        splicefileindex += 1
+        spliceCounter += 1
+        if spliceCounter > 10000:
+            print "%d genes" % splicefileindex
+            spliceCounter = 0
+
+    outfile.close()
+
+    print "%d splices too short to be seen" % missedCount
+    print "%d splices will be under-reported" % depressedCount
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/gfftocis.py b/gfftocis.py
new file mode 100644 (file)
index 0000000..9fec165
--- /dev/null
@@ -0,0 +1,58 @@
+import sys
+
+print "%s: version 1.0" % sys.argv[0]
+if len(sys.argv) < 3:
+    print "usage: python %s infile.gff outfile.cis\n" % sys.argv[0]
+    print "\tTHIS SCRIPT WILL MOST LIKELY NEED TO BE EDITED FOR YOUR GFF FILE\n"
+    sys.exit(1)
+
+index = 1
+# Cistematic just wants a set of exons labeled "CDS", "5UTR", and "3UTR"
+# just put the corresponding type in your GFF file as the key in the key:value pairs 
+# in the ftypeDict below
+ftypeDict = {"CDS": "CDS",
+             "mRNA": "mRNA",
+             "five_prime_utr": "5UTR",
+             "three_prime_utr": "3UTR"
+}
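+# each accepted feature is written as one tab-delimited .cis line:
+# geneID, annotation ID (source + running index), chromosome, start, stop, sense, feature type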
+
+chrom = ""
+idfields = ""
+gene = ""
+sense = ""
+start = 0
+stop = 0
+ftype = ""
+
+infile = open(sys.argv[1])
+outfile = open(sys.argv[2], "w")
+for line in infile:
+    if line[0]=="#":
+        continue
+
+    fields = line.strip().split()
+    try:
+        if fields[2] in ftypeDict:
+            # this part of the code will need to be customized, most likely
+            # how does the annotation define the gene, geneid, and chromosome
+            # for example, for Anopheles Gambiae we have
+            #chrX    VectorBase      mRNA    582     16387   .       -       .       ID=vectorbase|AGAP000002-RA; stable_id=AGAP000002-RA.1; Parent=vectorbase|AGAP000002;
+            if fields[2] == "mRNA":
+                chrom = fields[0][3:]
+                source = fields[1]
+                idfields = fields[9].split(";")
+                geneid = idfields[0].split("=")[1]
+                sense = fields[6]
+            else:
+                start = int(fields[3])
+                stop = int(fields[4])
+                ftype = ftypeDict[fields[2]]
+                outline = "%s\t%s%d\t%s\t%d\t%d\t%s\t%s\n" % (geneid, source, index, chrom, start, stop, sense, ftype)
+                outfile.write(outline)
+    except:
+        print "could not parse line: %s" % line.strip()
+        sys.exit(1)
+
+    index += 1
+
+infile.close()
+outfile.close()
diff --git a/gointersects.py b/gointersects.py
new file mode 100755 (executable)
index 0000000..0f74727
--- /dev/null
@@ -0,0 +1,46 @@
+#
+#  gointersects.py
+#  ENRAGE
+#
+
+import sys
+
+print "%s: version 1.0" % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 4:
+        print "usage: python %s gogidfile gidfile outfile" % argv[0]
+        sys.exit(1)
+
+    gogidfilename = argv[1]
+    gidfilename = argv[2]
+    outfilename = argv[3]
+
+    gointersects(gogidfilename, gidfilename, outfilename)
+
+
+def gointersects(gogidfilename, gidfilename, outfilename):
+    gidList = []
+    gogidfile = open(gogidfilename)
+    for line in gogidfile:
+        fields = line.split()
+        gidList.append(fields[0])
+
+    gogidfile.close()
+
+    gidfile = open(gidfilename)
+    outfile = open(outfilename, "w")
+    for line in gidfile:
+        fields = line.split()
+        if fields[0] in gidList:
+            outfile.write(line)
+
+    gidfile.close()
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/hepg2.rds b/hepg2.rds
new file mode 100644 (file)
index 0000000..8674f76
Binary files /dev/null and b/hepg2.rds differ
diff --git a/intersects.py b/intersects.py
new file mode 100755 (executable)
index 0000000..67e7d35
--- /dev/null
@@ -0,0 +1,149 @@
+#
+#  intersects.py
+#  ENRAGE
+#
+
+import sys, optparse
+
+print 'version 2.0'
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog infile1 infile2 outfile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("-d", dest="delimiter")
+    parser.add_option("--file3", dest="infile3")
+    parser.add_option("-1", type="int", dest="matchfield1")
+    parser.add_option("-2", type="int", dest="matchfield2")
+    parser.add_option("-3", type="int", dest="matchfield3")
+    parser.add_option("-reject1", dest="reject1file")
+    parser.add_option("-trackGID", action="store_true", dest="trackGID")
+    parser.set_defaults(delimiter="\t", infile3=None, matchField1=0, matchField2=0,
+                        matchField3=0, rejectFileName="", trackGID=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    infile1 = args[0]
+    infile2 = args[1]
+    outfile = args[2]
+
+    intersects(infile1, infile2, outfile, options.delimiter, options.infile3,
+               options.matchField1, options.matchField2, options.matchField3,
+               options.rejectFileName, options.trackGID)
+
+
+def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None,
+               matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False):
+
+    if rejectFileName:
+        doReject1 = True
+        reject1file = open(rejectFileName, "w")
+    else:
+        doReject1 = False
+
+    if infile3Name is not None:
+        doFile3 = True
+    else:
+        doFile3 = False
+
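+    # matchedList holds candidates present in every input file; the pairwise 12/13/23 lists are only reported as counts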
+    matchedList = []
+    matchedList12 = []
+    matchedList13 = []
+    matchedList23 = []
+    gidDict = {}
+
+    if trackGID:
+        gidKeys = gidDict.keys()
+        list1, fileGIDDict = getCandidatesAndGIDFromFile(infile1Name, delimiter, matchField1, gidKeys)
+        for entry in fileGIDDict.keys():
+            gidDict[entry] = fileGIDDict[entry]
+
+        gidKeys = gidDict.keys()
+        list2, fileGIDDict = getCandidatesAndGIDFromFile(infile2Name, delimiter, matchField2, gidKeys)
+        for entry in fileGIDDict.keys():
+            gidDict[entry] = fileGIDDict[entry]
+            
+        if doFile3:
+            gidKeys = gidDict.keys()
+            list3, fileGIDDict = getCandidatesAndGIDFromFile(infile3Name, delimiter, matchField3, gidKeys)
+            for entry in fileGIDDict.keys():
+                gidDict[entry] = fileGIDDict[entry]
+    else:
+        list1 = getCandidateListFromFile(infile1Name, delimiter, matchField1)
+        list2 = getCandidateListFromFile(infile2Name, delimiter, matchField2)
+        if doFile3:
+            list3 = getCandidateListFromFile(infile3Name, delimiter, matchField3)
+
+    for candidate in list1:
+        if doFile3 and candidate in list2 and candidate in list3:
+            matchedList.append(candidate)
+        elif doFile3 and candidate in list3:
+            matchedList13.append(candidate)
+        elif doFile3 and candidate in list2:
+            matchedList12.append(candidate)
+        elif not doFile3 and candidate in list2:
+            matchedList.append(candidate)
+        elif doReject1:
+            if trackGID:
+                reject1file.write("%s%s%s\n" % (candidate, delimiter, gidDict[candidate]))
+            else:
+                reject1file.write("%s\n" % candidate)
+
+    if doFile3:
+        for candidate in list2:
+            if candidate not in list1 and candidate in list3:
+                matchedList23.append(candidate)
+
+    if doFile3:
+        print len(list1), len(list2), len(list3)
+        print len(matchedList12), len(matchedList13), len(matchedList23)
+    else:
+        print len(list1), len(list2)
+    print len(matchedList)
+
+    outfile = open(outfileName, "w")
+    for match in matchedList:
+        if trackGID:
+            outfile.write("%s%s%s\n" % (match, delimiter, gidDict[match]))
+        else:
+            outfile.write("%s\n" % match)
+
+    outfile.close()
+
+
+def getCandidatesFromFile(filename, delimiter, matchField, trackGID=False, gidList=[]):
+    infile = open(filename)
+    candidateList = []
+    gidDict = {}
+
+    for line in infile:
+        if line[0] == "#":
+            continue
+
+        fields = line.strip().split(delimiter)
+        candidate = fields[matchField]
+        if candidate not in candidateList:
+            candidateList.append(candidate)
+
+        if trackGID and candidate not in gidList:
+            gidDict[candidate] = fields[matchField + 1]
+
+    infile.close()
+    return candidateList, gidDict
+
+
+def getCandidatesAndGIDFromFile(filename, delimiter, matchField, gidList=[]):
+    return getCandidatesFromFile(filename, delimiter, matchField, trackGID=True, gidList=gidList)
+
+
+def getCandidateListFromFile(filename, delimiter, matchField):
+    candidateList, gidDict = getCandidatesFromFile(filename, delimiter, matchField)
+    return candidateList
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/listGeneFeatures.py b/listGeneFeatures.py
new file mode 100755 (executable)
index 0000000..607100d
--- /dev/null
@@ -0,0 +1,59 @@
+#
+#  listGeneFeatures.py
+#  ENRAGE
+#
+
+import sys
+from cistematic.genomes import Genome
+from commoncode import getMergedRegions, getFeaturesByChromDict
+
+print "%s: version 1.1" % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 4:
+        print "usage: python %s genome [acceptFile] gid outfile\n" % argv[0]
+        sys.exit(1)
+
+    genome = argv[1]
+
+    if len(argv) == 4:
+        acceptFileName = ""
+        gid = argv[2]
+        outfile = argv[3]
+    else:
+        acceptFileName = argv[2]
+        gid = argv[3]
+        outfile = argv[4]
+
+    listGeneFeatures(genome, gid, outfile, acceptFileName)
+
+
+def listGeneFeatures(genome, gid, outFileName, acceptFileName=""):
+    hg = Genome(genome)
+    outfile = open(outFileName, "w")
+    if acceptFileName:
+        additionalDict = getMergedRegions(acceptFileName, maxDist = 0, keepLabel = True, verbose = True)
+    else:
+        additionalDict = {}
+
+    featuresDict = getFeaturesByChromDict(hg, additionalDict, restrictList=[gid])
+    outfile.write('track name="LOC%s"\n' % gid)
+
+    senseDict = {"F": "+",
+                 "R": "-",
+                 "+": "+",
+                 "-": "-"
+    }
+
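+    # each feature is written as a BED6-style line (chrom, start, stop, type, score, strand),
+    # e.g. (hypothetical coordinates):  chr1  3204562  3207049  CDS  0  -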
+    for chrom in featuresDict:
+        for (start, stop, fgid, sense, ftype) in featuresDict[chrom]:
+            outfile.write("chr%s\t%d\t%d\t%s\t0\t%s\n" % (chrom, start, stop, ftype, senseDict[sense]))
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makeGraphs.py b/makeGraphs.py
new file mode 100644 (file)
index 0000000..3965e5c
--- /dev/null
@@ -0,0 +1,127 @@
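+#
+#  makeGraphs.py
+#  ENRAGE
+#
+#  Reads a tab-delimited edge list (node1, node2, count), groups the nodes into
+#  connected components, writes one graphviz .gv file per component plus a
+#  .stats summary (nodes, edges, total weight), and renders each component to
+#  PNG with the graphviz "dot" command.  The optional -shorten flag keeps only
+#  the second "_"-delimited field of each node name.
+#
+#  usage: python makeGraphs.py edgefile outprefix [-shorten]
+#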
+import sys, os
+
+
+def getEdges(nodeList, shorten=False):
+    edgeDict = {}
+
+    for nodeEntry in nodeList:
+        try:
+            (node1, node2, count) = nodeEntry.strip().split("\t")
+        except ValueError:
+            continue
+
+        if shorten:
+            try:
+                node1 = node1.split("_")[1]
+            except IndexError:
+                pass
+
+            try:
+                node2 = node2.split("_")[1]
+            except IndexError:
+                pass
+
+        node1Detail = (node1, int(count))
+        node2Detail = (node2, int(count))
+        try:
+            if node2Detail not in edgeDict[node1]:
+                edgeDict[node1].append(node2Detail)
+        except KeyError:
+            edgeDict[node1] = [node2Detail]
+
+        try:
+            if node1Detail not in edgeDict[node2]:
+                edgeDict[node2].append(node1Detail)
+        except KeyError:
+            edgeDict[node2] = [node1Detail]
+
+    return edgeDict
+
+
+def getEdgesFromFile(inFileName, shorten=False):
+
+    infile = open(inFileName)
+    edgeDict = getEdges(infile, shorten)
+    infile.close()
+
+    return edgeDict
+
+
+def getOutputLine(currentNode, node, nodeCount):
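+    # e.g. getOutputLine("geneA", "geneB", 5) returns
+    #   "geneA" -- "geneB" [ label = "5", penwidth=5, color="red", constraint=false] ;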
+    if nodeCount > 2:
+        outputLine = '\t"%s" -- "%s" [ label = "%d", penwidth=%d, color="red", constraint=false] ; \n' % (currentNode, node, nodeCount, nodeCount)
+    else:
+        outputLine = '\t"%s" -- "%s" [ label = "%d", color="red", constraint=false] ; \n' % (currentNode, node, nodeCount)
+
+    return outputLine
+
+
+infilename = sys.argv[1]
+outprefix = sys.argv[2]
+
+shorten = False
+if "-shorten" in sys.argv:
+    shorten = True
+
+edgeDict = getEdgesFromFile(infilename, shorten)
+
+nodeList = edgeDict.keys()
+seenNodeDict = {}
+seenEdgeDict = {}
+currentNodeList = []
+currentEdgeList = []
+treeList = []
+localCount = []
+
+outstat = open("%s.stats" % outprefix,"w")
+outstat.write("#gID\tnodes\tedges\tweight\n")
+
+def visitNodes(currentNode):
+    if currentNode in seenNodeDict:
+        return
+
+    seenNodeDict[currentNode] = []
+    for (node, nodeCount) in edgeDict[currentNode]:
+        nodePair = [node, currentNode]
+        nodePair.sort()
+        if str(nodePair) not in seenEdgeDict:
+            if node not in currentNodeList:
+                currentNodeList.append(node)
+
+            outputLine = getOutputLine(currentNode, node, nodeCount)
+            currentEdgeList.append(outputLine)
+            seenEdgeDict[str(nodePair)] = 0
+            localCount[0] += nodeCount
+            try:
+                visitNodes(node)
+            except:
+                # deep components can exceed Python's recursion limit;
+                # give up on the rest of this component rather than crash
+                pass
+
+print "getting trees"
+for node in nodeList:
+    if node not in seenNodeDict:
+        currentNodeList = [node]
+        currentEdgeList = []
+        localCount = [0]
+        outfile = open("%s.%s.gv" % (outprefix, node), "w")
+        treeList.append(node)
+        outfile.write("graph g%s {\n" % node)
+        visitNodes(node)
+        currentNodeList.sort()
+        outfile.write('subgraph G0 {\n\t"%s" ' % currentNodeList[0])
+        for anode in currentNodeList[1:]:
+            outfile.write('-- "%s" ' % anode)
+
+        outfile.write(" [ weight = 100 ] ;\n\tordering = out ;\n}\n")
+        for line in currentEdgeList:
+            outfile.write(line)
+
+        outfile.write("}\n")
+        outfile.close()
+        outstat.write("%s\t%d\t%d\t%d\n" % (node, len(currentNodeList), len(currentEdgeList), localCount[0]))
+
+print "generating pngs"
+for node in treeList:
+    output = os.popen("dot -Tpng %s.%s.gv > %s.%s.png" % (outprefix, node, outprefix, node))
+
+outstat.close()
\ No newline at end of file
diff --git a/makeSNPtrack.py b/makeSNPtrack.py
new file mode 100755 (executable)
index 0000000..23d8ac9
--- /dev/null
@@ -0,0 +1,99 @@
+#
+#  makeSNPtrack.py
+#  ENRAGE
+#
+# This script maps all the qualified SNC sites on to the genome browser 
+# Output format: bed
+# Written by: Wendy Lee
+# Written on: August 18th, 2008
+# Last Modified: December 14th, 2008 by Ali Mortazavi
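+#
+# Example invocation (hypothetical file names):
+#   python makeSNPtrack.py sample.snps mySNPtrack snpTrack.bed
+# Each qualified SNC becomes a one-base BED entry colored by base change
+# (A-G red, T-C blue, everything else purple).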
+
+import sys
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "%s: version 1.2" % argv[0]
+
+    if len(argv) < 4:
+        print "usage: python %s snpfile trackname trackoutfile" % argv[0]
+        sys.exit(1)
+
+    snpfile = argv[1]
+    track = argv[2]
+    outfile = argv[3]
+
+    makeSNPtrack(snpfile, track, outfile)
+
+
+def makeSNPtrack(snpfilename, track, outfilename):
+
+    snpfile = open(snpfilename, "r")
+    writeSNPsBedfile(snpfile, track, outfilename)
+    snpfile.close()
+
+
+def writeSNPsBedfile(snpPropertiesList, track, outfilename):
+
+    outfile = open(outfilename, "w")
+    header = getHeader(track)
+    outfile.write(header)
+
+    for line in snpPropertiesList:
+        if doNotProcessLine(line):
+            continue
+
+        fields = line.strip().split()
+        outline = getBedOutputLine(fields)
+        outfile.write(outline)
+
+    outfile.close()
+
+
+def getHeader(track):
+    header = "track name=%s description=%s visibility=2 itemRgb=\"On\"\n" % (track, track)
+    return header
+
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def getBedOutputLine(snpPropertiesList):
+    chromosome = snpPropertiesList[2]
+    readStart = int(snpPropertiesList[3]) - 1
+    readStop = readStart + 1
+    readName = snpPropertiesList[7]
+    color = getSNPColor(readName)
+    score = "0"
+    sense = "+"
+    outline = "%s\t%d\t%d\t%s\t%s\t%s\t-\t-\t\t%s\n" % (chromosome, readStart, readStop, readName, score, sense, color)
+
+    return outline
+
+
+def getSNPColor(readName):
+    baseColor = {"A": "200, 0, 255",
+                 "T": "200, 0, 255",
+                 "C": "200, 0, 255",
+                 "G": "200, 0, 255"
+    }
+
+    specialColors = {"A-G": "255, 0, 0",
+                     "T-C": "0, 0, 255"
+    }
+
+    if readName in specialColors.keys():
+        color = specialColors[readName]
+    else:
+        try:
+            color = baseColor[readName[-1]]
+        except (IndexError, KeyError):
+            color = "200, 0, 255"
+
+    return color
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makebedfromrds.py b/makebedfromrds.py
new file mode 100755 (executable)
index 0000000..924bc7e
--- /dev/null
@@ -0,0 +1,369 @@
+#
+#  makebedfromrds.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 7/19/08.
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset
+
+PLUS_COLOR = "0,0,255"
+MINUS_COLOR = "255,0,0"
+MULTI_PLUS_COLOR = "64,64,64"
+MULTI_MINUS_COLOR = "192,192,192"
+SPLICE_COLOR = "255,0,0"
+UNIQUE_COLOR = "0,0,0"
+MULTI_COLOR = "128,128,128"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    verstring = "%prog: version 3.1"
+    print verstring
+
+    doPairs = False
+    
+    usage = "usage:  %prog trackLabel rdsFile bamFile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", type="int", dest="pairDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    parser.add_option("--strand", dest="strand")
+    parser.add_option("-r", "--region", dest="region", type="string",
+                      help="samtools region string")
+    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, doSpliceColor=False,
+                        pairDist=None, withFlag="", useFlagLike=False, enforceChr=False,
+                        senseStrand="", allChrom=True, doCache=False, cachePages=100000,
+                        chromList=[])
+    (options, args) = parser.parse_args(argv[1:])
+
+    try:
+        trackType = args[0]
+    except IndexError:
+        print "no track specified - see --help for usage"
+        sys.exit(1)
+
+    try:
+        rdsfile = args[1]
+    except IndexError:
+        print "no RDS file specified - see --help for usage"
+        sys.exit(1)
+
+    try:
+        outfilename = args[2]
+    except IndexError:
+        print "no output file specified - see --help for usage"
+        sys.exit(1)
+
+    if options.pairDist is not None:
+        doPairs = True
+
+    if options.chromList:
+        options.allChrom = False
+
+    outputBedFromRds(trackType, rdsfile, outfilename, options.withUniqs, options.withMulti,
+                     options.doSplices, options.doSpliceColor, doPairs, options.pairDist,
+                     options.withFlag, options.useFlagLike, options.enforceChr, options.senseStrand,
+                     options.allChrom, options.doCache, options.cachePages, options.chromList)
+
+
+def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=True,
+                     doSplices=False, doSpliceColor=False, doPairs=False, pairDist=1000000,
+                     withFlag="", useFlagLike=False, enforceChr=False, senseStrand="",
+                     allChrom=True, doCache=False, cachePages=100000, chromList=[]):
+
+    if not withUniqs and not withMulti and not doSplices:
+        print "must be outputing at least one of uniqs, multi, or -splices - exiting"
+        sys.exit(1)
+
+    print "\nsample:"
+    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+
+    #check that this is better than the dataset's default cache size
+    if cachePages > RDS.getDefaultCacheSize():
+        RDS.setDBcache(cachePages)
+
+    readlength = RDS.getReadSize()
+    minDist = -1 * readlength
+
+    if allChrom:
+        if withUniqs:
+            chromList = RDS.getChromosomes()
+        elif withMulti:
+            chromList = RDS.getChromosomes(table="multi")
+        else:
+            chromList = RDS.getChromosomes(table="splices")
+
+        chromList.sort()
+
+    outfile = open(outfilename, "w")
+    outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % (trackType))
+
+    if withUniqs or withMulti:
+        for achrom in chromList:
+            index = 0
+            if doNotOutputChromosome(achrom, enforceChr):
+                continue
+
+            print "chromosome %s" % (achrom)
+
+            if doPairs:
+                hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag,
+                                           withWeight=True, withPairID=True, doUniqs=withUniqs,
+                                           doMulti=withMulti, readIDDict=True,
+                                           flagLike=useFlagLike, strand=senseStrand)
+
+                readIDList = hitDict.keys()
+                if doSplices:
+                    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag,
+                                                    withPairID=True, readIDDict=True,
+                                                    flagLike=useFlagLike, strand=senseStrand)
+
+                    spliceIDList = spliceDict.keys()
+                    combDict = {}
+                    for readID in readIDList:
+                        combDict[readID] = 1
+
+                    for readID in spliceIDList:
+                        combDict[readID] = 1
+
+                    combinedIDList = combDict.keys()
+                else:
+                    combinedIDList = readIDList
+
+                for readID in combinedIDList:
+                    localList = []
+                    try:
+                        localList = hitDict[readID]
+                    except:
+                        pass
+
+                    if doSplices:
+                        try:
+                            localList += spliceDict[readID]
+                        except:
+                            pass
+
+                    localList.sort()
+                    listLen = len(localList) - 1
+                    localIndex = 0
+                    while localIndex <= listLen:
+                        try:
+                            (leftpos, leftsense, leftweight, lPairID) = localList[localIndex]
+                            leftstop = leftpos + readlength - 1
+                            lpart = 1
+                            startList = [leftpos]
+                            stopList = [leftstop]
+                        except:
+                            (leftpos, LLstop, LRstart, leftstop, leftsense, lPairID) = localList[localIndex]
+                            leftweight = 1.0
+                            lpart = 2
+                            startList = [leftpos, LRstart]
+                            stopList = [LLstop, leftstop]
+
+                        if localIndex < listLen:
+                            try:
+                                (rightpos, rightsense, rightweight, rPairID) = localList[localIndex + 1]
+                                rightstop = rightpos + readlength - 1
+                                rpart = 1
+                                rstartList = [rightpos]
+                                rstopList = [rightstop]
+                            except:
+                                (rightpos, RLstop, RRstart, rightstop, rightsense, rPairID) = localList[localIndex + 1]
+                                rightweight = 1.0
+                                rpart = 2
+                                rstartList = [rightpos, RRstart]
+                                rstopList = [RLstop, rightstop]
+                        else:
+                            # no right-hand read remains; use neutral placeholder values
+                            rightsense = "+"
+                            rightpos = 0
+                            rightweight = 1.0
+                            rpart = 0
+                            rstartList = []
+                            rstopList = []
+
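+                        # a plus-strand mate followed by a minus-strand mate (different
+                        # pair IDs) within pairDist is merged into a single split BED entry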
+                        if leftsense == "+" and rightsense == "-" and minDist < (rightpos - leftstop) < pairDist and lPairID != rPairID:
+                            if doSpliceColor:
+                                plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+                            elif leftweight == 1.0 or rightweight == 1.0:
+                                plusSenseColor = "0,0,0"
+                                minusSenseColor = MINUS_COLOR
+                            else:
+                                plusSenseColor = "128,128,128"
+                                minusSenseColor = MULTI_MINUS_COLOR
+
+                            splitReadWrite(outfile, achrom, lpart + rpart, startList + rstartList, stopList + rstopList, "+", readID, plusSenseColor, minusSenseColor)
+                            localIndex += 2
+                            index += 2
+                        else:
+                            if doSpliceColor:
+                                plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight)
+                                outputSense = "+"
+                            elif leftweight == 1.0:
+                                plusSenseColor = PLUS_COLOR
+                                minusSenseColor = MINUS_COLOR
+                                outputSense = leftsense
+                            else:
+                                plusSenseColor = PLUS_COLOR
+                                minusSenseColor = MINUS_COLOR
+                                outputSense = leftsense
+
+                            splitReadWrite(outfile, achrom, lpart, startList, stopList, outputSense, readID, plusSenseColor, minusSenseColor)
+                            localIndex += 1
+                            index += 1
+            else:
+                hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, withWeight=True, withID=True, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, flagLike=useFlagLike)
+                try:
+                    for (pos, sense, weight, readID) in hitDict[achrom]:
+                        splitReadWrite(outfile, achrom, 1, [pos], [pos + readlength - 1], sense, readID, PLUS_COLOR, MINUS_COLOR)
+                        index += 1
+                except:
+                    pass
+
+                if doSplices:
+                    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
+                    if achrom not in spliceDict:
+                        continue
+                    for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+                        splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
+                        index += 1
+
+    elif doSplices:
+        for achrom in chromList:
+            index = 0
+            if doNotOutputChromosome(achrom, enforceChr):
+                continue
+
+            print "chromosome %s" % (achrom)
+
+            spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
+            if achrom not in spliceDict:
+                continue
+            for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+                splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
+                index += 1
+
+        print index
+
+    outfile.close()
+
+
+def singleReadWrite(chrom, pos, sense, weight, readID, readlength, outfile):
+    start = pos
+    stop = pos + readlength - 1
+    senseColor = getSenseColor(sense, weight)
+    outfile.write("%s %d %d %s %.1f %s 0 0 %s\n" % (chrom, start, stop, readID, weight, sense, senseColor))
+
+
+def getSenseColor(sense, weight):
+    if weight < 1.0:
+        senseColor = getMultiSenseColor(sense)
+    else:
+        senseColor = getSingleSenseColor(sense)
+
+    return senseColor
+
+
+def getMultiSenseColor(sense):
+    if sense == "+":
+        senseColor = MULTI_PLUS_COLOR
+    else:
+        senseColor = MULTI_MINUS_COLOR
+
+    return senseColor
+
+
+def getSingleSenseColor(sense):
+    if sense == "+":
+        senseColor = PLUS_COLOR
+    else:
+        senseColor = MINUS_COLOR
+
+    return senseColor
+
+
+def splitReadWrite(outfile, chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
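+    # writes one BED12 line: chrom, start, end, name, score(1000), strand,
+    # thickStart(0), thickEnd(0), itemRgb, blockCount, blockSizes, blockStarts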
+    readSizes = getReadSizes(numPieces, startList, stopList)
+    readCoords = getReadCoords(numPieces, startList)
+    leftStart = startList[0]
+    rightStop = stopList[-1]
+
+    if rsense == "+":
+        senseCode = plusSense
+    else:
+        senseCode = minusSense
+    
+    outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+    outfile.write(outline)
+
+
+def getReadSizes(numPieces, startList, stopList):
+    readSizes = "%d" % (stopList[0] - startList[0])
+    for index in range(1, numPieces):
+        readSizes += ',%d' % (stopList[index] - startList[index])
+
+    return readSizes
+
+
+def getReadCoords(numPieces, startList):
+    readCoords = "0"
+    for index in range(1, numPieces):
+        readCoords += ",%d" % (startList[index] - startList[0])
+
+    return readCoords
+
+
+def getSpliceColor(lpart, rpart, leftweight, rightweight, hackType=None):
+    if hackType == "1":
+        if (lpart + rpart) > 2:
+            aColor = SPLICE_COLOR
+            bColor = SPLICE_COLOR
+        elif leftweight == 1.0 or rightweight == 1.0:
+            aColor = UNIQUE_COLOR
+            bColor = UNIQUE_COLOR
+        else:
+            aColor = MULTI_COLOR
+            bColor = MULTI_COLOR
+    else:
+        if lpart  > 1:
+            aColor = SPLICE_COLOR
+            bColor = SPLICE_COLOR
+        elif leftweight == 1.0:
+            aColor = UNIQUE_COLOR
+            bColor = UNIQUE_COLOR
+        else:
+            aColor = MULTI_COLOR
+            bColor = MULTI_COLOR
+
+    return aColor, bColor
+
+
+def doNotOutputChromosome(achrom, enforceChr):
+    result = False
+
+    if achrom == "chrM":
+        result = True
+
+    if enforceChr and ("chr" not in achrom):
+        result = True
+
+    return result
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makerdsfrombed.py b/makerdsfrombed.py
new file mode 100755 (executable)
index 0000000..4f38d51
--- /dev/null
@@ -0,0 +1,126 @@
+#
+#  makerdsfrombed.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 6/21/08.
+#
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, string, optparse
+from commoncode import readDataset, writeLog
+
+verstring = "%prog: version 2.1" % sys.argv[0]
+print verstring
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--RNA", action="store_true", dest="rnaDataType")
+    parser.set_defaults(init=True, rnaDataType=False, doIndex=False, cachePages=100000)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        print "\ntreats all imported reads as uniquely mapped\n"
+        sys.exit(1)
+
+    label = args[0]
+    filename = args[1]
+    outdbname = args[2]
+
+    if options.rnaDataType:
+        dataType = "RNA"
+    else:
+        dataType = "DNA"
+
+    propertyList = []
+    for arg in args:
+        if "::" in arg:
+            (pname, pvalue) = arg.strip().split("::")
+            propertyList.append((pname, pvalue))
+
+    makerdsfrombed(label, filename, outdbname, options.init, dataType, options.doIndex, options.cachePages, propertyList)
+
+
+def makerdsfrombed(label, filename, outdbname, init=True, dataType="DNA", doIndex=False, cachePages=100000, propertyList=[]):
+    readsize = 0
+    padsize = 0
+    index = 0
+    insertSize = 100000
+
+    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
+
+    infile = open(filename,"r")
+
+    rds = readDataset(outdbname, init, dataType, verbose=True)
+    if not init:
+        rds.dropIndex()
+
+    #check that our cacheSize is better than the dataset's default cache size
+    defaultCacheSize = rds.getDefaultCacheSize()
+    if cachePages > defaultCacheSize:
+        if init:
+            rds.setDBcache(cachePages, default=True)
+        else:
+            rds.setDBcache(cachePages)
+
+    if len(propertyList) > 0:
+        rds.insertMetadata(propertyList)
+
+    insertList = []
+    for line in infile:
+        if "track" in line:
+            continue
+
+        fields = line.split()
+        if readsize == 0:
+            readsize = abs(int(fields[1]) - int(fields[2]))
+            if init:
+                rds.insertMetadata([("readsize", readsize+1)])
+                rds.insertMetadata([("imported_from_bed", "True")])
+
+        chrom = fields[0]
+        start = int(fields[1])
+        stop = int(fields[2])
+        sense = fields[5]
+        readID = "%s-%s" % (label, str(index))
+        insertList.append((readID, chrom, start, stop, sense, 1.0, "", ""))
+        if index % insertSize == 0:
+            rds.insertUniqs(insertList)
+            insertList = []
+            print ".",
+            sys.stdout.flush()
+
+        index += 1
+
+    if len(insertList) > 0:
+        rds.insertUniqs(insertList)
+
+    countString = "%d unique reads" % index
+    print countString
+
+    writeLog(outdbname + ".log", verstring, countString)
+
+    if doIndex:
+        print "building index...."
+        if cachePages > defaultCacheSize:
+            rds.setDBcache(cachePages)
+            rds.buildIndex(cachePages)
+        else:
+            rds.buildIndex(defaultCacheSize)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makerdsfromblat.py b/makerdsfromblat.py
new file mode 100755 (executable)
index 0000000..f92d5f5
--- /dev/null
@@ -0,0 +1,362 @@
+#
+#  makerdsfromblat.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 12/7/08.
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, string, optparse
+from commoncode import readDataset, writeLog
+
+verstring = "%prog: version 3.9"
+print verstring
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
+    parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
+    parser.add_option("--flag", action="store_true", dest="flagReads")
+    parser.add_option("--strict", type="int", dest="minSpliceLength",
+                      help="min required bp on each side of a splice")
+    parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--RNA", dest="geneDataFileName")
+    parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="")
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    label = args[0]
+    filename = args[1]
+    outdbname = args[2]
+
+    if options.geneDataFileName:
+        dataType = "RNA"
+    else:
+        dataType = "DNA"
+
+    theFlag = ""
+    if options.flagReads:
+        theFlag = "blat"
+
+    propertyList = []
+    for arg in args:
+        if "::" in arg:
+            (pname, pvalue) = arg.strip().split("::")
+            propertyList.append((pname, pvalue))
+
+    makerdsfromblat(label, filename, outdbname, dataType, options.init,
+                   options.doIndex, options.trimReadID, options.minSpliceLength,
+                   options.forceRNA, theFlag, options.spliceOnly, options.verbose,
+                   options.cachePages, options.geneDataFileName, propertyList)
+
+
+def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
+                    doIndex=False,trimReadID=True, minSpliceLength=0,
+                    forceRNA=False, theFlag="", spliceOnly=False,
+                    verbose=False, cachePages=100000, geneDataFileName="",
+                    propertyList=[]):
+
+    delimiter = "|"
+    minIntron = 10
+    maxBorder = 0
+    index = 0
+    insertSize = 100000
+
+    if forceRNA:
+        print "forcing datatype to RNA"
+        dataType = "RNA"
+
+    if dataType == "RNA":
+        genedatafile = open(geneDataFileName)
+
+    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
+
+    geneDict = {}
+    mapDict = {}
+    if dataType == "RNA" and not forceRNA:
+        for line in genedatafile:
+            fields = line.strip().split("\t")
+            blockCount = int(fields[7])
+            if blockCount < 2:
+                continue
+
+            uname = fields[0]
+            chrom = fields[1]
+            sense = fields[2]
+            chromstarts = fields[8][:-1].split(",")
+            chromstops = fields[9][:-1].split(",")
+            exonLengths = []
+            totalLength = 0
+            for index in range(blockCount):
+                chromstarts[index] = int(chromstarts[index])
+                chromstops[index] = int(chromstops[index])
+                exonLengths.append(chromstops[index] - chromstarts[index])
+                totalLength += exonLengths[index]
+
+            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+            mapDict[uname] = []
+
+        genedatafile.close()
+
+    rds = readDataset(outdbname, init, dataType, verbose=True)
+
+    #check that our cacheSize is better than the dataset's default cache size
+    defaultCacheSize = rds.getDefaultCacheSize()
+    if cachePages > defaultCacheSize:
+        if init:
+            rds.setDBcache(cachePages, default=True)
+        else:
+            rds.setDBcache(cachePages)
+
+    if not init and doIndex:
+        try:
+            if rds.hasIndex():
+                rds.dropIndex()
+        except:
+            if verbose:
+                print "couldn't drop Index"
+
+
+    if len(propertyList) > 0:
+        rds.insertMetadata(propertyList)
+
+    # make some assumptions based on first read
+    infile = open(filename, "r")
+    for arg in range(6):
+        line = infile.readline()
+
+    fields = line.split()
+    readsize = int(fields[10])
+    pairedTest = fields[9][-2:]
+    paired = False
+    if pairedTest in ["/1", "/2"]:
+        print "assuming reads are paired"
+        paired = True
+
+    print "read size: %d bp" % readsize
+    if init:
+        rds.insertMetadata([("readsize", readsize)])
+        if paired:
+            rds.insertMetadata([("paired", "True")])
+
+    infile.close()
+    if "blat_mapped" not in rds.getMetadata():
+        rds.insertMetadata([("blat_mapped", "True")])
+
+    minReadScore = readsize - readsize/25 - 1
+    trim = -4
+    if dataType == "RNA":
+        maxBorder = readsize + trim
+
+    infile = open(filename, "r")
+    prevID = ""
+    readList = []
+    uInsertList = []
+    mInsertList = []
+    sInsertList = []
+    index = uIndex = mIndex = sIndex = lIndex = 0
+    bestScore = 0
+    # skip headers
+    for arg in range(5):
+        line = infile.readline()
+
+    for line in infile:
+        lIndex += 1
+        fields = line.strip().split()
+        readID = fields[9]
+        if trimReadID:
+            readID = string.join(readID.split(":")[1:], ":")
+
+        if readID != prevID:
+            newReadList = []
+            if bestScore > minReadScore:
+                for readData in readList:
+                    if readData[1] == bestScore:
+                        newReadList.append(readData)
+
+            if trimReadID:
+                prevID = label + "-" + prevID
+
+            listlen = len(newReadList)
+            if listlen == 1:
+                parts = int(newReadList[0][0])
+                if parts == 1 and not spliceOnly:
+                    (part, score, sense, chrom, start, mismatches) = newReadList[0]
+                    stop = start + readsize
+                    uInsertList.append((prevID, chrom, start, stop, sense, 1.0, theFlag, mismatches))
+                    uIndex += 1
+                elif forceRNA and parts == 2:
+                    (part, score, sense, chrom, startList, lengthList, mismatchList) = newReadList[0]
+                    startL = int(startList[0]) 
+                    stopL = startL + int(lengthList[0])
+                    startR = int(startList[1])
+                    stopR = startR + int(lengthList[1])
+                    if stopL + minIntron < startR:
+                        sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatchList))
+                        sIndex += 1
+                elif parts == 2:
+                    print newReadList
+                    (part, score, sense, chrom, start, mismatches) = newReadList[0]
+                    currentSplice = chrom
+                    (model, spliceID, regionStart) = currentSplice.split(delimiter)
+                    if model not in geneDict:
+                        print fields
+                        continue
+
+                    (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+                    spliceID = int(spliceID)
+                    rstart = int(start) - 2
+                    lefthalf = maxBorder - rstart
+                    if lefthalf < 1 or lefthalf > maxBorder:
+                        continue
+
+                    righthalf = readsize - lefthalf
+                    startL = int(regionStart)  + rstart
+                    stopL = startL + lefthalf
+                    startR = chromstarts[spliceID + 1]
+                    stopR = chromstarts[spliceID + 1] + righthalf
+                    if stopL + minIntron < startR:
+                        sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, theFlag, mismatches))
+                        sIndex += 1
+            elif listlen > 1 and not spliceOnly:
+                prevID = prevID + "::" + str(listlen)
+                mIndex += 1
+                # ignore multireads that can also map across splices
+                skip = False
+                for readData in newReadList:
+                    if readData[0] > 1:
+                        skip = True
+
+                if not skip:
+                    for (part, score, sense, chrom, start, mismatches) in newReadList:
+                        stop = start + readsize
+                        mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, theFlag, mismatches))
+            else:
+                prevID = readID
+
+            if index % insertSize == 0:
+                rds.insertUniqs(uInsertList)
+                rds.insertMulti(mInsertList)
+                uInsertList = []
+                mInsertList = []
+                if dataType == "RNA":
+                    rds.insertSplices(sInsertList)
+                    sInsertList = []
+
+                print ".",
+                sys.stdout.flush()
+
+            # start processing new read
+            readList = []
+            prevID = readID
+            bestScore = 0
+            index += 1
+
+        # add the new read
+        score = int(fields[0])
+        sense = fields[8]
+        chrom = fields[13]
+        parts = int(fields[17])
+        passStrict = True
+        if parts > 1:
+            lengthList = fields[18][:-1].split(",")
+            startList = fields[20][:-1].split(",")
+            listlen = len(lengthList)
+            for lpos in range(listlen):
+                if int(lengthList[lpos]) < minSpliceLength:
+                    passStrict = False
+
+                # throw out deletions, for now
+                if lpos > 0:
+                    if int(lengthList[lpos - 1]) == int(startList[lpos]):
+                        passStrict = False
+        else:
+            start = int(fields[15])
+
+        if passStrict:
+            if score > bestScore:
+                bestScore = score
+
+            mismatches = ""
+            if int(fields[1]) > 0:
+                try:
+                    mismatches = decodeMismatches(fields[-1].upper(), fields[-2].upper(), sense)
+                except:
+                    mismatches = ""
+
+            if parts == 1:
+                readList.append((parts, score, sense, chrom, start, mismatches))
+            else:
+                readList.append((parts, score, sense, chrom, startList, lengthList, mismatches))
+
+        if lIndex % 1000000 == 0:
+            print "processed %d lines" % lIndex
+
+    print "%d lines processed" % lIndex
+
+    if len(uInsertList) > 0:
+        rds.insertUniqs(uInsertList)
+    if len(mInsertList) > 0:
+        rds.insertMulti(mInsertList)
+    if len(sInsertList) > 0:
+        rds.insertSplices(sInsertList)
+
+    combString = "%d unique reads" % uIndex
+    combString += "\t%d multi reads" % mIndex
+    if dataType == "RNA":
+        combString += "\t%d spliced reads" % sIndex
+
+    print
+    print combString.replace("\t", "\n")
+
+    writeLog(outdbname + ".log", verstring, combString)
+
+    if doIndex:
+        print "building index...."
+        if cachePages > defaultCacheSize:
+            rds.setDBcache(cachePages)
+            rds.buildIndex(cachePages)
+        else:
+            rds.buildIndex(defaultCacheSize)
+
+
+def decodeMismatches(gString, rString, rsense):
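+    # compares the genomic and read sequences from a pslx hit column by column and
+    # reports each mismatch 1-based in eland style, e.g. read "G" over genomic "A"
+    # at position 11 becomes "G11A"; block-separating commas are not counted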
+    
+    output = []
+    rlen = len(gString)
+    partIndex = 0
+    for rindex in xrange(rlen):
+        if gString == ",":
+            partIndex += 1
+
+        if gString[rindex] == rString[rindex]:
+            continue
+
+        genNT = gString[rindex]
+        readNT = rString[rindex]
+        # for eland-compatibility, we are 1-based
+        output.append("%s%d%s" % (readNT, rindex + 1 - partIndex, genNT))
+            
+    return string.join(output, ",")
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makerdsfrombowtie.py b/makerdsfrombowtie.py
new file mode 100755 (executable)
index 0000000..3534a88
--- /dev/null
@@ -0,0 +1,332 @@
+#
+#  makerdsfrombowtie.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 10/20/08.
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, string, optparse
+from commoncode import readDataset, writeLog
+
+verstring = "%prog: version 4.1"
+print verstring
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--RNA", dest="genedatafilename")
+    parser.add_option("--append", action="store_false", dest="init")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--spacer", type="int", dest="spacer")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
+    parser.add_option("--forcepair", type="int", dest="forceID")
+    parser.add_option("--flip", action="store_true", dest="flip")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--strip", action="store_true", dest="stripSpace")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2,
+                        trimReadID=True, forceID=None, flip=False, verbose=False,
+                        stripSpace=False, cachePages=100000)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    label = args[0]
+    filename = args[1]
+    outdbname = args[2]
+
+    propertyList = []
+    for arg in args:
+        if "::" in arg:
+            (pname, pvalue) = arg.strip().split("::")
+            propertyList.append((pname, pvalue))
+
+    makerdsfrombowtie(label, filename, outdbname, options.genedatafilename, options.init,
+                      options.doIndex, options.spacer, options.trimReadID, options.forceID,
+                      options.flip, options.verbose, options.stripSpace, options.cachePages,
+                      propertyList)
+
+
+def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
+                      doIndex=False, spacer=2, trimReadID=True, forceID=None,
+                      flip=False, verbose=False, stripSpace=False, cachePages=100000,
+                      propertyList=[]):
+
+    delimiter = "|"
+
+    dataType = "DNA"
+    if genedatafilename is not None:
+        dataType = "RNA"
+        genedatafile = open(genedatafilename)
+
+
+    forcePair = False
+    if forceID is not None:
+        forcePair = True
+    else:
+        forceID = 0
+
+    maxBorder = 0
+    index = 0
+    insertSize = 100000
+
+    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
+
+    geneDict = {}
+    mapDict = {}
+    if dataType == "RNA":
+        for line in genedatafile:
+            fields = line.strip().split("\t")
+            blockCount = int(fields[7])
+            if blockCount < 2:
+                continue
+
+            uname = fields[0]
+            chrom = fields[1]
+            sense = fields[2]
+            chromstarts = fields[8][:-1].split(",")
+            chromstops = fields[9][:-1].split(",")
+            exonLengths = []
+            totalLength = 0
+            for index in range(blockCount):
+                chromstarts[index] = int(chromstarts[index])
+                chromstops[index] = int(chromstops[index])
+                exonLengths.append(chromstops[index] - chromstarts[index])
+                totalLength += exonLengths[index]
+
+            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+            mapDict[uname] = []
+
+        genedatafile.close()
+
+    rds = readDataset(outdbname, init, dataType, verbose=True)
+
+    #check that our cacheSize is better than the dataset's default cache size
+    defaultCacheSize = rds.getDefaultCacheSize()
+    if cachePages > defaultCacheSize:
+        if init:
+            rds.setDBcache(cachePages, default=True)
+        else:
+            rds.setDBcache(cachePages)
+
+    if not init and doIndex:
+        try:
+            if rds.hasIndex():
+                rds.dropIndex()
+        except:
+            if verbose:
+                print "couldn't drop Index"
+
+    if len(propertyList) > 0:
+        rds.insertMetadata(propertyList)
+
+    # make some assumptions based on first read
+    infile = open(filename, "r")
+    line = infile.readline()
+    if stripSpace:
+        line = line.replace(" ","")
+
+    fields = line.split()
+    readsize = len(fields[5])
+    pairedTest = fields[0][-2:]
+    paired = False
+    if pairedTest in ["/1", "/2"] or forcePair:
+        print "assuming reads are paired"
+        paired = True
+
+
+    print "read size: %d bp" % readsize
+    if init:
+        rds.insertMetadata([("readsize", readsize)])
+        if paired:
+            rds.insertMetadata([("paired", "True")])
+
+    if "bowtie_mapped" not in rds.getMetadata():
+        rds.insertMetadata([("bowtie_mapped", "True")])
+
+    if dataType == "RNA" and "spacer" not in rds.getMetadata():
+        rds.insertMetadata([("spacer", spacer)])
+
+    infile.close()
+
+    trim = -4
+    if dataType == "RNA":
+        maxBorder = readsize + trim
+
+    infile = open(filename, "r")
+    prevID = ""
+    readList = []
+    uInsertList = []
+    mInsertList = []
+    sInsertList = []
+    index = uIndex = mIndex = sIndex = lIndex = 0
+    for line in infile:
+        lIndex += 1
+        if stripSpace:
+            line = line.replace(" ","")
+
+        fields = line.strip().split()
+        readID = fields[0]
+        if trimReadID:
+            readID = string.join(readID.split(":")[1:], ":")
+
+        if readID != prevID:
+            listlen = len(readList)
+            if trimReadID:
+                prevID = "%s-%s" % (label, prevID)
+
+            if forcePair:
+                prevID += "/%d" % forceID 
+
+            if listlen == 1:
+                (sense, chrom, start, mismatches) = readList[0]
+                if flip:
+                    if sense == "+":
+                        sense = "-"
+                    else:
+                        sense = "+"
+
+                if "|" not in chrom:
+                    stop = start + readsize
+                    uInsertList.append((prevID, chrom, start, stop, sense, 1.0, "", mismatches))
+                    uIndex += 1
+                elif dataType == "RNA":
+                    currentSplice = chrom
+                    (model, spliceID, regionStart) = currentSplice.split(delimiter)
+                    if model not in geneDict:
+                        prevID = readID
+                    else:
+                        (gsense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+                        spliceID = int(spliceID)
+                        rstart = int(start) - spacer
+                        lefthalf = maxBorder - rstart
+                        if lefthalf < 1 or lefthalf > maxBorder:
+                            prevID = readID
+                        else:
+                            righthalf = readsize - lefthalf
+                            startL = int(regionStart)  + rstart
+                            stopL = startL + lefthalf
+                            startR = chromstarts[spliceID + 1]
+                            stopR = chromstarts[spliceID + 1] + righthalf
+                            sInsertList.append((prevID, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches))
+                            sIndex += 1
+            elif listlen > 1:
+                prevID = "%s::%s" % (prevID, str(listlen))
+                mIndex += 1
+                # ignore multireads that can also map across splices
+                skip = False
+                for (sense, chrom, start, mismatches) in readList:
+                    if "|" in chrom:
+                        skip = True
+
+                if not skip:
+                    for (sense, chrom, start, mismatches) in readList:
+                        stop = start + readsize
+                        if flip:
+                            if sense == "+":
+                                sense = "-"
+                            else:
+                                sense = "+"
+
+                        mInsertList.append((prevID, chrom, start, stop, sense, 1.0 / listlen, "", mismatches))
+            else:
+                prevID = readID
+
+            if index % insertSize == 0:
+                rds.insertUniqs(uInsertList)
+                rds.insertMulti(mInsertList)
+                uInsertList = []
+                mInsertList = []
+                if dataType == "RNA":
+                    rds.insertSplices(sInsertList)
+                    sInsertList = []
+
+                print ".",
+                sys.stdout.flush()
+
+            # start processing new read
+            readList = []
+            prevID = readID
+            index += 1
+
+        # add the new read
+        sense = fields[1]
+        chrom = fields[2]
+        # for eland compat, we are 1-based
+        start = int(fields[3]) + 1
+        mismatches = ""
+        if ":" in fields[-1]:
+            mismatches = decodeMismatches(fields[-1], sense)
+
+        readList.append((sense, chrom, start, mismatches))
+        if lIndex % 1000000 == 0:
+            print "processed %d lines" % lIndex
+
+    print "%d lines processed" % lIndex
+
+    if len(uInsertList) > 0:
+        rds.insertUniqs(uInsertList)
+
+    if len(mInsertList) > 0:
+        rds.insertMulti(mInsertList)
+
+    if len(sInsertList) > 0:
+        rds.insertSplices(sInsertList)
+
+    combString = "%d unique reads" % uIndex
+    combString += "\t%d multi reads" % mIndex
+    if dataType == "RNA":
+        combString += "\t%d spliced reads" % sIndex
+
+    print
+    print combString.replace("\t", "\n")
+
+    writeLog("%s.log" % outdbname, verstring, combString)
+
+    if doIndex:
+        print "building index...."
+        if cachePages > defaultCacheSize:
+            rds.setDBcache(cachePages)
+            rds.buildIndex(cachePages)
+        else:
+            rds.buildIndex(defaultCacheSize)
+
+
+def decodeMismatches(mString, rsense):
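+    # converts bowtie's mismatch descriptor (e.g. "10:A>G,21:C>T") into the 1-based
+    # eland-style string stored in the RDS tables (e.g. "G11A,T22C"); bases are
+    # complemented for minus-strand reads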
+    complement = {"A": "T",
+                  "T": "A",
+                  "C": "G",
+                  "G": "C",
+                  "N": "N"
+    }
+
+    output = []
+    mismatches = mString.split(",")
+    for mismatch in mismatches:
+        (pos,change) = mismatch.split(":")
+        (genNT, readNT) = change.split(">")
+        if rsense == "-":
+            readNT = complement[readNT]
+            genNT  = complement[genNT]
+
+        elandCompatiblePos = int(pos) + 1
+        output.append("%s%d%s" % (readNT, elandCompatiblePos, genNT))
+
+    return string.join(output, ",")
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makerdsfromeland2.py b/makerdsfromeland2.py
new file mode 100755 (executable)
index 0000000..317ceda
--- /dev/null
@@ -0,0 +1,670 @@
+#
+#  makerdsfromeland2.py
+#  ENRAGE
+#
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, string, optparse
+from commoncode import readDataset
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    verstring = "%prog: version 3.4"
+    print verstring
+
+    usage = "usage:  %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
+            \ninput reads must be sorted to properly record multireads"
+
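+    # example invocation (hypothetical file names; flowcell value is illustrative):
+    #   python makerdsfromeland2.py myLabel s_1_eland_result.txt reads.rds --index flowcell::FC1234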
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", dest="geneDataFileName",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000]")
+    parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
+                      help="use : as the delimiter")
+    parser.add_option("--paired", dest="pairID",
+                      help="pairID value")
+    parser.add_option("--extended", action="store_true", dest="extended",
+                      help="use eland_extended input")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--maxlines", type="int", dest="maxLines",
+                      help="maximum number of input lines to process [default: 1000000000]")
+    parser.set_defaults(init=True, doIndex=False, cachePages=100000, geneDataFileName=None, useOldDelimiter=False, pairID=None, maxLines=1000000000, extended=False, verbose=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    label = args[0]
+    filename = args[1]
+    outdbname = args[2]
+
+    delimiter = '|'
+    if options.useOldDelimiter:
+        delimiter = ':'
+
+    paired = False
+    pairID = '1'
+    if options.pairID is not None:
+        paired = True
+        if options.pairID not in ['1','2']:
+            print 'pairID value must be 1 or 2'
+            sys.exit(-1)
+
+        pairID = options.pairID
+        print 'Treating read IDs as paired with label = %s and pairID = %s' % (label, pairID)
+
+    dataType = 'DNA'
+    if options.geneDataFileName is not None:
+        dataType = 'RNA'
+
+    makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init, options.pairID, dataType, options.geneDataFileName, options.cachePages, options.maxLines, options.extended, options.verbose)
+
+
+def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False, init=True, pairID="1", dataType="DNA", geneDataFileName=None, cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
+    maxBorder = 0
+    index = 0
+    insertSize = 100000
+
+    geneDict = {}
+    mapDict = {}
+    seenSpliceList = []
+    if dataType == 'RNA':
+        genedatafile = open(geneDataFileName)
+        for line in genedatafile:
+            fields = line.strip().split('\t')
+            blockCount = int(fields[7])
+            if blockCount < 2:
+                continue
+
+            uname = fields[0]
+            chrom = fields[1]
+            sense = fields[2]
+            chromstarts = fields[8][:-1].split(',')
+            chromstops = fields[9][:-1].split(',')
+            exonLengths = []
+            totalLength = 0
+            for index in range(blockCount):
+                chromstarts[index] = int(chromstarts[index])
+                chromstops[index] = int(chromstops[index])
+                exonLengths.append(chromstops[index] - chromstarts[index])
+                totalLength += exonLengths[index]
+
+            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
+            mapDict[uname] = []
+        genedatafile.close()
+
+    rds = readDataset(outdbname, init, dataType, verbose=True)
+
+    if cachePages > rds.getDefaultCacheSize():
+        if init:
+            rds.setDBcache(cachePages, default=True)
+        else:
+            rds.setDBcache(cachePages)
+
+    if not init and doIndex:
+        try:
+            if rds.hasIndex():
+                rds.dropIndex()
+        except:
+            if verbose:
+                print "couldn't drop Index"
+
+    propertyList = []
+    for arg in sys.argv:
+        if '::' in arg:
+            (pname, pvalue) = arg.strip().split('::')
+            if pname == 'flowcell' and paired:
+                pvalue = pvalue + '/' + pairID
+
+            propertyList.append((pname, pvalue))
+
+    if len(propertyList) > 0:
+        rds.insertMetadata(propertyList)
+
+    infile = open(filename,'r')
+    line = infile.readline()
+    fields = line.split()
+    readsize = len(fields[1])
+    readsizeString = str(readsize)
+    if dataType == 'RNA' and readsize > 32:
+        splicesizeString = '32'
+    else:
+        splicesizeString = readsizeString
+
+    print 'read size: %d bp' % readsize
+    if init:
+        rds.insertMetadata([('readsize', readsize)])
+        rds.insertMetadata([('eland_mapped', 'True')])
+        if extended:
+            rds.insertMetadata([('eland_extended', 'True')])
+
+        if paired:
+            rds.insertMetadata([('paired', 'True')])
+
+    trim = -4
+    if dataType == 'RNA':
+        maxBorder = readsize + trim
+
+    insertList = []
+    infile = open(filename,'r')
+    print 'mapping unique reads...'
+    lineIndex = 0
+    for line in infile:
+        lineIndex += 1
+        if lineIndex > maxLines:
+            break
+
+        fields = line.split()
+        if fields[2] in  ['QC','NM']:
+            continue
+
+        (matchType, bestMatch) = getUniqueMatch(fields[2])
+        if matchType == -1:
+            continue
+
+        bestpos = []
+        try:
+            pos = fields[3].split(',')
+        except:
+            if verbose:
+                print 'problem with line: %s' % line.strip()
+            continue
+
+        matchDict = {0:[], 1:[], 2:[], 3:[]}
+        if len(pos) == 1:
+            if 'splice' in pos[0]:
+                continue
+
+            bestpos = pos
+        else:
+            currentChr = ''
+            for apos in pos:
+                if 'splice' in apos:
+                    continue
+
+                if ':' in apos:
+                    (front, back) = apos.split(':')
+                    currentChr = front
+                else:
+                    back = apos
+                    apos = currentChr + ':' + apos
+
+                if extended:
+                    matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
+                    if matchType > 2:
+                        matchType = 3
+                else:
+                    matchType = int(apos[-1])
+
+                matchDict[matchType].append(apos)
+                if bestMatch[matchType]:
+                    bestpos.append(apos)
+
+        # for padded reads, mapped read might have more mismatches!
+        if len(bestpos) == 0:
+            # let's not worry about these yet.
+            if 'splice' in line:
+                continue
+
+            for matchType in [1, 2, 3]:
+                if len(matchDict[matchType]) > 0:
+                    if len(matchDict[matchType]) == 1 and 'splice' not in matchDict[matchType][0]:
+                        bestpos = matchDict[matchType]
+                    break
+
+            if len(bestpos) == 0 and verbose:
+                print "couldn't pick best read from line: %s" % line
+
+        for apos in bestpos:
+            try:
+                (chrom, back) = apos.split(':')
+            except:
+                continue
+
+            if 'splice' in chrom:
+                continue
+
+            if '/' in chrom:
+                chromfields = chrom.split('/')
+                chrom = chromfields[-1]
+
+            if '.' in chrom:
+                try:
+                    (chrom, fileExt) = chrom.split('.')
+                except:
+                    if verbose:
+                        print 'problem with chromosome on line %s' % line.strip()
+
+                    continue
+
+            if extended:
+                if 'F' in back:
+                    sense = '+'
+                    (start, matchPart) = back.split('F')
+                else:
+                    sense = '-'
+                    (start, matchPart) = back.split('R')
+
+                start = int(start) 
+                if matchPart == readsizeString:
+                    matchType = ''
+                else:
+                    matchType = decodeMismatches(fields[1], matchPart)
+            else:
+                start = int(back[:-2])
+                if back[-2] == 'F':
+                    sense = '+'        
+                else:
+                    sense = '-'
+
+            stop = int(start) + readsize - 1
+            if paired:
+                readID = label + '-' + str(lineIndex) + '/' + pairID
+            else:
+                readID = label + '-' + str(index)
+
+            if len(chrom) > 0:
+                insertList.append((readID, chrom, start, stop, sense, 1.0, '', matchType))
+
+            if index % insertSize == 0:
+                rds.insertUniqs(insertList)
+                insertList = []
+                print '.',
+                sys.stdout.flush()
+
+            index += 1
+
+    if len(insertList) > 0:
+        rds.insertUniqs(insertList)
+        insertList = []
+
+    print
+    print '%d unique reads' % index
+    infile.close()
+
+    if dataType == 'RNA':
+        print 'mapping splices...'
+        index = 0
+        lineIndex = 0
+        mapfile = open(filename,'r')
+        for line in mapfile:
+            lineIndex += 1
+            if lineIndex > maxLines:
+                break
+
+            if 'splice' not in line:
+                continue
+
+            fields = line.strip().split()
+            (matchType, bestMatch) = getUniqueMatch(fields[2])
+            if matchType == -1:
+                continue
+
+            bestpos = []
+            pos = fields[3].split(',')
+            matchDict = {0:[], 1:[], 2:[], 3:[]}
+            if len(pos) == 1:
+                if 'chr' in pos:
+                    continue
+
+                bestpos = pos
+            else:
+                currentSplice = ''
+                for apos in pos:
+                    if 'splice' not in apos:
+                        continue
+
+                    if ':' in apos:
+                        if delimiter == ':':
+                            try:
+                                (extmodel, spliceID, regionStart, thepos) = apos.split(':')
+                            except:
+                                try:
+                                    (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
+                                    extmodel = extmodel1 + ':' + extmodel2
+                                except:
+                                    print 'warning: could not process splice %s' % apos
+                                    continue
+
+                            currentSplice = extmodel + ':' + spliceID + ':' + regionStart
+                        else:
+                            try:
+                                (currentSplice, thepos) = apos.split(':')
+                            except:
+                                try:
+                                    (extmodel1, restSplice, thepos) = apos.split(':')
+                                    currentSplice = extmodel1 + ':' + restSplice
+                                    (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
+                                except:
+                                    print 'warning: could not process splice %s' % apos
+                                    continue
+                    else:
+                        thepos = apos
+                        apos = currentSplice + ':' + apos
+
+                    if extended:
+                        matchType = thepos.count('A') + thepos.count('C') + thepos.count('G') + thepos.count('T')
+                        if matchType > 2:
+                            matchType = 3
+
+                        # if readsize > 32, we risk losing perfect matches that go beyond our expanded genome splices, so only ask for a 32bp match
+                        if thepos[:2] == splicesizeString:
+                            matchType = 0
+                    else:
+                        matchType = int(apos[-1])
+
+                    if bestMatch[matchType]:
+                        bestpos.append(apos)
+
+            # for padded reads, mapped read might have more mismatches!
+            if len(bestpos) == 0:
+                for matchType in [1, 2, 3]:
+                    if len(matchDict[matchType]) > 0:
+                        if len(matchDict[matchType]) == 1 and 'splice' in matchDict[matchType][0]:
+                            bestpos = matchDict[matchType]
+
+                        break
+                if len(bestpos) == 0 and verbose:
+                    print "couldn't pick best read from line: %s" % line
+
+            for apos in bestpos:
+                if delimiter == ':':
+                    try:
+                        (extmodel, spliceID, regionStart, thepos) = apos.split(':')
+                    except:
+                        try:
+                            (extmodel1, extmodel2, spliceID, regionStart, thepos) = apos.split(':')
+                            extmodel = extmodel1 + ':' + extmodel2
+                        except:
+                            print 'warning: could not process splice %s' % apos
+                            continue
+                else:
+                    try:
+                        (currentSplice, thepos) = apos.split(':')
+                    except:
+                        try:
+                            (extmodel1, restSplice, thepos) = apos.split(':')
+                            currentSplice = extmodel1 + ':' + restSplice
+                        except:
+                            print 'warning: could not process splice %s' % apos
+                            continue
+
+                    (extmodel, spliceID, regionStart) = currentSplice.split(delimiter)
+
+                modelfields = extmodel.split('/')
+                if len(modelfields) > 2:
+                    model = string.join(modelfields[1:],'/')
+                else:
+                    model = modelfields[1]
+
+                if model not in geneDict:
+                    print fields
+                    continue
+
+                (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+                if extended:
+                    if 'F' in thepos:
+                        rsense = '+'
+                        (start, matchPart) = thepos.split('F')
+                    else:
+                        rsense = '-'
+                        (start, matchPart) = thepos.split('R')
+
+                    rstart = int(start) - 2 
+                    if matchPart == readsizeString:
+                        matchType = ''
+                    elif matchPart[:2] == splicesizeString:
+                        matchType = ''
+                    else:
+                        matchType = decodeMismatches(fields[1], matchPart)
+                else:
+                    rstart = int(thepos[:-2]) - 2
+                    if thepos[-2] == 'F':
+                        rsense = '+'
+                    else:
+                        rsense = '-'
+
+                if trim <= rstart <= maxBorder:
+                    pass
+                else:
+                    print rstart
+                    continue
+
+                currentSplice = model + delimiter + spliceID + delimiter + regionStart
+                spliceID = int(spliceID)
+                lefthalf = maxBorder - rstart
+                if lefthalf < 1 or lefthalf > maxBorder:
+                    continue
+
+                righthalf = readsize - lefthalf
+                startL = int(regionStart)  + rstart
+                stopL = startL + lefthalf
+                startR = chromstarts[spliceID + 1]
+                stopR = chromstarts[spliceID + 1] + righthalf
+                if paired:
+                    readName = label + '-' + str(lineIndex) + '/' + pairID
+                else:
+                    readName = model + '-' + str(thepos)
+
+                insertList.append((readName, chrom, startL, stopL, startR, stopR, rsense, 1.0, '', matchType))
+                index += 1
+                if index % insertSize == 0:
+                    rds.insertSplices(insertList)
+                    print '.',
+                    sys.stdout.flush()
+                    insertList = []
+
+                if currentSplice not in seenSpliceList:
+                    seenSpliceList.append(currentSplice)
+
+        mapfile.close()
+        if len(insertList) > 0:
+            rds.insertSplices(insertList)
+            insertList = []
+
+        print
+        print 'saw %d spliced reads across %d distinct splices' % (index, len(seenSpliceList))
+
+    infile = open(filename,'r')
+    print 'mapping multireads...'
+    lineIndex = 0
+    origReadid = rds.getMultiCount()
+    try:
+        readid = int(origReadid) + 1
+    except:
+        readid = 0
+        origReadid = 0
+
+    print 'starting at %d' % (readid + 1)
+
+    for line in infile:
+        lineIndex += 1
+        if lineIndex > maxLines:
+            break
+
+        fields = line.split()
+        if len(fields) < 4:
+            continue
+
+        if fields[2] == 'QC' or fields[2] == 'NM' or fields[3] == '-':
+            continue
+
+        (zero, one, two) = fields[2].split(':')
+        zero = int(zero)
+        one = int(one)
+        two = int(two)
+
+        bestMatch = [False] * readsize
+        if zero > 1:
+            bestMatch[0] = True
+        elif zero == 0 and one > 1:
+            bestMatch[1] = True
+        elif zero == 0 and one == 0 and two > 1:
+            bestMatch[2] = True
+        else:
+            continue
+
+        readcount = 0
+        bestpos = []
+        pos = fields[3].split(',')
+        matchDict = {0:[], 1:[], 2:[], 3:[]}
+        currentChr = ''
+        for apos in pos:
+            if ':' in apos:
+                try:
+                    (front, back) = apos.split(':')
+                except:
+                    if verbose:
+                        print "problem splitting %s" % str(apos)
+                    continue
+
+                currentChr = front
+            else:
+                back = apos
+                apos = currentChr + ':' + apos
+
+            if extended:
+                matchType = back.count('A') + back.count('C') + back.count('G') + back.count('T')
+            else:
+                matchType = int(apos[-1])
+
+            try:
+                matchDict[matchType].append(apos)
+            except:
+                matchDict[matchType] = [apos]
+
+            if bestMatch[matchType]:
+                bestpos.append(apos)
+
+        # for padded reads, mapped read might have more mismatches!
+        if len(bestpos) == 0:
+            for matchType in [1, 2, 3]:
+                if len(matchDict[matchType]) > 0:
+                    if len(matchDict[matchType]) > 1:
+                        noSplice = True
+                        for arg in matchDict[matchType]:
+                            if 'splice' in arg:
+                                noSplice = False
+
+                        if noSplice:
+                            bestpos = matchDict[matchType]
+                    break
+
+            if len(bestpos) == 0 and verbose:
+                print "couldn't pick best read from line: %s" % line
+                continue
+
+        hasSplice = False
+        for apos in bestpos:
+            if 'splice' in apos:
+                hasSplice = True
+
+        # do not allow multireads that can also map across splices for now
+        if hasSplice:
+            if verbose:
+                print "throwing out multiread because of splice conflict"
+            continue
+
+        if len(bestpos) > 0:
+            readid += 1
+
+        for apos in bestpos:
+            readcount += 1
+            (front, back) = apos.split(':')
+            chrom = front[:-3]
+            if extended:
+                if 'F' in back:
+                    sense = '+'
+                    (start, matchPart) = back.split('F')
+                else:
+                    sense = '-'
+                    (start, matchPart) = back.split('R')
+
+                start = int(start)
+                if matchPart == readsizeString:
+                    matchType = ''
+                else:
+                    matchType = decodeMismatches(fields[1], matchPart)
+            else:
+                start = int(back[:-2])
+                if back[-2] == 'F':
+                    sense = '+'
+                else:
+                    sense = '-'
+
+            stop = int(start) + readsize
+            readName = '%dx%d' % (readid, len(bestpos))
+            if paired:
+                readName = label + '-' + str(lineIndex) + '/' + pairID + '::' + readName
+
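+            # each placement of a multiread gets an equal fractional weight
+            # (1 / number of equally good positions), so the read sums to 1.0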
+            insertList.append((readName, chrom, start, stop, sense, 1.0/len(bestpos), '', matchType))
+            if index % insertSize == 0:
+                rds.insertMulti(insertList)
+                insertList = []
+                print '.',
+                sys.stdout.flush()
+
+            index += 1
+
+    if len(insertList) > 0:
+        rds.insertMulti(insertList)
+        insertList = []
+
+    print
+    print '%d multireads' % (readid - origReadid)
+
+    if doIndex:
+        print 'building index....'
+        rds.buildIndex(cachePages)
+
+
+def getUniqueMatch(elandCode):
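+    # elandCode is the ELAND match summary "N0:N1:N2", the number of placements
+    # with 0, 1 and 2 mismatches; the read is treated as unique only when the
+    # lowest-mismatch stratum that has any placements contains exactly one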
+    (zero, one, two) = elandCode.split(':')
+    zero = int(zero)
+    one = int(one)
+    two = int(two)
+    bestMatch = [False, False, False, False]
+    if zero == 1:
+        bestMatch[0] = True
+        matchType = 0
+    elif zero == 0 and one == 1:
+        bestMatch[1] = True
+        matchType = 1
+    elif zero == 0 and one == 0 and two == 1:
+        bestMatch[2] = True
+        matchType = 2
+    else:
+        matchType = -1
+    
+    return (matchType, bestMatch)
+
+
+def decodeMismatches(origSeq, code):
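+    # converts an ELAND-extended match descriptor (runs of matching bases with
+    # interleaved substitution characters) into a comma-separated list like
+    # "A12G": read base, 1-based position, then the descriptor character, which
+    # is taken to be the reference base under the ELAND extended convention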
+    output = []
+    number = '0'
+    index = 0
+    for pos in code:
+        if pos.isdigit():
+            number += pos
+        else:   
+            index += int(number) + 1
+            origNT = origSeq[index - 1]
+            output.append('%s%d%s' % (origNT, index, pos))
+            number = '0'
+
+    return string.join(output, ',')
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makesitetrack.py b/makesitetrack.py
new file mode 100755 (executable)
index 0000000..c6d0b8e
--- /dev/null
@@ -0,0 +1,99 @@
+#
+#  makesitetrack.py
+#  ENRAGE
+#
+
+import sys, string, optparse
+
+print "%prog: version 2.1"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog sitefile outbedfile [--noheader] [--stype fieldID] [--color xx,yy,zz] [--append] [--exploded]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--noheader", action="store_true", dest="noHeader")
+    parser.add_option("--stype", type="int", dest="stypeID")
+    parser.add_option("--color", dest="color")
+    parser.add_option("--append", action="store_true", dest="append")
+    parser.add_option("--exploded", action="store_false", dest="compact")
+    parser.set_defaults(stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    infile = args[0]
+    outfileName = args[1]
+    
+    makesitetrack(infile, outfileName, options.stypeID, options.color, options.append, options.compact, options.noHeader)
+
+
+def makesitetrack(infileName, outFileName, stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False):
+    if stypeID is not None:
+        doStype = True
+    else:
+        doStype = False
+        stypeID = 4
+
+    infile = open(infileName)
+
+    if append:
+        outfile = open(outFileName, "a")
+    else:
+        outfile = open(outFileName, "w")
+
+    try:
+        (name, extension) = outFileName.split(".")
+    except ValueError:
+        name = outFileName.split(".")[:-1]
+        name = string.join(name, "_")
+    
+    if not noHeader:
+        outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % name)
+
+    count = 1
+    for line in infile:
+        if line[0] == "#":
+            continue
+
+        fields = line.split()
+        if compact:
+            (chrom, loc) = fields[0].split(":")
+            (start, stop) = loc.split("-")
+            score = fields[1]
+        else:
+            chrom = fields[1]
+            start = fields[2]
+            stop = fields[3]
+            score = 1.
+
+        stype = "%s-%s" % (name, str(count))
+        # when a site-type field is given, code "11" presumably marks a
+        # canonical site and "0" a half site; anything else is labeled NC<type>
+        if doStype:
+            try:
+                stype = fields[stypeID]
+                if stype == "11":
+                    stype = "can"
+                elif stype == "0":
+                    stype = "half"
+                else:
+                    stype = "NC" + stype
+            except IndexError:
+                pass
+
+        sense = fields[-2].strip()
+        if sense not in ["+", "-"]:
+            sense = "+"
+
+        outfile.write("%s\t%s\t%d\t%s\t%s\t%s\t-\t-\t%s\n" % (chrom, start, int(stop) + 1, stype, score, sense, color))
+        count += 1
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/makewiggle.py b/makewiggle.py
new file mode 100755 (executable)
index 0000000..95b0634
--- /dev/null
@@ -0,0 +1,209 @@
+#
+#  makewiggle.py
+#  ENRAGE
+#
+import sys, optparse
+from commoncode import readDataset
+
+print "%prog: version 6.7"
+
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog name rdsfile outfilename [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--raw", action="store_false", dest="doNormalize")
+    parser.add_option("--color", dest="color")
+    parser.add_option("--altcolor", dest="altColor")
+    parser.add_option("--chrom", dest="limitChrom")
+    parser.add_option("--shift", type="int", dest="shift")
+    parser.add_option("--split", action="store_true", dest="doSplit")
+    parser.add_option("--listfile", dest="listfilename")
+    parser.add_option("--listprefix", dest="listPrefix")
+    parser.add_option("--group", dest="group")
+    parser.add_option("--startPriority", type="float", dest="startPriority")
+    parser.add_option("--skiprandom", action="store_true", dest="skipRandom")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="withSplices")
+    parser.add_option("--singlebase", action="store_true", dest="doSingle")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--stranded", dest="strand")
+    parser.add_option("--maxchunk", type="int", dest="chunk")
+    parser.set_defaults(doNormalize=True, color=None, altColor="", limitChrom=None,
+                        shift=0, doSplit=False, listfilename=None, listPrefix="",
+                        group="", startPriority=0.01, skipRandom=False, withMulti=True,
+                        withSplices=False, doSingle=False, cachePages=-1, enforceChr=False,
+                        strand=None, chunk=20)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    name = args[0]
+    hitfilename = args[1]
+    outfilename = args[2]
+
+    makewiggle(name, hitfilename, outfilename, options.doNormalize, options.color, options.altColor,
+               options.limitChrom, options.shift, options.doSplit, options.listfilename, options.listPrefix,
+               options.group, options.startPriority, options.skipRandom, options.withMulti,
+               options.withSplices, options.doSingle, options.cachePages, options.enforceChr, options.strand,
+               options.chunk)
+
+
+def makewiggle(name, hitfilename, outfilename, doNormalize=True, color=None, altColor="",
+               limitChrom=None, shift=0, doSplit=False, listfilename=None, listPrefix="",
+               group="", startPriority=0.01, skipRandom=False, withMulti=True, withSplices=False,
+               doSingle=False, cachePages=-1, enforceChr=False, strand=None, chunk=20):
+
+    priorityIncrement = 0.01
+    wigType = "bedGraph"
+
+    if color is not None:
+        colorString = " color=%s" % color
+    else:
+        colorString = ""
+
+    if altColor:
+        colorString += " altcolor=%s" % altColor
+
+    doList = False
+    if listfilename is not None:
+        doList = True
+    
+    chromLimit = False
+    if limitChrom is not None:
+        chromLimit = True
+
+    groupName = ""
+    if group:
+        groupName = "group=%s" % group
+
+    doCache = False
+    if cachePages > 0:
+        doCache = True
+
+    maxSpan = chunk * 1000000
+
+    isStranded = False
+    strandedDirection = "both"
+    if strand is not None:
+        isStranded = True
+        if strand == "plus":
+            strandedDirection = "plusOnly"
+        elif strand == "minus":
+            strandedDirection = "minusOnly"
+
+        print "will keep track of %s strand(s)" % strandedDirection
+
+    if shift:
+        print "Will shift reads by +/- %d bp according to their sense" % shift
+        name += "shift=%d" % shift
+    
+    hitRDS = readDataset(hitfilename, verbose=True, cache=doCache)
+
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    readlen = hitRDS.getReadSize()
+
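+    # unless --raw is given, per-base coverage is scaled to reads per million
+    # by dividing by the total read count of the dataset (in millions)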
+    if doNormalize:
+        normalizeBy = len(hitRDS) / 1000000.
+    else:
+        normalizeBy = 1.
+
+    if doList:
+        listfile = open(listfilename, "w")
+
+    priority = startPriority    
+    if not doSplit:
+        outfile = open(outfilename, "w")
+        if doList:
+            listfile.write("%s%s\n" % (listPrefix, outfilename))
+
+        outfile.write('track type=%s name="%s" %s priority=%.3f visibility=full%s\n' % (wigType, name, groupName, priority, colorString)) 
+
+    chromList = hitRDS.getChromosomes()
+    chromList.sort()
+    for achrom in chromList:
+        if enforceChr and ("chr" not in achrom):
+            continue
+
+        if chromLimit and achrom != limitChrom:
+            continue
+
+        if skipRandom and "random" in achrom:
+            continue
+
+        if doSplit:
+            outfile = open("%s.%s" % (outfilename, achrom), "w")
+            if doList:
+                listfile.write("%s%s.%s\n" % (listPrefix, outfilename, achrom))
+
+            outfile.write('track type=%s name="%s %s" %s priority=%.3f visibility=full%s\n' % (wigType, name, achrom, groupName, priority, colorString))   
+            priority += priorityIncrement  
+
+        lastNT = hitRDS.getMaxCoordinate(achrom, doMulti=withMulti, doSplices=withSplices) + readlen
+        spanStart = 0
+
+        previousVal = 0
+        previousStart = 1
+        lineIndex = 0
+        for spanStop in xrange(maxSpan, lastNT+maxSpan, maxSpan):
+            if spanStop > lastNT:
+                spanStop = lastNT
+
+            print achrom, spanStart, spanStop
+            chromModel = hitRDS.getChromProfile(achrom, spanStart, spanStop, withMulti, withSplices, normalizeBy, isStranded, strandedDirection, shiftValue=shift)
+
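+            # collapse runs of identical per-base values into single bedGraph
+            # intervals; a line is written only when the value changes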
+            for index in xrange(len(chromModel)):
+                currentVal = chromModel[index]
+                if doSingle:
+                    outline = "%s %d %.4f\n" % (achrom, spanStart + index, currentVal)
+                    outfile.write(outline)
+                    continue
+
+                if currentVal == previousVal:
+                    continue
+
+                if currentVal != previousVal:
+                    if previousVal != 0:
+                        lastpos = index + spanStart
+                        outline = "%s %d %d %.4f\n" % (achrom, previousStart, lastpos, previousVal)
+                        outfile.write(outline)
+                        lineIndex += 1
+
+                    previousVal = currentVal
+                    previousStart = index + spanStart
+
+            currentVal = 0
+            del chromModel
+            spanStart = spanStop + 1
+
+        if doSplit:
+            outfile.close()
+
+        if doSingle:
+            print index + 1
+        else:
+            print lineIndex
+
+    if not doSplit:
+        outfile.close()
+
+    if doList:
+        listfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py
new file mode 100644 (file)
index 0000000..4d174bf
--- /dev/null
@@ -0,0 +1,225 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset
+from cistematic.genomes import Genome
+from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+
+print "%prog: version 5.6"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--gidField", type="int", dest="fieldID")
+    parser.add_option("--maxLength", type="float", dest="maxLength")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
+                        replaceModels=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 5:
+        print usage
+        print "\twhere splicecountfile can be set to 'none' to not count splices\n"
+        sys.exit(1)
+
+    genome = args[0]
+    hitfile = args[1]
+    uniquecountfile = args[2]
+    splicecountfile = args[3]
+    outfile = args[4]
+
+    candidateLines = []
+    acceptedfilename = ""
+    if len(args) > 5:
+        try:
+            candidatefile = open(args[5])
+            candidateLines = candidatefile.readlines()
+            candidatefile.close()
+            acceptedfilename = args[6]
+        except IndexError:
+            candidateLines = []
+
+    normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
+                            candidateLines, acceptedfilename, options.fieldID,
+                            options.maxLength, options.doCache, options.extendGenome,
+                            options.replaceModels)
+
+
+def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
+                            outfilename, candidateLines=[], acceptedfilename="",
+                            fieldID=0, maxLength=1000000000., doCache=False,
+                            extendGenome="", replaceModels=False):
+
+    uniquecountfile = open(uniquecountfilename)
+
+    if acceptedfilename:
+        acceptedfile = open(acceptedfilename, "w")
+
+    dosplicecount = False
+    if splicecountfilename != "none":
+        dosplicecount = True
+        splicecountfile = open(splicecountfilename)
+
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+
+    if doCache:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        print "%s cached" % genome
+    else:
+        hg = Genome(genome, inRAM=True)
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)    
+    uniqcount = RDS.getUniqsCount()
+    print "%d unique reads" % uniqcount
+
+    splicecount = 0
+    countDict = {}
+    gidList = []
+    farList = []
+    candidateDict = {}
+
+    gidToGeneDict = {}
+
+    featuresDict = hg.getallGeneFeatures()
+    print "got featuresDict"
+
+    outfile = open(outfilename, "w")
+
+    for line in uniquecountfile:
+        fields = line.strip().split()
+        gid = fields[fieldID]
+        gene = fields[1]
+        countDict[gid] = float(fields[-1])
+        gidList.append(gid)
+        gidToGeneDict[gid] = gene
+
+    uniquecountfile.close()
+
+    if dosplicecount:
+        for line in splicecountfile:
+            fields = line.strip().split()
+            gid = fields[fieldID]
+            try:
+                countDict[gid] += float(fields[-1])
+            except:
+                print fields
+                continue
+
+            splicecount += float(fields[-1])
+
+        splicecountfile.close()
+
+    for line in candidateLines:
+        if "#" in line:
+            continue
+
+        fields = line.strip().split()
+        gid = fields[1]
+        gene = fields[0]
+        if gid not in gidList:
+            if gid not in farList:
+                farList.append(gid)
+                gidToGeneDict[gid] = gene
+
+            if gid not in countDict:
+                countDict[gid] = 0
+
+            countDict[gid] += float(fields[6])
+
+        if gid not in candidateDict:
+            candidateDict[gid] = []
+
+        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+
+    totalCount = (uniqcount + splicecount) / 1000000.
+    uniqScale = uniqcount / 1000000.
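+    # RPKM = reads assigned to the gene per million mapped (unique + splice)
+    # reads, divided by the non-redundant exonic length in kb, which is
+    # clamped below to at least 0.1 kb and at most maxLength kb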
+    for gid in gidList:
+        gene = gidToGeneDict[gid]
+        featureList = []
+        try:
+            featureList = featuresDict[gid]
+        except:
+            try:
+                featureList = featuresDict[gene]
+            except:
+                print gene, gid
+
+        newfeatureList = []
+        geneLength = 0.
+        for (ftype, chrom, start, stop, sense) in featureList:
+            if (start, stop) not in newfeatureList:
+                newfeatureList.append((start, stop))
+                geneLength += (abs(start - stop) + 1.) / 1000.
+
+        if geneLength < 0.1:
+            geneLength = 0.1
+        elif geneLength > maxLength:
+            geneLength = maxLength
+
+        rpm = countDict[gid] / totalCount
+        rpkm = rpm / geneLength
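+        # candidate (novel/far) regions are folded into the gene only if their
+        # own read density is at least one tenth of the gene's current RPKM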
+        if gid in candidateDict:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                cratio = cCount / (cLength / 1000.)
+                cratio = (uniqScale * cratio) / totalCount
+                if 10. * cratio < rpkm:
+                    continue
+
+                countDict[gid] += cCount
+                geneLength += cLength / 1000.
+                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+
+        rpm = countDict[gid] / totalCount
+        rpkm = rpm / geneLength
+        outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
+
+    for gid in farList:
+        gene = gidToGeneDict[gid]
+        geneLength = 0
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        if geneLength < 0.1:
+            continue
+
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            cratio = cCount / (cLength / 1000.)
+            cratio = cratio / totalCount
+            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+
+        rpm = countDict[gid] / totalCount
+        rpkm = rpm / geneLength
+        outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
+
+    outfile.close()
+    try:
+        acceptedfile.close()
+    except:
+        pass
+
+    if doCache:
+        uncacheGeneDB(genome)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/normalizeFinalExonic.py b/normalizeFinalExonic.py
new file mode 100755 (executable)
index 0000000..6053e80
--- /dev/null
@@ -0,0 +1,161 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset
+
+print "%prog: version 3.5" % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--multifraction", action="store_true", dest="reportfraction")
+    parser.add_option("--multifold", action="store_true", dest="reportFold")
+    parser.add_option("--minrpkm", type="float", dest="minThreshold")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--withGID", action="store_true", dest="writeGID")
+    parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
+                        doCache=False, writeGID=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    rdsfilename = args[0]
+    expandedRPKMfile = args[1]
+    multicountfile = args[2]
+    outfilename = args[3]
+
+    normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
+                         options.reportFraction, options.reportFold, options.minThreshold,
+                         options.doCache, options.writeGID)
+
+
+def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
+                         reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
+                         writeGID=False):
+
+    expandedRPKMfile = open(expandedRPKMfilename)
+    multicountfile = open(multicountfilename)
+
+    if reportFraction:
+        print "reporting fractional contribution of multireads"
+        reportFold = False
+    elif reportFold:
+        print "reporting fold contribution of multireads"
+
+    RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
+    uniqcount = RDS.getUniqsCount()
+    splicecount = RDS.getSplicesCount()
+    multicount = RDS.getMultiCount()
+    countDict = {}
+    multicountDict = {}
+    lengthDict = {}
+    gidList = []
+
+    uniqspliceCount = (uniqcount + splicecount) / 1000000.
+    totalCount = (uniqcount + splicecount + multicount) / 1000000.
+
+    symbolDict = {}
+
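+    # the expanded RPKM file stores gene length (kb) and RPKM in its last two
+    # columns; multiplying them by the unique+splice read total (in millions)
+    # recovers the approximate raw read count so multiread counts can be added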
+    for line in expandedRPKMfile:
+        fields = line.strip().split()
+        lineGID = fields[0]
+        symbolDict[lineGID] = fields[1]
+        countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
+        lengthDict[lineGID] = float(fields[-2])
+        multicountDict[lineGID] = 0
+        if lineGID not in gidList:
+            gidList.append(lineGID)
+
+    expandedRPKMfile.close()
+
+    for line in multicountfile:
+        fields = line.strip().split()
+        gid = fields[0]
+        if gid in countDict:
+            countDict[gid] += float(fields[-1])
+            multicountDict[gid] = float(fields[-1])
+        else:
+            print "could not find gid %s in dictionaries" % gid
+
+    multicountfile.close()
+
+    outfile = open(outfilename, "w")
+    outheader = "#"
+    if writeGID:
+        outheader += "GID\t"
+
+    outheader += "gene\tlen_kb\tRPKM"
+    if reportFraction:
+        outheader += "\tmulti/all"
+    elif reportFold:
+        outheader += "\tall/uniq"
+        
+    outheader += "\n"
+    outfile.write(outheader)
+
+    outlineList = []
+    index = 0
+    for gid in gidList:
+        outline = ""
+        gene = symbolDict[gid]
+        rpm = countDict[gid] / totalCount
+        rpkm = rpm / lengthDict[gid]
+        if rpkm < minThreshold:
+            continue
+
+        if writeGID:
+            outline = "%s\t" % gid
+
+        index += 1
+        try:
+            multirpm = multicountDict[gid] / totalCount
+            multirpkm = multirpm / lengthDict[gid]
+        except:
+            print "problem with %s - skipping " % gid
+            continue
+
+        if reportFraction or reportFold:
+            try:
+                if reportFraction:
+                    multivalue = multirpkm / rpkm
+                else:
+                    if rpm > multirpm:
+                        uniqrpkm = (rpm - multirpm) / lengthDict[gid]
+                        multivalue = rpkm / uniqrpkm
+                    elif rpkm > 0.01:
+                        multivalue = 100.
+                    else:
+                        multivalue = 1.0
+            except:
+                multivalue = 0
+
+            outline += "%s\t%.3f\t%.2f\t%.2f\n" %  (gene, lengthDict[gid], rpkm, multivalue)
+            outlineList.append((rpkm, outline))
+        else:
+            outline += "%s\t%.3f\t%.2f\n" %  (gene, lengthDict[gid], rpkm)
+            outlineList.append((rpkm, outline))
+
+    outlineList.sort()
+    outlineList.reverse()
+
+    for (rpkm, line) in outlineList:
+        outfile.write(line)
+
+    outfile.close()
+
+    print "returned %d genes" % index
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/partition.py b/partition.py
new file mode 100755 (executable)
index 0000000..89148fd
--- /dev/null
@@ -0,0 +1,129 @@
+#
+#  partition.py
+#  ENRAGE
+#
+""" usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [--minFeature bp] [--padregion bp] [--mergeregion bp] [--nomerge] [--log altlogfile] [--locid] [--ignorerandom] [--chromField fieldNum]
+           where the regionfiles must be comma-separated with no white space
+           -minFeature controls the size of the smallest partition
+"""
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, string, optparse
+from commoncode import getMergedRegions, writeLog
+
+versionString = '%s: version 2.0' % sys.argv[0]
+print versionString
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog mergeID regionfile1[,regionfile2,...] combpartitionfile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--minFeature", type="int", dest="minFeature",
+                      help="size of smallest partition")
+    parser.add_option("--chromField", type="int", dest="cField",
+                      help="num chromosome fields")
+    parser.add_option("--padregion", type="int", dest="padregion",
+                      help="padding on each side of region")
+    parser.add_option("--mergeregion", type="int", dest="mergeregion",
+                      help="bp threshold to merge regions")
+    parser.add_option("--nomerge", action="store_false", dest="merging",
+                      help="do not merge regions")
+    parser.add_option("--log", dest="logfilename",
+                      help="log file")
+    parser.add_option("--locID", action="store_true", dest="locID",
+                      help="use location as region ID")
+    parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
+                      help="ignore 'random' chromosomes")
+    parser.set_defaults(minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log")
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    mergeID = args[0]
+    regionfiles = args[1]
+    outfilename = args[2]
+
+    if options.padregion:
+        print "padding %d bp on each side of a region" % options.padregion
+
+    if options.mergeregion:
+        print "merging regions closer than %d bp" % options.mergeregion
+
+    if options.locID:
+        print "using locations as region ID"
+
+    if options.ignoreRandom:
+        print "ignoring 'random' chromosomes"
+
+    partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField, options.padregion, options.locID, options.ignoreRandom, options.mergeregion, options.merging, options.logfilename)
+
+
+def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log"):
+
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+
+    allregionsDict = {}
+    regionFileList = regionfiles.split(',')
+    numRegions = len(regionFileList)
+    chromList = []
+    for regionID in range(numRegions):
+        allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist=mergeregion, minHits=-1, fullChrom=True, verbose=True, chromField=cField, doMerge=merging, pad=padregion)
+        for achrom in allregionsDict[regionID]:
+            if achrom not in chromList:
+                chromList.append(achrom)
+            
+    outregionDict = {}
+
+    chromList = sorted(chromList)
+
+    for chrom in chromList:
+        if ignoreRandom and 'random' in chrom:
+            continue
+
+        outregionDict[chrom] = []
+        pointList = []
+        for regionID in range(numRegions):
+            if chrom in allregionsDict[regionID]:
+                for (rstart, rstop, rlength) in allregionsDict[regionID][chrom]:
+                    pointList.append(rstart)
+                    pointList.append(rstop)
+
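+        # pool all region boundaries on this chromosome, then emit every
+        # interval between consecutive boundaries longer than minFeature
+        # as its own partition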
+        pointList.sort()
+        start = 0
+        for point in pointList:
+            if (point - start) > minFeature:
+                outregionDict[chrom].append((start, point - 1, point - 1 - start))
+                start = point
+
+    outfile = open(outfilename, 'w')
+    if locID:
+        outfile.write('#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n')
+    else:
+        outfile.write('#labelID\tchrom\tstart\tstop\tlength_kb\n')
+
+    index = 0
+    for chrom in outregionDict:
+        for (start, stop, length) in outregionDict[chrom]:
+            index += 1
+            if locID:
+                outfile.write("%s:%d-%d\t%s\t%d\t%d\t%.3f\n" % (chrom, start, stop, chrom, start, stop, length/1000.))
+            else:
+                outfile.write("%s%d\t%s\t%d\t%d\t%.3f\n" % (mergeID, index, chrom, start, stop, length/1000.))
+
+    message = "%s was partitioned into %d regions" % (mergeID, index)
+    print message
+    writeLog(logfilename, versionString, message)
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/peakstoregion.py b/peakstoregion.py
new file mode 100755 (executable)
index 0000000..78000f5
--- /dev/null
@@ -0,0 +1,71 @@
+#
+#  peakstoregion.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+
+print "%s: version 1.0" % sys.argv[0]
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 3:
+        print "usage: python %s peakfile outfile [radius] [chromField] [posField] [labelField] [datafield]" % sys.argv[0]
+        sys.exit(1)
+
+    peakfile = argv[1]
+    outfile = argv[2]
+
+    radius = 500
+    chromField = 2
+    posField = 3
+    labelField = 1
+    dataField = -1
+
+    if len(argv) > 3:
+        radius = int(argv[3])
+
+    if len(argv) > 4:
+        chromField = int(argv[4])
+
+    if len(argv) > 5:
+        posField = int(argv[5])
+
+    if len(argv) > 6:
+        labelField = int(argv[6])
+
+    if len(argv) > 7:
+        dataField = int(argv[7])
+
+    peakstoregion(peakfile, outfile, radius, chromField, posField, labelField, dataField)
+
+
+def peakstoregion(peakfilename, outfilename, radius=500, chromField=2, posField=3, labelField=1, dataField=-1):
+    peakfile = open(peakfilename)
+    outfile = open(outfilename, "w")
+
+    for line in peakfile:
+        fields = line.strip().split()
+        label = "REGION"
+        try:
+            label = fields[labelField]
+        except IndexError:
+            pass
+
+        start = int(fields[posField]) - radius
+        stop = int(fields[posField]) + radius
+        outfile.write("%s\t%s\t%d\t%d\t%s\n" % (label, fields[chromField], start, stop, fields[dataField]))
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/plotbardist.py b/plotbardist.py
new file mode 100755 (executable)
index 0000000..52ccbe2
--- /dev/null
@@ -0,0 +1,183 @@
+#
+#  plotbardist.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 12/13/07.
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+import optparse
+import matplotlib
+from pylab import *
+from math import *
+
+
+print "%prog: version 3.2"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog infile1 [infile2] [infile3] [options] outfile.png"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--bins", type="int", dest="bins")
+    parser.add_option("--field", type="int", dest="binnedField")
+    parser.add_option("--binSize", type="float", dest="binLength")
+    parser.add_option("--doLog", type="int", dest="logBase")
+    parser.add_option("--ymax", type="int", dest="maxY")
+    parser.add_option("--xlabel", dest="xLabel")
+    parser.add_option("--ylabel", dest="yLabel")
+    parser.add_option("--binLabels", dest="binLabels", help="comma separated list")
+    parser.add_option("--title", dest="figTitle")
+    parser.add_option("--legend", dest="barsLegend", help="comma separated list")
+    parser.add_option("--xoffset", type="float", dest="pointOffset")
+    parser.add_option("--figsize", dest="figSizes", help="x,y pair")
+    parser.set_defaults(bins=10, binnedField=-1, binLength=-1, logBase=None, maxY=0,
+                        xLabel="bins", yLabel="count", binLabels=None, figTitle="",
+                        barsLegend=None, pointOffset=0., figSizes=None)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+
+    if len(args) < 2 or len(args) > 4:
+        print usage
+        print "where labelList and legendList are comma delimited strings of the form 'labelA,labelB,...,labelN'"
+        sys.exit(1)
+
+    fileList = args[:-1]
+    pngfilename = args[-1]
+
+    plotbardist(fileList, pngfilename, options.bins, options.binnedField, options.binLength,
+                options.logBase, options.maxY, options.xLabel, options.yLabel, options.binLabels,
+                options.figTitle, options.barsLegend, options.pointOffset, options.figSizes)
+
+
+def plotbardist(fileList, pngfilename, bins=10, binnedField=-1, binLength=-1, logBase=None,
+                maxY=0, xLabel="bins", yLabel="count", binLabels=None, figTitle="",
+                barsLegend=None, pointOffset=0., figSizes=None):
+
+    matplotlib.use("Agg")
+    plotParameters = {1: {"width": 0.5,
+                          "offset": [-0.25]},
+                      2: {"width": 0.3,
+                          "offset": [-0.3, 0]},
+                      3: {"width": 0.2,
+                          "offset": [-0.2, 0., 0.2]}
+    }
+
+    colorList = ["b", "r", "c"]
+    width = plotParameters[len(fileList)]["width"]
+    offset = plotParameters[len(fileList)]["offset"]
+
+    doLog = False
+    if logBase is not None:
+        doLog = True
+        print "taking log%d of x datapoints" % logBase
+        xLabel = "log%d(%s)" % (logBase, xLabel)
+    else:
+        logBase = 10
+
+    if figSizes is not None:
+        sizes = figSizes.strip().split(",")
+        figure(figsize=(float(sizes[0]),float(sizes[1])))
+
+    doLabels = False
+    if binLabels is not None:
+        binLabels = binLabels.strip().split(",")
+        doLabels = True
+    else:
+        binLabels = []
+
+    if barsLegend is not None:
+        barsLegend = barsLegend.strip().split(",")
+    else:
+        barsLegend = []
+    
+    ind2 = arange(bins)
+
+    bars = []
+    barsColors = []
+    index = 0
+    for fileName in fileList:
+        aFile = open(fileName)
+        distbin = bins * [0]
+
+        dataList = []
+        for line in aFile:
+            fields = line.strip().split()
+            try:
+                point = float(fields[binnedField]) + pointOffset
+                if doLog:
+                    if point < 1:
+                        point = 1
+
+                    point = log(point, logBase)
+
+                dataList.append(point)
+            except:
+                continue
+
+        print "%d data points" % len(dataList)
+
+        dataList.sort()
+        print "low = %f high = %f" % (dataList[0], dataList[-1])
+
+        if binLength < 0:
+            binLength = abs(dataList[-1] - dataList[0]) / bins
+
+        for point in dataList:
+            try:
+                distbin[int(round(point/binLength))] += 1
+            except:
+                distbin[-1] += 1
+
+        print binLength, int(round(point/binLength))
+
+        bars.append(bar(ind2 + offset[index], distbin, width, color=colorList[index]))
+        barsColors.append(bars[-1][0])
+
+        print distbin
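+        # crude median of the binned distribution: the smallest bin index at
+        # which the cumulative count reaches half of all data points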
+        halfCount = sum(distbin) / 2
+        median = 0
+        foundMedian = False
+        while not foundMedian:
+            if sum(distbin[:median]) < halfCount:
+                median += 1
+            else:
+                foundMedian = True
+
+        print median
+        index += 1
+
+    xlim(-1 * width - 0.2, bins + 0.2)
+
+    if len(barsLegend) > 0:
+        legend(barsColors, barsLegend)
+
+    ylabel(yLabel)
+    xlabel(xLabel)
+
+    if doLabels:
+        setp(gca(), "xticklabels", binLabels)
+
+    if maxY > 0:
+        ylim(0, maxY)
+
+    if len(figTitle) > 0:
+        title(figTitle)
+
+    gca().get_xaxis().tick_bottom()
+    gca().get_yaxis().tick_left()
+
+    savefig(pngfilename)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/plotnomogram.py b/plotnomogram.py
new file mode 100755 (executable)
index 0000000..238a4da
--- /dev/null
@@ -0,0 +1,126 @@
+#
+#  plotnomogram.py
+#  ENRAGE
+#
+
+import sys
+
+import matplotlib
+from pylab import *
+import matplotlib.axes 
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+print "%s: version 1.1" % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 5:
+        print "usage: python %s maxdev xreads infile outpng" % argv[0]
+        sys.exit(1)
+
+    maxdev = float(argv[1])
+    xreads = float(argv[2])
+    infilename = argv[3]
+    outfilename = argv[4]
+
+    plotnomogram(maxdev, xreads, infilename, outfilename)
+
+
+def plotnomogram(maxdev, xreads, infilename, outfilename):
+    matplotlib.use("Agg")
+    infile = open(infilename)
+    line = infile.readline().strip()
+
+    percentages = line.split()
+    del percentages[0]
+
+    listWidth = len(percentages)
+
+    geneValues = {}
+
+    for line in infile:
+        fields = line.strip().split()
+        geneValues[fields[0]] = []
+        for pos in range(listWidth):
+            geneValues[fields[0]].append(float(fields[1 + pos]))
+
+    # categories here are: 3000+, 2999-300, 299-30, 29-3
+    genes3000p = []
+    genes300p = []
+    genes30p = []
+    genes3p = []
+
+    for gene in geneValues:
+        finalLevel = geneValues[gene][0]
+        if finalLevel >= 3000:
+            genes3000p.append(gene)
+        elif finalLevel >= 300:
+            genes300p.append(gene)
+        elif finalLevel >= 30:
+            genes30p.append(gene)
+        elif finalLevel >= 3:
+            genes3p.append(gene)
+
+    organizedList = [genes3000p, genes300p, genes30p, genes3p]
+    listNames = ["3000+ RPKM     ", "300-2999 RPKM", "30-299 RPKM    ", "3-29 RPKM        "]
+    listColors = ["k", "c", "m", "r"]
+    geneCounts = {}
+    oldscores = [0.]
+    newscores = {}
+    for name in listNames:
+        newscores[name] = [0.]
+
+    index = 0
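+    # for each subsampled read depth, record the fraction of genes in each
+    # expression class whose RPKM stays within maxdev of its final value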
+    for percent in percentages[1:]:
+        oldscores.append(xreads * float(percent) / 100.)
+        index += 1
+        listindex = 0
+        for geneList in organizedList:
+            geneCount = len(geneList)
+            numOver = 0.
+            for gene in geneList:
+                finalVal = geneValues[gene][0]
+                currentVal = geneValues[gene][index]
+                if abs((currentVal - finalVal) / finalVal) > maxdev:
+                    numOver += 1.
+
+            fraction = 1. - numOver / geneCount
+            print "%s %s %d %.2f" % (percent, listNames[listindex], geneCount, fraction)
+            newscores[listNames[listindex]].append(fraction)
+            geneCounts[listNames[listindex]] = geneCount        
+            listindex += 1
+
+    matplotlib.axes._process_plot_var_args.defaultColors = ["k", "y", "m", "c", "b", "g", "r"]
+
+    oldscores.append(xreads)
+    index = 0
+    plots = []
+    plotsColors = []
+    plotsLegend = []
+    for name in listNames:
+        newscores[name].append(1.0)
+        plots.append(plot(oldscores, newscores[name], listColors[index], linewidth=2))
+        plot(oldscores[1:-1], newscores[name][1:-1], listColors[index] + "^")
+        plotsColors.append(plots[-1][0])
+        plotsLegend.append("%s n = %d" % (name, geneCounts[name]))
+        index += 1
+
+    legend(plotsColors, plotsLegend, loc=0)
+    xticks(oldscores)
+    locs, labels = xticks()
+    setp(labels, rotation="vertical")
+    ylim(0, 1.03)
+    xlim(-0.1, xreads + .1)
+    savefig(outfilename)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/plotprofile.py b/plotprofile.py
new file mode 100755 (executable)
index 0000000..854affa
--- /dev/null
@@ -0,0 +1,129 @@
+#
+#  plotprofile.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+import optparse
+from pylab import *
+from math import *
+import matplotlib
+
+
+print "%prog: version 2.2"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s infile outfile.png [--scale] [--max weightMax] [--ymin bottom] [--ymax top] [--subtractEvens]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--scale", action="store_true", dest="doScale")
+    parser.add_option("--max", type="float", dest="weightMax")
+    parser.add_option("--ymin", type="float", dest="ymin")
+    parser.add_option("--ymax", type="float", dest="ymax")
+    parser.add_option("--subtractEvens", action="store_true", dest="subtractEvens")
+    parser.set_defaults(doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    infile = args[0]
+    pngfilename = args[1]
+
+    plotprofile(infile, pngfilename, options.doScale, options.weightMax, options.ymin, options.ymax, options.subtractEvens)
+
+
+def plotprofile(inFileName, pngfilename, doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False):
+    infile = open(inFileName)
+    limitYscale = False
+    if ymax is not None:
+        limitYscale = True
+    else:
+        ymax = 0.
+
+    if ymin is not None:
+        limitYscale = True
+    else:
+        ymin = 0.
+
+    labelList = []
+    dataList = []
+    plotList = []
+    xmin = 10**20
+    xmax = -10**20
+
+    xcoordList = []
+    datapointList = []
+    weightList = []
+    line = infile.readline()
+    fields = line.strip().split()
+    for data in fields[1:-1]:
+        datapoint = float(data)
+        if datapoint < xmin:
+            xmin = datapoint
+
+        if datapoint > xmax:
+            xmax = datapoint
+
+        xcoordList.append(datapoint)
+
+    index = 1
+    for line in infile:
+        fields = line.strip().split()
+        datapointList = []
+        for data in fields[1:-1]:
+            datapointList.append(float(data))
+
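+        # with --subtractEvens, each even-numbered row is subtracted point-by-point
+        # from the odd-numbered row that precedes it instead of being plotted itself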
+        if subtractEvens and index % 2 == 0:
+            for dataIndex in range(len(datapointList)):
+                dataList[-1][dataIndex] -= datapointList[dataIndex]
+        else:
+            dataList.append(datapointList)
+
+        weight = float(fields[-1])
+        if subtractEvens and index % 2 == 0:
+            pass
+        else:
+            labelList.append(fields[0])
+            if weight > weightMax:
+                weightMax = weight
+
+            weightList.append(weight)
+
+        index += 1
+
+    for index in range(len(dataList)):
+        newList = []
+        if doScale:
+            scale = weightList[index] / weightMax
+            print weightList[index], weightMax, scale
+            for val in dataList[index]:
+                newList.append(val * scale)
+        else:
+            newList = dataList[index]
+
+        plotList.append(plot(xcoordList, newList, linewidth=3.0))
+
+    xticks(xcoordList, rotation="vertical")
+    xlim(xmin - 0.1, xmax + 0.1)
+    if limitYscale:
+        ylim(ymin, ymax)
+
+    legend(plotList, labelList)
+    savefig(pngfilename)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/predictSpliceCount.py b/predictSpliceCount.py
new file mode 100755 (executable)
index 0000000..bab85db
--- /dev/null
@@ -0,0 +1,84 @@
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+import sys
+from cistematic.genomes import Genome
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print '%s: version 1.1' % argv[0]
+
+    if len(argv) < 6:
+        print 'usage: python %s genome maxBorder uniquecountfile splicecountfile outfile' % argv[0]
+        sys.exit(1)
+
+    genome = argv[1]
+    # number of nucleotides at the end of each exon that is affected by splicing
+    splicelead = int(argv[2])
+    uniquefilecount = argv[3]
+    splicefilecount =  argv[4]
+    outfilename = argv[5]
+
+    predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename)
+
+
+def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
+    hg = Genome(genome)
+
+    gidDict = {}
+    gidList = []
+    uniqueCountDict = {}
+    spliceCountDict = {}
+
+    uniquefile = open(uniquefilecount)
+    for line in uniquefile:
+        fields = line.strip().split()
+        gidDict[fields[0]] = fields[1]
+        gidList.append(fields[0])
+        uniqueCountDict[fields[0]] = int(fields[2])
+
+    splicefile = open(splicefilecount)
+    for line in splicefile:
+        fields = line.strip().split()
+        spliceCountDict[fields[0]] = int(fields[2])
+
+    outfile = open(outfilename,'w')
+
+    gidList.sort()
+    for gid in gidList:
+        symbol = gidDict[gid]
+        featureList = hg.getGeneFeatures((genome, gid))
+        newfeatureList = []
+        featuresizesum = 0
+        for (ftype, chrom, start, stop, sense) in featureList:
+            if (start, stop) not in newfeatureList:
+                newfeatureList.append((start, stop))
+                featuresizesum += stop - start + 1
+
+        if featuresizesum < 1:
+            featuresizesum = 1
+
+        splicearea = (len(newfeatureList) - 1) * splicelead
+        if splicearea < splicelead:
+            splicearea = 0
+
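+        # fractionCoverage is the exonic share of the total target length
+        # (exon bases plus splice-border bases); dividing the observed unique
+        # count by it extrapolates the expected number of splice-crossing reads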
+        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+        expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+
+        # this p-value is based on the observed unique count, not the expected total count
+        # nor the multi-read adjusted count
+        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
+        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
+        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/profilebins.py b/profilebins.py
new file mode 100755 (executable)
index 0000000..46274f5
--- /dev/null
@@ -0,0 +1,154 @@
+#
+#  profilebins.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+print "%prog: version 2.2"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog label infile1 [--upstream infile2] [--downstream infile3] [--uplength kb] [--downlength kb] [--gene geneName] [--genes genefile] [--append] outfile"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--upstream", dest="upfilename")
+    parser.add_option("--downstream", dest="downfilename")
+    parser.add_option("--uplength", type="float", dest="uplength")
+    parser.add_option("--downlength", type="int", dest="")
+    parser.add_option("--gene", dest="gene")
+    parser.add_option("--genes", dest="genefile")
+    parser.add_option("--append", action="store_true", dest="doAppend")
+    parser.set_defaults(upfilename=None, downfilename=None, uplength=0.0, downlength=0.0,
+                        gene=None, genefile=None, doAppend=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    label = args[0]
+    infilename = args[1]
+    outfilename = args[2]
+
+    profilebins(label, infilename, outfilename, options.upfilename, options.downfilename,
+                options.uplength, options.downlength, options.gene, options.genefile,
+                options.doAppend)
+
+
+def profilebins(label, infilename, outfilename, upfilename=None, downfilename=None,
+                uplength=0.0, downlength=0.0, gene=None, genefile=None, doAppend=False):
+
+    fileList = [infilename]
+    geneList = []
+    restrictGenes = False
+    if gene is not None:
+        geneList.append(gene)
+        restrictGenes = True
+
+    if genefile is not None:
+        genefileHandle = open(genefile)
+        for line in genefileHandle:
+            fields = line.strip().split()
+            if len(fields) > 1:
+                geneList.append(fields[0])
+            else:
+                geneList.append(line.strip())
+
+        restrictGenes = True
+
+    if upfilename is not None:
+        fileList = [upfilename, infilename]
+
+    if downfilename is not None:
+        fileList.append(downfilename)
+
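+    # the gene body is mapped onto a fixed 10-unit stretch of the x-axis; optional
+    # upstream and downstream panels are placed before 0 and after 10, with widths
+    # given by --uplength and --downlength (in kb)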
+    partLength = [10.]
+    partOffset = [0.]
+
+    if uplength:
+        partLength = [uplength, 10.]
+        partOffset = [-1. * uplength, 0.]
+
+    if downlength:
+        partLength.append(downlength)
+        partOffset.append(10.)
+
+    totalWeight = 0.
+    totalBins = []
+    for afile in fileList:   
+        infile = open(afile)
+
+        line = infile.readline()
+        fields = line.strip().split()
+        numBins = len(fields) - 4
+
+        geneName = fields[1]
+        weight = float(fields[2])
+        if not restrictGenes or geneName in geneList:
+            totalWeight += weight
+
+        totalBins.append([])
+        for myBin in fields[4:]:
+            if not restrictGenes or (restrictGenes and geneName in geneList):
+                totalBins[-1].append(weight * float(myBin))
+            else:
+                totalBins[-1].append(0.)
+
+        for line in infile:
+            fields = line.strip().split()
+            geneName = fields[1]
+            if restrictGenes and geneName not in geneList:
+                continue
+
+            weight = float(fields[2])
+            index = 0
+            for myBin in fields[4:]:
+                totalBins[-1][index] += weight * float(myBin)
+                index += 1
+
+            totalWeight += weight
+
+    sumWeight = 0.
+    totalPercent = 0.
+    if doAppend:
+        outfile = open(outfilename, "a")
+    else:
+        outfile = open(outfilename, "w")
+        outfile.write("x-axis")
+        partIndex = 0
+        for partBins in totalBins:
+            partLen = partLength[partIndex]
+            numBins = len(partBins)
+            for binIndex in range(numBins):
+                outfile.write("\t%.2f" % (partOffset[partIndex] + (binIndex * partLen/numBins)))
+
+            partIndex += 1
+
+        outfile.write("\tweight\n")
+
+    outfile.write(label)
+    for partBins in totalBins:
+        for aBin in partBins:
+            percent = aBin / totalWeight
+            outfile.write("\t%.1f" % percent)
+            sumWeight += aBin
+            totalPercent += percent
+
+    outfile.write("\t%.1f\n" % totalWeight)
+    outfile.close()
+
+    print sumWeight
+    print totalPercent
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/ratio.py b/ratio.py
new file mode 100755 (executable)
index 0000000..ae14cfe
--- /dev/null
+++ b/ratio.py
@@ -0,0 +1,83 @@
+import sys
+import string
+import optparse
+import math
+
+print "%prog: version 2.3"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog denominatorField infile [--only fieldID] [--out outfile]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--only", type="int", dest="onlyField")
+    parser.add_option("--out", dest="outFileName")
+    parser.set_defaults(outFileName=None, onlyField=-1)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    field = int(args[0])
+    if args[1] == "-":
+        inFileName = None
+    else:
+        inFileName = args[1]
+
+    ratio(field, inFileName, options.outFileName, options.onlyField)
+
+def ratio(field, inFileName, outFileName=None, onlyField=-1):
+
+    if inFileName is not None:
+        infile = open(inFileName)
+    else:
+        infile = sys.stdin
+
+    record = False
+    if outFileName is not None:
+        outfile = open(outFileName, "w")
+        record = True
+
+    doOnly = False
+    if onlyField != -1:
+        doOnly = True
+
+    line = infile.readline()
+    count = len(line.strip().split())
+    if record:
+        outfile.write(line)
+
+    for line in infile:
+        fields = line.strip().split()
+        outline = str(fields[0])
+        outError = False
+        for index in range(1, count):
+            if field == index:
+                outline = string.join([outline, "0"], " ")
+            elif doOnly and index != onlyField:
+                outline = string.join([outline, str(fields[index])], " ")
+            else:
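+                # log2 ratio with a +1 pseudocount on both values to avoid
+                # division by zero and log(0)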
+                try:
+                    ratioString = "%2.2f" % math.log((float(fields[index]) + 1)/(float(fields[field]) + 1), 2)
+                    outline = string.join([outline, ratioString], " ")
+                except:
+                    try:
+                        outline = string.join([outline, "e%s" % fields[index]], " ")
+                    except:
+                        outError = True
+
+        if outError:
+            continue
+
+        if record:
+            outfile.write(outline + "\n")
+        else:
+            print outline
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/rdsmetadata.py b/rdsmetadata.py
new file mode 100755 (executable)
index 0000000..1ac458b
--- /dev/null
@@ -0,0 +1,106 @@
+#
+#  rdsmetadata.py
+#  ENRAGE
+#
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+import optparse
+from commoncode import readDataset
+
+print "%prog: version 2.7"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog rdsfile [propertyName1::propertyValue1] ... [propertyNameN::propertyValueN] [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--defaultcache", type="int", dest="cacheVal")
+    parser.add_option("--index", action="store_true", dest="buildIndex")
+    parser.add_option("--dropindex", action="store_true", dest="dropIndex")
+    parser.add_option("--nocount", action="store_false", dest="doCount")
+    parser.add_option("--complexity", action="store_true", dest="doComplexity")
+    parser.add_option("--reset", action="store_true", dest="resetFlags")
+    parser.add_option("--initrna", action="store_true", dest="rnaDataType")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(cacheVal=0, buildIndex=False, dropIndex=False, doCount=True,
+                        doComplexity=False, resetFlags=False, rnaDataType=False,
+                        cachePages=-1)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 1:
+        print usage
+        print "where the optional metadata name::value pairs are added to the existing dataset"
+        sys.exit(1)
+
+    datafile = args[0]
+
+    propertyList=[]
+    for arg in args:
+        if "::" in arg:
+            (pname, pvalue) = arg.strip().split("::")
+            print "adding %s : %s" % (pname, pvalue)
+            propertyList.append((pname, pvalue))
+
+    rdsmetadata(datafile, propertyList, options.cacheVal, options.buildIndex,
+                options.dropIndex, options.doCount, options.doComplexity,
+                options.resetFlags, options.rnaDataType, options.cachePages)
+
+
+def rdsmetadata(datafile, propertyList=[], cacheVal=0, buildIndex=False,
+                dropIndex=False, doCount=True, doComplexity=False, resetFlags=False,
+                rnaDataType=False, cachePages=-1):
+
+    doCache = False
+    if cachePages != -1:
+        doCache = True
+
+    if rnaDataType:
+        rds = readDataset(datafile, initialize=True, datasetType="RNA", verbose=True, cache=doCache)
+    else:
+        rds = readDataset(datafile, verbose=True, reportCount=doCount, cache=doCache)
+
+    if cachePages > rds.getDefaultCacheSize():
+        rds.setDBcache(cachePages)
+
+    if cacheVal > 0:
+        rds.setDBcache(cacheVal, default=True)
+        print "set default cache size to %d pages" % cacheVal
+
+    if resetFlags:
+        print "clearing read flags"
+        rds.resetFlags()
+
+    if dropIndex:
+        try:
+            rds.dropIndex()
+        except:
+            print "could not drop index"
+
+    if buildIndex:
+        print "building index...."
+        if cacheVal > 0:
+            rds.buildIndex(cacheVal)
+        else:
+            rds.buildIndex()
+
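+    # complexity is reported as distinct uniques / total uniques; values well
+    # below 1.0 suggest a large fraction of duplicated reads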
+    if doComplexity:
+        print "calculating uniq read complexity..."
+        uniqs = rds.getUniqsCount(distinct=False)
+        distincts = rds.getUniqsCount(distinct=True)
+        print "%d distincts / %d uniqs = %.2f" % (distincts, uniqs, float(distincts) / uniqs)
+
+    if len(propertyList) > 0:
+        rds.insertMetadata(propertyList)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/recordLog.py b/recordLog.py
new file mode 100755 (executable)
index 0000000..731d378
--- /dev/null
@@ -0,0 +1,31 @@
+#
+#  recordLog.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 12/14/08.
+#
+
+import sys
+from commoncode import writeLog
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if "-verbose" in argv or len(argv) < 4:
+        print "%s: version 1.0" % sys.argv[0]
+    
+    if len(argv) < 4:
+        print "usage: python %s logFile messenger message [--verbose]" % argv[0]
+        sys.exit(1)
+
+    logFile = argv[1]
+    messenger = argv[2]
+    message = argv[3]
+
+    writeLog(logFile, messenger, message)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/regionBins.py b/regionBins.py
new file mode 100755 (executable)
index 0000000..2d1649b
--- /dev/null
@@ -0,0 +1,89 @@
+#
+#  regionBins.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+import sys
+print '%s: version 2.0' % sys.argv[0]
+
+if len(sys.argv) < 4:
+    print 'usage: python %s regionfile rdsfile outfilename [-bins numbins] [-field fieldNum] [-raw] [-padregion bp] [-mergeregion bp] [-cache]' % sys.argv[0]
+    sys.exit(1)
+
+from commoncode import *
+
+regionfilename = sys.argv[1]
+hitfile =  sys.argv[2]
+outfilename = sys.argv[3]
+
+if '-raw' in sys.argv:
+    normalize = False
+    normalizeBins = False
+else:
+    normalize = True
+    normalizeBins = True    
+
+doCache = False
+if '-cache' in sys.argv:
+    doCache = True
+
+cField = 1
+if '-field' in sys.argv:
+    fieldIndex = sys.argv.index('-field') + 1
+    cField = int(sys.argv[fieldIndex])
+
+padregion = 0
+if '-padregion' in sys.argv:
+    padField = sys.argv.index('-padregion') + 1
+    padregion = int(sys.argv[padField])
+    print 'padding %d bp on each side of a region' % padregion
+
+mergeregion = 0
+if '-mergeregion' in sys.argv:
+    mergeField = sys.argv.index('-mergeregion') + 1
+    mergeregion = int(sys.argv[mergeField])
+    print 'merging regions closer than %d bp' % mergeregion
+
+bins = 10
+if '-bins' in sys.argv:
+    binfield = sys.argv.index('-bins') + 1
+    bins = int(sys.argv[binfield])
+
+hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+readlen = hitRDS.getReadSize()
+normalizationFactor = 1.0
+if normalize:
+    totalCount = len(hitRDS)
+    normalizationFactor = totalCount / 1000000.
+
+chromList = hitRDS.getChromosomes(fullChrom=False)
+chromList.sort()
+
+regionDict = getMergedRegions(regionfilename, maxDist = mergeregion, keepLabel = True, verbose = True, chromField = cField, pad=padregion)
+
+hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+
+(regionsBins, regionsLen) = computeRegionBins(regionDict, hitDict, bins, readlen, normalizationFactor)
+
+outfile = open(outfilename, 'w')
+for regionID in regionsBins:
+    tagCount = 0.
+    for binAmount in regionsBins[regionID]:
+        tagCount += binAmount
+    outfile.write('%s\t%s\t%.1f\t%d' % (regionID, regionID, tagCount, regionsLen[regionID]))
+    for binAmount in regionsBins[regionID]:
+        if normalizeBins:
+            if tagCount == 0:
+                tagCount = 1
+
+            outfile.write('\t%.1f' % (100. * binAmount / tagCount))
+        else:
+            outfile.write('\t%.1f' % binAmount)
+
+    outfile.write('\n')
+
+outfile.close()
\ No newline at end of file
diff --git a/regionCounts.py b/regionCounts.py
new file mode 100755 (executable)
index 0000000..0104cc2
--- /dev/null
@@ -0,0 +1,221 @@
+#
+#  regionCounts.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    print 'psyco not running'
+
+import sys, string, optparse
+from commoncode import readDataset, getMergedRegions, findPeak, writeLog
+
+versionString = "%prog: version 3.9"
+print versionString
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog regionfile rdsfile outfilename [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--markRDS", action="store_true", dest="flagRDS")
+    parser.add_option("--chromField", type="int", dest="cField")
+    parser.add_option("--fullchrom", action="store_true", dest="useFullchrom")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--padregion", type="int", dest="padregion")
+    parser.add_option("--mergeregion", type="int", dest="mergeregion")
+    parser.add_option("--nomerge", action="store_false", dest="merging")
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
+    parser.add_option("--noMulti", action="store_false", dest="doMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--peak", action="store_true", dest="usePeak")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--log", dest="logfilename")
+    parser.add_option("--rpkm", action="store_true", dest="doRPKM")
+    parser.add_option("--length", action="store_true", dest="doLength")
+    parser.add_option("--force", action="store_true", dest="forceRegion")
+    parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True,
+                        padregion=0, mergeregion=0, merging=True, doUniqs=True,
+                        doMulti=True, doSplices=False, usePeak=False, cachePages=-1,
+                        logfilename="regionCounts.log", doRPKM=False, doLength=False,
+                        forceRegion=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    regionfilename = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+                 options.useFullchrom, options.normalize, options.padregion,
+                 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
+                 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
+                 options.doRPKM, options.doLength, options.forceRegion)
+
+
+def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
+                 useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
+                 merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
+                 cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
+                 forceRegion=False):
+
+    print "padding %d bp on each side of a region" % padregion
+    print "merging regions closer than %d bp" % mergeregion
+    print "will use peak values"
+
+    if cachePages != -1:
+        doCache = True
+    else:
+        doCache = False
+
+    if doRPKM:
+        normalize = True
+
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+
+    regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, minHits=-1, keepLabel=True,
+                                  fullChrom=useFullchrom, verbose=True, chromField=cField,
+                                  doMerge=merging, pad=padregion)
+
+    labelList = []
+    labeltoRegionDict = {}
+    regionCount = {}
+
+    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    readlen = hitRDS.getReadSize()
+    if cachePages > hitRDS.getDefaultCacheSize():
+        hitRDS.setDBcache(cachePages)
+
+    totalCount = len(hitRDS)
+    if normalize:
+        normalizationFactor = totalCount / 1000000.
+
+    chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
+    if len(chromList) == 0 and doSplices:
+        chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
+
+    chromList.sort()
+
+    if flagRDS:
+        hitRDS.setSynchronousPragma("OFF")        
+
+    for rchrom in regionDict:
+        if forceRegion and rchrom not in chromList:
+            print rchrom
+            for (label, start, stop, length) in regionDict[rchrom]:
+                regionCount[label] = 0
+                labelList.append(label)
+                labeltoRegionDict[label] = (rchrom, start, stop)
+
+    for rchrom in chromList:
+        regionList = []
+        if rchrom not in regionDict:
+            continue
+
+        print rchrom
+        if useFullchrom:
+            fullchrom = rchrom
+        else:
+            fullchrom = "chr%s" % rchrom
+
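+        # with --peak, a region is scored by the height of its smoothed read peak
+        # (findPeak); otherwise the total read count over the region is used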
+        if usePeak:
+            readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
+            rindex = 0
+            dictLen = len(readDict[fullchrom])
+
+        for (label, start, stop, length) in regionDict[rchrom]:
+            regionCount[label] = 0
+            labelList.append(label)
+            labeltoRegionDict[label] = (rchrom, start, stop)
+
+        if useFullchrom:
+            fullchrom = rchrom
+        else:
+            fullchrom = "chr%s" % rchrom
+
+        for (label, rstart, rstop, length) in regionDict[rchrom]:
+            regionList.append((label, fullchrom, rstart, rstop))
+            if usePeak:
+                readList = []
+                for localIndex in xrange(rindex, dictLen):
+                    read = readDict[fullchrom][localIndex]
+                    if read[0] < rstart:
+                        rindex += 1
+                    elif rstart <= read[0] <= rstop:
+                        readList.append(read)
+                    else:
+                        break
+
+                if len(readList) < 1:
+                    continue
+
+                readList.sort()
+                (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True)
+                try:
+                    topValue = smoothArray[topPos[0]]
+                except:
+                    print "problem with %s %s" % (str(topPos), str(smoothArray))
+                    continue
+
+                regionCount[label] += topValue
+            else:
+                regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+        if flagRDS:
+            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+
+    if flagRDS:
+        hitRDS.setSynchronousPragma("ON")    
+
+    if normalize:
+        for label in regionCount:
+            regionCount[label] = float(regionCount[label]) / normalizationFactor
+
+    outfile = open(outfilename, "w")
+
+    if forceRegion:
+        labelList.sort()
+
+    for label in labelList:
+        (chrom, start, stop) = labeltoRegionDict[label]
+        if useFullchrom:
+            fullchrom = chrom
+        else:
+            fullchrom = "chr%s" % chrom
+
+        if normalize:
+            if doRPKM:
+                length = abs(stop - start) / 1000.
+            else:
+                length = 1.
+
+            if length < 0.001:
+                length = 0.001
+
+            outfile.write("%s\t%s\t%d\t%d\t%.2f" % (label, fullchrom, start, stop, regionCount[label]/length))
+            if doLength:
+                outfile.write("\t%.1f" % length)
+        else:
+            outfile.write('%s\t%s\t%d\t%d\t%d' % (label, fullchrom, start, stop, regionCount[label]))
+
+        outfile.write("\n")
+
+    outfile.close()
+    if doCache and flagRDS:
+        hitRDS.saveCacheDB(hitfile)
+
+    writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/regionintersects.py b/regionintersects.py
new file mode 100755 (executable)
index 0000000..340d2f8
--- /dev/null
@@ -0,0 +1,203 @@
+#
+#  regionintersects.py
+#  ENRAGE
+#
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, optparse
+from commoncode import readDataset, getMergedRegions, findPeak
+
+print "%prog: version 3.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--reject1", dest="rejectOneName")
+    parser.add_option("--reject2", dest="rejectTwoName")
+    parser.add_option("--union", action="store_true", dest="trackReject")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False,
+                        doCache=False, normalize=True, doVerbose=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 5:
+        print usage
+        sys.exit(1)
+
+    readOneName =  args[0]
+    regionOneName = args[1]
+    readTwoName = args[2]
+    regionTwoName = args[3]
+    outfilename = args[4]
+
+    regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
+                     outfilename, options.rejectOneName, options.rejectTwoName,
+                     options.trackReject, options.doCache, options.normalize,
+                     options.doVerbose)
+
+
+def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
+                     outfilename, rejectOneName=None, rejectTwoName=None,
+                     trackReject=False, doCache=False, normalize=True, doVerbose=False):
+
+    mergedist=0
+
+    outfile = open(outfilename, "w")
+
+    doReject = False
+    if rejectOneName is not None:
+        trackReject = True
+        doReject = True
+        rejectOne = open(rejectOneName, "w")
+
+    if rejectTwoName is not None:
+        trackReject = True
+        doReject = True
+        rejectTwo = open(rejectTwoName, "w")
+
+    oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
+    twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
+
+    oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache) 
+    twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache)
+
+    if normalize:
+        normalize1 = len(oneRDS) / 1000000.
+        normalize2 = len(twoRDS) / 1000000.
+    else:
+        normalize1 = 1.
+        normalize2 = 1.
+
+    commonRegions = 0
+    oneRejectIndex = 0
+    twoRejectIndex = 0
+
+    onePeaksDict = {}
+    oneFoundDict = {}
+
+    numRegionsOne = 0
+    numRegionsTwo = 0
+    for rchrom in oneDict:
+        numRegionsOne += len(oneDict[rchrom])
+
+    for rchrom in twoDict:
+        numRegionsTwo += len(twoDict[rchrom])
+
+    outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
+
+    for rchrom in oneDict:
+        if rchrom not in twoDict:
+            continue
+
+        print rchrom
+        rindex = 0
+        rindex2 = 0
+        fullchrom = "chr" + rchrom
+        oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
+        dictLen1 = len(oneReads[fullchrom])
+        twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
+        dictLen2 = len(twoReads[fullchrom])
+        chrom = rchrom
+        onePeaksDict[chrom] = []
+        oneFoundDict[chrom] = []
+        for (start, stop, length) in oneDict[chrom]:
+            readList = []
+            for localIndex in xrange(rindex, dictLen1):
+                read = oneReads[fullchrom][localIndex]
+                if read[0] < start:
+                    rindex += 1
+                elif start <= read[0] <= stop:
+                    readList.append(read)
+                else:
+                    break
+
+            if len(readList) < 1:
+                continue
+
+            readList.sort()
+
+            (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True)
+            onePeakScore = smoothArray[topPos[0]]
+            onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1))
+
+        for (start, stop, length) in twoDict[chrom]:
+            readList2 = []
+            for localIndex in xrange(rindex2, dictLen2):
+                read = twoReads[fullchrom][localIndex]
+                if read[0] < start:
+                    rindex2 += 1
+                elif start <= read[0] <= stop:
+                    readList2.append(read)
+                else:
+                    break
+
+            if len(readList2) < 1:
+                continue
+
+            readList2.sort()
+            (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True)
+            numHits /= normalize2
+            twoIsCommon = False
+            twoPeak = topPos[0] + start
+            twoRadius = length/2
+            twoPeakScore = smoothArray[topPos[0]] / normalize2
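+            # two regions are called common when their peak positions lie within
+            # the sum of their half-widths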
+            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
+                if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
+                    if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
+                        oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits))
+
+                    twoIsCommon = True
+                    commonRegions += 1
+                    outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore)
+                    if doVerbose:
+                        print outline
+
+                    outfile.write(outline + "\n")
+
+            if trackReject and not twoIsCommon:
+                twoRejectIndex += 1
+                outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore)
+                if doReject:
+                    rejectTwo.write(outline + "\n")
+                else:
+                    outfile.write(outline + "\n")
+
+                if doVerbose:
+                    print outline
+
+        if trackReject:
+            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
+                if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
+                    oneRejectIndex += 1
+                    outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore)
+                    if doReject:
+                        rejectOne.write(outline + "\n")
+                    else:
+                        outfile.write(outline + "\n")
+
+                    if doVerbose:
+                        print outline
+
+    if trackReject:
+        print "common: %d   one-only: %d   two-only: %d" % (commonRegions, oneRejectIndex, twoRejectIndex)
+        outfile.write("#common: %d\tone-only: %d\ttwo-only: %d\n" % (commonRegions, oneRejectIndex, twoRejectIndex))
+    else:
+        print "common: %d" % commonRegions
+        outfile.write("#common: %d\n" % commonRegions)
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/regiontobed.py b/regiontobed.py
new file mode 100755 (executable)
index 0000000..e6ce22a
--- /dev/null
@@ -0,0 +1,113 @@
+"""
+    usage: python regiontobed label regionfile outbedfile [--color r,g,b] [--score field] [--narrowPeak] [--broadPeak] [--itemRgb] [--nolabel]
+           where color is in comma-delimited RGB without space
+           and field is a column with a score (first column is 0, second is 1,...)
+           --narrowPeak assumes that findall.py was run with -listPeak
+           --broadPeak assumes that findall.py was *NOT* run with -listPeak
+"""
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, math, optparse
+
+print "%prog: version 3.1"
+
+
+def usage():
+    print __doc__
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = __doc__
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--color", dest="color")
+    parser.add_option("--score", type="int", dest="scoreField")
+    parser.add_option("--narrowPeak", action="store_true", dest="doNarrow")
+    parser.add_option("--broadPeak", action="store_true", dest="doBroad")
+    parser.add_option("--itemRgb", action="store_true", dest="itemRGB")
+    parser.add_option("--nolabel", action="store_true", dest="noLabel")
+    parser.set_defaults(color="0,0,0", scoreField=None, doNarrow=False,
+                        doBroad=False, itemRGB=False, noLabel=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        usage()
+        sys.exit(2)
+
+    factorlabel = args[0]
+    regionfile = args[1]
+    outfile = args[2]
+
+    regiontobed(factorlabel, regionfile, outfile, options.color,
+                options.scoreField, options.doNarrow, options.doBroad,
+                options.itemRGB, options.noLabel)
+
+
+def regiontobed(factorlabel, regionFileName, outFileName, color="0,0,0",
+                scoreField=None, doNarrow=False, doBroad=False, itemRGB=False,
+                noLabel=False):
+
+    regionfile = open(regionFileName)
+    outfile = open(outFileName, "w")
+
+    if itemRGB:
+        print "assigning each item its color"
+
+    if noLabel:
+        if itemRGB:
+            outfile.write('track name=%s visibility=4 itemRgb="on"\n' % factorlabel)
+        else:
+            outfile.write("track name=%s visibility=4 color=%s\n" % (factorlabel, color))
+
+    for line in regionfile:
+        if line[0] == "#":
+            continue
+
+        fields = line.strip().split()
+        if doNarrow:
+            signalVal = float(fields[4])
+            pval = float(fields[-1])
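+            # score is -log10(p); a reported p-value of 0 is capped at 350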
+            if pval == 0.:
+                pValue = 350
+            else:
+                pValue = -1. * math.log(pval, 10)
+
+            peakPos = int(fields[9]) - int(fields[2])
+            outfile.write("%s\t%s\t%s\t%s\t%d\t.\t%.4f\t%.4f\t-1\t%d" % (fields[1], fields[2], fields[3], fields[0], 0, signalVal, pValue, peakPos))
+        elif doBroad:
+            signalVal = float(fields[4])
+            pval = float(fields[-1])
+            if pval == 0.:
+                pValue = 350
+            else:
+                pValue = -1. * math.log(pval, 10)
+
+            outfile.write("%s\t%s\t%s\t%s\t%d\t.\t%.4f\t%.4f\t-1" % (fields[1], fields[2], fields[3], fields[0], 0, signalVal, pValue))
+        elif scoreField is not None:
+            score = int(float(fields[scoreField]))
+            if score > 1000:
+                score = 1000
+
+            outfile.write("%s\t%s\t%s\t%s\t%s" % (fields[1], fields[2], fields[3], fields[0], score))
+            if itemRGB:
+                outfile.write("\t+\t-\t-\t%s" % color)
+        else:
+            outfile.write("%s\t%s\t%s\t%s" % (fields[1], fields[2], fields[3], fields[0]))
+            if itemRGB:
+                outfile.write("\t1000\t+\t-\t-\t%s" % color)
+
+        outfile.write("\n")
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/rnaAToIFilter.py b/rnaAToIFilter.py
new file mode 100644 (file)
index 0000000..aefa78b
--- /dev/null
@@ -0,0 +1,39 @@
+import sys
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s infile outfile" % sys.argv[0]
+
+    if len(argv) < 3:
+        print usage
+        sys.exit(1)
+
+    infile = open(argv[1])
+    outfile = open(argv[2], "w")
+
+    lines = infile.readlines()
+    outputLines = rnaAToIFilter(lines)
+
+    for line in outputLines:
+        outfile.write(line)
+
+    outfile.close()
+
+
+def rnaAToIFilter(snpPropertiesList):
+    outputLines = []
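+    # A-to-I editing shows up as A-G mismatches on features flagged F and as
+    # T-C mismatches on features flagged R, so only those calls are kept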
+    for line in snpPropertiesList:
+        fields = line.split()
+        if fields[13] == "F" and fields[7] == "A-G":
+            outputLines.append(line)
+        elif fields[13] == "R" and fields[7] == "T-C":
+            outputLines.append(line)
+
+    return outputLines
+
+
+if __name__ == '__main__':
+    main(sys.argv)
\ No newline at end of file
diff --git a/rnaEditing.py b/rnaEditing.py
new file mode 100644 (file)
index 0000000..30de5a3
--- /dev/null
@@ -0,0 +1,87 @@
+"""
+Based on shell script provided by Ali.
+"""
+
+import sys
+import optparse
+from Erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
+from Erange.commoncode import countDuplicatesInList
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog dbfile snpsfile genome rpkmfile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--goprefix", dest="prefix")
+    parser.add_option("--novelsnp", dest="novelsnpoutfilename")
+    parser.add_option("--bedfile", dest="bedoutfilename")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--snpDB", action="append", dest="snpDBList",
+                      help="additional snp db files to check will be searched in order given")
+    parser.set_defaults(prefix=None, novelsnpoutfilename=None, bedoutfilename=None, cachePages=None, snpDBList=[])
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    dbfile = args[0]
+    hitfile = args[1]
+    genome = args[2]
+    rpkmfilename = args[3]
+
+    if options.cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+
+    # get the SNPs
+    snpList = getSNPs.getSNPs(hitfile, 3, 0.25, doCache, options.cachePages, forceChr=True)
+
+    # check for existing SNPs
+    dbList = [dbfile]
+    for dbFileName in options.snpDBList:
+        dbList.append(dbFileName)
+
+    snpPropertiesList = chksnp.chkSNP(dbList, snpList, options.cachePages)
+
+    # get the neighboring genes
+    geneInfoList = getSNPGeneInfo.getSNPGeneInfo(genome, snpPropertiesList, rpkmfilename, doCache, flankBP=10000)
+
+    # filter out for the A-to-I events in the same direction as the genes
+    filteredSNPs = rnaAToIFilter.rnaAToIFilter(geneInfoList)
+
+    # count the number of edited sites called for each gene and
+    # pick the set of genes with a high number of sites (here at least 5)
+    geneList = getGenesWithMultipleSNPs(filteredSNPs, minCount=5)
+
+    if options.prefix is not None:
+        analyzego.analyzeGO(genome, geneList, options.prefix, translateGene=True, fieldID=1)
+
+    if options.novelsnpoutfilename is not None:
+        getNovelSNPs.writeNovelSNPFile(genome, filteredSNPs, options.novelsnpoutfilename)
+
+    if options.bedoutfilename is not None:
+        makeSNPtrack.writeSNPsBedfile(filteredSNPs, "rnaEdit_sample", options.bedoutfilename)
+
+
+def getGenesWithMultipleSNPs(snpList, minCount=1):
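+    # field 11 of each SNP entry holds the gene identifier; only genes with at
+    # least minCount edited sites are returned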
+    geneList = []
+    for snpEntry in snpList:
+        geneList.append(snpEntry[11])
+
+    duplicateCountList = countDuplicatesInList(geneList)
+
+    geneList = []
+    for (gene, count) in duplicateCountList:
+        if count >= minCount:
+            geneList.append(gene)
+
+    return geneList
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/rnafarPairs.py b/rnafarPairs.py
new file mode 100755 (executable)
index 0000000..d1baebd
--- /dev/null
@@ -0,0 +1,180 @@
+#
+#  RNAFARpairs.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 11/2/08.
+#
+""" usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
+           looks at all chromosomes simultaneously, which is both slow and takes up a large amount of RAM
+"""
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys, time, optparse
+from commoncode import readDataset
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "%prog: version 3.6"
+    usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--verbose", action="store_true", dest="doVerbose",
+                      help="verbose output")
+    parser.add_option("--cache", action="store_true", dest="doCache",
+                      help="use cache")
+    parser.add_option("--maxDist", type="int", dest="maxDist",
+                      help="maximum distance")
+    parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    goodfilename = args[1]
+    rdsfile = args[2]
+    outfilename = args[3]
+
+    rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+    
+
+def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+    goodDict = {}
+    goodfile = open(goodfilename)
+    for line in goodfile:
+        fields = line.split()
+        goodDict[fields[0]] = line
+
+    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    rdsChromList = RDS.getChromosomes()
+
+    if doVerbose:
+        print time.ctime()
+
+    distinct = 0
+    total = 0
+    outfile = open(outfilename,"w")
+
+    idb = geneinfoDB()
+    if genome == "dmelanogaster":
+        geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
+    else:
+        geneinfoDict = idb.getallGeneInfo(genome)
+
+    hg = Genome(genome)
+    geneannotDict = hg.allAnnotInfo()
+
+    assigned = {}
+    farConnected = {}
+    for achrom in rdsChromList:
+        if achrom == "chrM":
+            continue
+
+        print achrom
+        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+        if doVerbose:
+            print len(uniqDict), time.ctime()    
+
+        for readID in uniqDict:
+            readList = uniqDict[readID]
+            if len(readList) == 2:
+                total += 1
+                (start1, flag1, pair1) = readList[0]
+                (start2, flag2, pair2) = readList[1]
+
+                if flag1 != flag2:
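+                # each read's flag records the feature it fell in ("NM" = no match);
+                # a flag found in goodDict marks a candidate far region, while the
+                # other mate's flag is taken as the connected gene ID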
+                    dist = abs(start1 - start2)
+                    if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
+                        geneID = ""
+                        saw1 = False
+                        saw2 = False
+                        if flag1 in goodDict:
+                            geneID = flag2
+                            farFlag = flag1
+                            saw1 = True
+
+                        if flag2 in goodDict:
+                            geneID = flag1
+                            farFlag = flag2
+                            saw2 = True
+
+                        if saw1 or saw2:
+                            total += 1
+
+                        if saw1 and saw2:
+                            if flag1 < flag2:
+                                geneID = flag1
+                                farFlag = flag2
+                            else:
+                                geneID = flag2
+                                farFlag = flag1
+
+                            if geneID in farConnected:
+                                farConnected[geneID].append(farFlag)
+                            else:
+                                farConnected[geneID] = [farFlag]
+                        elif geneID != "":
+                            try:
+                                if genome == "dmelanogaster":
+                                    symbol = geneinfoDict["Dmel_" + geneID][0][0]
+                                else:
+                                    symbol = geneinfoDict[geneID][0][0]
+                            except:
+                                try:
+                                    symbol = geneannotDict[(genome, geneID)][0]
+                                except:
+                                    symbol = "LOC" + geneID
+
+                            symbol = symbol.strip()
+                            symbol = symbol.replace(" ","|")
+                            symbol = symbol.replace("\t","|")
+                            if farFlag not in assigned:
+                                assigned[farFlag] = (symbol, geneID)
+                                print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
+                                outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
+                                distinct += 1
+
+    farIndex = 0
+    for farFlag in farConnected:
+        geneID = ""
+        symbol = ""
+        idList = [farFlag] + farConnected[farFlag]
+        for oneID in idList:
+            if oneID in assigned:
+                (symbol, geneID) = assigned[oneID]
+
+        if geneID == "":
+            farIndex += 1
+            symbol = "FAR%d" % farIndex
+            geneID = -1 * farIndex
+
+        for oneID in idList:
+            if oneID not in assigned:
+                print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
+                outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
+                distinct += 1
+                assigned[oneID] = (symbol, geneID)
+
+    for farFlag in goodDict:
+        if farFlag not in assigned:
+            farIndex += 1
+            line = "FAR%d %d %s" % (farIndex, -1 * farIndex, goodDict[farFlag])
+            print line.strip()
+            outfile.write(line)
+
+    outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
+    outfile.close()
+    print "distinct: %d\ttotal: %d" % (distinct, total)
+    print time.ctime()
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/rnapath/.svn/entries b/rnapath/.svn/entries
new file mode 100644 (file)
index 0000000..d37a6ac
--- /dev/null
@@ -0,0 +1,86 @@
+10
+
+dir
+23
+file:///Users/sau/svn/repos/erange/source/Erange/rnapath
+file:///Users/sau/svn/repos
+
+
+
+2010-10-01T18:32:26.347691Z
+22
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+d8abe7b5-3c2c-4fba-ae09-e6a8aa828af9
+\f
+RNAPATH.py
+file
+
+
+
+
+
+dbb616164849ddb57ad0880cf59ff36a
+2010-10-01T18:32:26.347691Z
+22
+sau
+\f
+__init__.py
+file
+
+
+
+
+2010-09-10T18:56:21.000000Z
+d41d8cd98f00b204e9800998ecf8427e
+2010-09-10T18:57:45.549780Z
+20
+sau
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+0
+\f
+processvelvet.py
+file
+
+
+
+
+
+c232f2e5338d3f018f259576a65ff49e
+2010-10-01T18:32:26.347691Z
+22
+sau
+\f
diff --git a/rnapath/.svn/text-base/RNAPATH.py.svn-base b/rnapath/.svn/text-base/RNAPATH.py.svn-base
new file mode 100644 (file)
index 0000000..86f61cd
--- /dev/null
@@ -0,0 +1,468 @@
+import sys
+import optparse
+import string
+from numpy import zeros, int16
+
+versionString = "%s: version 0.95" % sys.argv[0]
+print versionString
+
+
+def compNT(nt):
+    """ returns the complementary basepair to base nt
+    """
+    compDict = { "A": "T",
+                 "T": "A",
+                 "G": "C",
+                 "C": "G",
+                 "S": "S",
+                 "W": "W",
+                 "R": "Y",
+                 "Y": "R",
+                 "M": "K",
+                 "K": "M",
+                 "H": "D",
+                 "D": "H",
+                 "B": "V",
+                 "V": "B",
+                 "N": "N",
+                 "a": "t",
+                 "t": "a",
+                 "g": "c",
+                 "c": "g",
+                 "n": "n",
+                 "z": "z"
+    }
+
+    return compDict.get(nt, "N")
+
+
+def complement(sequence, length=-1):
+    """ returns the complement of the sequence.
+    """
+    newSeq = ""
+    
+    seqLength = len(sequence)
+    
+    if length == seqLength or length < 0:
+        seqList = list(sequence)
+        seqList.reverse()
+        return "".join(map(compNT, seqList))
+
+    #TODO: this seems to want to deal with case where length is more than
+    # sequence length except that a negative index on a sequence is fine
+    # index will only be overrun if length is negative but that case is
+    # handled above
+    for index in range(seqLength - 1,seqLength - length - 1, -1):
+        try:
+            newSeq += compNT(sequence[index])
+        except:
+            newSeq += "N"
+
+    return newSeq
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--prefix", dest="pathPrefix")
+    parser.add_option("--overlap", type="int", dest="overlap")
+    parser.set_defaults(pathPrefix="RNAPATH", overlap=30)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(0)
+
+    incontigfilename = args[0]
+    distalPairsfile = args[1]
+    outpathfilename = args[2]
+    outcontigfilename = args[3]
+
+    rnaPath(incontigfilename, distalPairsfile, outpathfilename,
+            outcontigfilename, options.pathPrefix, options.overlap)
+
+
+def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
+            outcontigfilename, pathPrefix="RNAPATH", overlap=30):
+
+    outpathfile = open(outpathfilename, "w")
+    
+    outheader = "#settings: %s" % " ".join(sys.argv)
+    print outheader
+    print >> outpathfile, outheader
+   
+    contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
+    halfSize = calculateN50(origSize)
+    print "building the adjacency graph"
+    pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)
+
+    print "found %d paths" % len(pathList)            
+
+    newSizeList = []
+    pathID = 0
+    outcontigfile = open(outcontigfilename, "w")
+    for path in pathList:
+        pathID += 1
+        outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path))) 
+        vertexNameList = []
+        for vertex in path:
+            vertexNameList.append(nameList[vertex])
+
+        pathDescription = string.join(vertexNameList, ",")
+
+        print >> outpathfile, pathDescription
+        currentVertex = path[0]
+        currentSense = "+"
+        assemblyList = currentVertex
+        sequence = contigDict[currentVertex]
+        for nextVertex in path[1:]:
+            if (currentVertex, nextVertex) in edgeSenseDict:
+                senseList = edgeSenseDict[currentVertex, nextVertex]
+                FR = senseList.count(("+", "-"))
+                RF = senseList.count(("-", "+"))
+            else:
+                senseList = edgeSenseDict[nextVertex, currentVertex]
+                # flip
+                FR = senseList.count(("-", "+"))
+                RF = senseList.count(("+", "-"))
+
+            FF = senseList.count(("+", "+"))
+            RR = senseList.count(("-", "-"))
+            if currentSense == "-":
+                # we had flipped the upstream piece! Must flip again
+                temp1 = FR
+                temp2 = FF
+                FR = RR
+                FF = RF
+                RR = temp1
+                RF = temp2
+
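+            # majority vote over the mate-pair orientations decides how the next
+            # contig is joined: FR keeps both as-is, FF reverse-complements the
+            # incoming contig, RR reverse-complements the current assembly, and
+            # RF reverse-complements both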
+            if FR >= FF and FR >= RR and FR >= RF:
+                # we have FR - leave alone
+                sense1 = "+"
+                sense2 = "-"
+                assemblyList = ((assemblyList, "+"), (nextVertex, "+"))
+                seqleft = sequence[-20:]
+                seqright = contigDict[nextVertex][:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence += contigDict[nextVertex][offset:]
+                else:
+                    sequence += "NN" + contigDict[nextVertex]
+
+                currentSense = "+"
+            elif FF >= RR and FF >= RF:
+                # we have FF - flip seqright
+                sense1 = "+"
+                sense2 = "+"
+                assemblyList = ((assemblyList, "+"), (nextVertex, "-"))
+                seqleft = sequence[-20:]
+                seqright = complement(contigDict[nextVertex])[:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence += complement(contigDict[nextVertex])[offset:]
+                else:
+                    sequence += "NN" + complement(contigDict[nextVertex])
+
+                currentSense = "-"
+            elif RR >= RF:
+                # we have RR - flip seqleft
+                sense1 = "-"
+                sense2 = "-"
+                assemblyList = ((assemblyList, "-"), (nextVertex, "+"))
+                seqleft = complement(sequence)[:20]
+                seqright = contigDict[nextVertex][:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence = complement(sequence) + contigDict[nextVertex][offset:]
+                else:
+                    sequence = complement(sequence) + "NN" + contigDict[nextVertex]
+
+                currentSense = "+"
+            else:
+                # we have RF - flip both
+                sense1 = "-"
+                sense2 = "+"
+                assemblyList = ((assemblyList, "-"), (nextVertex, "-"))
+                seqleft = complement(sequence)[-20:]
+                seqright = complement(contigDict[nextVertex])[:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:]
+                else:
+                    sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex])
+
+                currentSense = "-"
+
+            outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
+            print outstring
+            print >> outpathfile, outstring
+            currentVertex = nextVertex
+
+        outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
+        newSizeList.append(len(sequence))
+
+    for vertex in contigDict:
+        if vertex in visitedDict:
+            continue
+
+        newSizeList.append(len(contigDict[vertex]))
+        outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
+
+    calculateN50(newSizeList, referenceMean=halfSize)
+
+
+def calculateN50(sizeList, referenceMean=None):
+    if referenceMean is None:
+        totalSize = sum(sizeList)
+        referenceMean = totalSize / 2
+
+    sizeList.sort()
+    sizeList.reverse()
+    currentTotalLength = 0
+    for size in sizeList:
+        if currentTotalLength + size > referenceMean:
+            print "#contigs", len(sizeList)
+            print "N50", size
+            break
+
+        currentTotalLength += size
+
+    print sizeList[:50]
+
+    return referenceMean
+
+
+def getContigsFromFile(contigFileName):
+    nameList = []
+    origSize = []
+    contigNum = 0
+    currentChrom = ""
+    seq = ""
+    contigDict = {}
+
+    try:
+        incontigfile = open(contigFileName)
+    except IOError:
+        print "Error opening contig file: %s" % contigFileName
+        return contigNum, nameList, contigDict, origSize
+
+    for line in incontigfile:
+        if ">" in line:
+            if currentChrom !="":
+                nameList.append(currentChrom)
+                contigDict[contigNum] = seq
+                origSize.append(len(seq))
+                contigNum += 1
+
+            currentChrom = line.strip().split()[0][1:]
+            seq = ""
+        else:
+            seq += line.strip()
+
+    if currentChrom != "":
+        # store the final contig; the loop above only flushes a contig when it
+        # sees the next ">" header
+        nameList.append(currentChrom)
+        contigDict[contigNum] = seq
+        origSize.append(len(seq))
+        contigNum += 1
+
+    incontigfile.close()
+
+    return contigNum, nameList, contigDict, origSize
+
+
+def getPath(contigNum, distalPairsfile, nameList):
+    edgeMatrix = EdgeMatrix(contigNum)
+
+    print len(edgeMatrix.edgeArray)
+    try:
+        print len(edgeMatrix.edgeArray[50])
+    except IndexError:
+        pass
+
+    print "processing distal pairs"
+    verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)
+
+    willVisitList = verticesWithEdges.keys()
+    willVisitList.sort()
+    print "visiting %d vertices" % len(willVisitList)
+
+    print "cleaning up graph of edges with weight 1"
+    verticesToDelete = []
+    for rindex in willVisitList:
+        if rindex not in notSoloDict:
+            cindex = vertexEdges[rindex][0]
+            edgeMatrix.edgeArray[rindex][cindex] = 0
+            edgeMatrix.edgeArray[cindex][rindex] = 0
+            verticesToDelete.append(rindex)
+
+    for vertex in verticesToDelete:
+        willVisitList.remove(vertex)
+
+    print "%d 1-edges zeroed out" % len(verticesToDelete)
+
+    zeroedEdge = 0
+    print "visiting %d vertices" % len(willVisitList)
+
+    leafList = []
+    print "picking top 2 edges per vertex - zero out others"
+    for rindex in willVisitList:
+        vertices = vertexEdges[rindex]
+        rEdges = []
+        for avertex in vertices:
+            if avertex in willVisitList:
+                rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))
+
+        if len(rEdges) > 2:
+            rEdges.sort()
+            rEdges.reverse()
+            zeroedEdge += len(rEdges[2:])
+            for (weight, cindex) in rEdges[2:]:
+                edgeMatrix.edgeArray[rindex][cindex] = 0
+                edgeMatrix.edgeArray[cindex][rindex] = 0
+        elif len(rEdges) == 1:
+            if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
+                leafList.append(rindex)
+
+    print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge
+    pathList, visitedDict = traverseGraph(leafList, edgeMatrix)
+
+    return pathList, edgeSenseDict, visitedDict
+
+
+def traverseGraph(leafList, edgeMatrix):
+    pathList = []
+    visitedDict = {}
+    leafList.sort()
+    print "traveling through the graph"
+    for rindex in leafList:
+        if rindex not in visitedDict:
+            path = edgeMatrix.visitLink(rindex)
+            if len(path) > 1:
+                for vertex in path:
+                    visitedDict[vertex] = ""
+
+                print path
+                pathList.append(path)
+
+    return pathList, visitedDict
+
+
+def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
+    contigToRowLookup = {}
+    verticesWithEdges = {}
+    vertexEdges = {}
+    notSoloDict = {}
+    edgeSenseDict = {}
+
+    distalPairs = open(distalPairsfilename)
+    for line in distalPairs:
+        if line[0] == "#":
+            continue
+
+        fields = line.strip().split()
+        contA = "chr%s" % fields[1]
+        try:
+            contig1 = contigToRowLookup[contA]
+        except KeyError:
+            try:
+                contig1 = nameList.index(contA)
+                contigToRowLookup[contA] = contig1
+            except ValueError:
+                print "problem with end1: ", line
+                continue
+
+        sense1 = fields[3]
+
+        contB = "chr%s" % fields[4]
+        try:
+            contig2 = contigToRowLookup[contB]
+        except KeyError:
+            try:
+                contig2 = nameList.index(contB)
+                contigToRowLookup[contB] = contig2
+            except ValueError:
+                print "problem with end2: ", line
+                continue
+
+        sense2 = fields[6]
+
+        edgeMatrix.edgeArray[contig1][contig2] += 1
+        edgeMatrix.edgeArray[contig2][contig1] += 1
+        verticesWithEdges[contig1] = ""
+        verticesWithEdges[contig2] = ""
+        if (contig1, contig2) in edgeSenseDict:
+            edgeSenseDict[contig1, contig2].append((sense1, sense2))
+        elif (contig2, contig1) in edgeSenseDict:
+            edgeSenseDict[contig2, contig1].append((sense2, sense1))
+        else:
+            edgeSenseDict[contig1, contig2] = [(sense1, sense2)]
+
+        if contig1 in vertexEdges:
+            if contig2 not in vertexEdges[contig1]:
+                vertexEdges[contig1].append(contig2)
+        else:
+            vertexEdges[contig1] = [contig2]
+
+        if contig2 in vertexEdges:
+            if contig1 not in vertexEdges[contig2]:
+                vertexEdges[contig2].append(contig1)
+        else:
+            vertexEdges[contig2] = [contig1]
+
+        if edgeMatrix.edgeArray[contig1][contig2] > 1:
+            notSoloDict[contig1] = ""
+            notSoloDict[contig2] = ""
+
+    distalPairs.close()
+    
+    return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
+
+
+class EdgeMatrix:
+    """ Describes a sparse matrix to hold edge data.
+    """
+
+    def __init__(self, dimension):
+        self.dimension = dimension
+        self.edgeArray = zeros((self.dimension, self.dimension), int16)
+
+
+    def visitLink(self, fromVertex, ignoreList=[]):
+        returnPath = [fromVertex]
+        toVertex = []
+        for toindex in xrange(self.dimension):
+            if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
+                toVertex.append(toindex)
+
+        for vertex in toVertex:
+            if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
+                self.edgeArray[fromVertex][vertex] = 0
+                self.edgeArray[vertex][fromVertex] = 0
+                return returnPath + [vertex]
+            else:
+                self.edgeArray[fromVertex][vertex] = 0
+                try:
+                    return returnPath + self.visitLink(vertex, returnPath)
+                except IOError:
+                    return returnPath + [vertex]
+        return []
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/rnapath/RNAPATH.py b/rnapath/RNAPATH.py
new file mode 100644 (file)
index 0000000..86f61cd
--- /dev/null
@@ -0,0 +1,468 @@
+import sys
+import optparse
+import string
+from numpy import zeros, int16
+
+versionString = "%s: version 0.95" % sys.argv[0]
+print versionString
+
+
+def compNT(nt):
+    """ returns the complementary basepair to base nt
+    """
+    compDict = { "A": "T",
+                 "T": "A",
+                 "G": "C",
+                 "C": "G",
+                 "S": "S",
+                 "W": "W",
+                 "R": "Y",
+                 "Y": "R",
+                 "M": "K",
+                 "K": "M",
+                 "H": "D",
+                 "D": "H",
+                 "B": "V",
+                 "V": "B",
+                 "N": "N",
+                 "a": "t",
+                 "t": "a",
+                 "g": "c",
+                 "c": "g",
+                 "n": "n",
+                 "z": "z"
+    }
+
+    return compDict.get(nt, "N")
+
+
+def complement(sequence, length=-1):
+    """ returns the complement of the sequence.
+    """
+    newSeq = ""
+    
+    seqLength = len(sequence)
+    
+    if length == seqLength or length < 0:
+        seqList = list(sequence)
+        seqList.reverse()
+        return "".join(map(compNT, seqList))
+
+    # TODO: this branch only handles 0 <= length < seqLength cleanly; a larger
+    # length wraps around via negative indexing instead of overrunning, and the
+    # negative-length case is already handled above.
+    for index in range(seqLength - 1, seqLength - length - 1, -1):
+        try:
+            newSeq += compNT(sequence[index])
+        except IndexError:
+            newSeq += "N"
+
+    return newSeq
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "python %prog incontigfile distalPairs outpathfile outcontigfile [--prefix string] [--overlap bp]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--prefix", dest="pathPrefix")
+    parser.add_option("--overlap", type="int", dest="overlap")
+    parser.set_defaults(pathPrefix="RNAPATH", overlap=30)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    incontigfilename = args[0]
+    distalPairsfile = args[1]
+    outpathfilename = args[2]
+    outcontigfilename = args[3]
+
+    rnaPath(incontigfilename, distalPairsfile, outpathfilename,
+            outcontigfilename, options.pathPrefix, options.overlap)
+
+
+def rnaPath(incontigfilename, distalPairsfile, outpathfilename,
+            outcontigfilename, pathPrefix="RNAPATH", overlap=30):
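+    # Scaffold the input contigs: build a contig adjacency graph from distal
+    # read pairs, extract linear paths through it, stitch each path into a
+    # single "chr<prefix><n>" sequence, and pass unplaced contigs through
+    # unchanged. N50 is reported before and after for comparison.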
+
+    outpathfile = open(outpathfilename, "w")
+    
+    outheader = "#settings: %s" % " ".join(sys.argv)
+    print outheader
+    print >> outpathfile, outheader
+   
+    contigNum, nameList, contigDict, origSize = getContigsFromFile(incontigfilename)
+    halfSize = calculateN50(origSize)
+    print "building the adjacency graph"
+    pathList, edgeSenseDict, visitedDict = getPath(contigNum, distalPairsfile, nameList)
+
+    print "found %d paths" % len(pathList)            
+
+    newSizeList = []
+    pathID = 0
+    outcontigfile = open(outcontigfilename, "w")
+    for path in pathList:
+        pathID += 1
+        outpathfile.write("chr%s%d: %s\n" % (pathPrefix, pathID, str(path))) 
+        vertexNameList = []
+        for vertex in path:
+            vertexNameList.append(nameList[vertex])
+
+        pathDescription = string.join(vertexNameList, ",")
+        print >> outpathfile, pathDescription
+        currentVertex = path[0]
+        currentSense = "+"
+        assemblyList = currentVertex
+        sequence = contigDict[currentVertex]
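+        # Walk the path left to right: `sequence` grows as contigs are stitched
+        # on, and `assemblyList` records the nested (piece, orientation) choices.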
+        for nextVertex in path[1:]:
+            if (currentVertex, nextVertex) in edgeSenseDict:
+                senseList = edgeSenseDict[currentVertex, nextVertex]
+                FR = senseList.count(("+", "-"))
+                RF = senseList.count(("-", "+"))
+            else:
+                senseList = edgeSenseDict[nextVertex, currentVertex]
+                # flip
+                FR = senseList.count(("-", "+"))
+                RF = senseList.count(("+", "-"))
+
+            FF = senseList.count(("+", "+"))
+            RR = senseList.count(("-", "-"))
+            if currentSense == "-":
+                # we had flipped the upstream piece! Must flip again
+                temp1 = FR
+                temp2 = FF
+                FR = RR
+                FF = RF
+                RR = temp1
+                RF = temp2
+
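+            # Pick the orientation class with the most supporting mate pairs:
+            # FR keeps both pieces as-is, FF flips the incoming contig, RR flips
+            # the assembled sequence so far, RF flips both. The two pieces are
+            # then merged at a detected 20 bp overlap when one is found, or
+            # joined with an "NN" spacer otherwise.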
+            if FR >= FF and FR >= RR and FR >= RF:
+                # we have FR - leave alone
+                sense1 = "+"
+                sense2 = "-"
+                assemblyList = ((assemblyList, "+"), (nextVertex, "+"))
+                seqleft = sequence[-20:]
+                seqright = contigDict[nextVertex][:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (currentVertex, nextVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence += contigDict[nextVertex][offset:]
+                else:
+                    sequence += "NN" + contigDict[nextVertex]
+
+                currentSense = "+"
+            elif FF >= RR and FF >= RF:
+                # we have FF - flip seqright
+                sense1 = "+"
+                sense2 = "+"
+                assemblyList = ((assemblyList, "+"), (nextVertex, "-"))
+                seqleft = sequence[-20:]
+                seqright = complement(contigDict[nextVertex])[:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence += complement(contigDict[nextVertex])[offset:]
+                else:
+                    sequence += "NN" + complement(contigDict[nextVertex])
+
+                currentSense = "-"
+            elif RR >= RF:
+                # we have RR - flip seqleft
+                sense1 = "-"
+                sense2 = "-"
+                assemblyList = ((assemblyList, "-"), (nextVertex, "+"))
+                seqleft = complement(sequence)[:20]
+                seqright = contigDict[nextVertex][:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence = complement(sequence) + contigDict[nextVertex][offset:]
+                else:
+                    sequence = complement(sequence) + "NN" + contigDict[nextVertex]
+
+                currentSense = "+"
+            else:
+                # we have RF - flip both
+                sense1 = "-"
+                sense2 = "+"
+                assemblyList = ((assemblyList, "-"), (nextVertex, "-"))
+                seqleft = complement(sequence)[-20:]
+                seqright = complement(contigDict[nextVertex])[:overlap]
+                if seqleft in seqright:
+                    pos = seqright.index(seqleft)
+                    offset = pos + 20
+                    outstring = "stitching %d and %d using %d overlap" % (nextVertex, currentVertex, offset)
+                    print outstring
+                    print >> outpathfile, outstring
+                    sequence = complement(sequence) + complement(contigDict[nextVertex])[offset:]
+                else:
+                    sequence = complement(sequence) + "NN" + complement(contigDict[nextVertex])
+
+                currentSense = "-"
+
+            outstring = "(%d, %d): FF %d RR %d RF %d FR %d : %s %s\t%s" % (currentVertex, nextVertex, FF, RR, RF, FR, sense1, sense2, str(assemblyList))
+            print outstring
+            print >> outpathfile, outstring
+            currentVertex = nextVertex
+
+        outcontigfile.write(">chr%s%d %dbp %s | %s\n%s\n" % (pathPrefix, pathID, len(sequence), pathDescription, str(assemblyList), sequence))
+        newSizeList.append(len(sequence))
+
+    for vertex in contigDict:
+        if vertex in visitedDict:
+            continue
+
+        newSizeList.append(len(contigDict[vertex]))
+        outcontigfile.write(">%s\n%s\n" % (nameList[vertex], contigDict[vertex]))
+
+    calculateN50(newSizeList, referenceMean=halfSize)
+
+
+def calculateN50(sizeList, referenceMean=None):
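+    # Sort sizes in descending order and report the N50: the contig size at
+    # which the cumulative length first exceeds referenceMean, which defaults
+    # to half of the total assembly size.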
+    if referenceMean is None:
+        totalSize = sum(sizeList)
+        referenceMean = totalSize / 2
+
+    sizeList.sort()
+    sizeList.reverse()
+    currentTotalLength = 0
+    for size in sizeList:
+        if currentTotalLength + size > referenceMean:
+            print "#contigs", len(sizeList)
+            print "N50", size
+            break
+
+        currentTotalLength += size
+
+    print sizeList[:50]
+
+    return referenceMean
+
+
+def getContigsFromFile(contigFileName):
+    nameList = []
+    origSize = []
+    contigNum = 0
+    currentChrom = ""
+    seq = ""
+    contigDict = {}
+
+    try:
+        incontigfile = open(contigFileName)
+    except IOError:
+        print "Error opening contig file: %s" % contigFileName
+        return contigNum, nameList, contigDict, origSize
+
+    for line in incontigfile:
+        if ">" in line:
+            if currentChrom !="":
+                nameList.append(currentChrom)
+                contigDict[contigNum] = seq
+                origSize.append(len(seq))
+                contigNum += 1
+
+            currentChrom = line.strip().split()[0][1:]
+            seq = ""
+        else:
+            seq += line.strip()
+
+    if currentChrom != "":
+        # store the final contig; the loop above only flushes a contig when it
+        # sees the next ">" header
+        nameList.append(currentChrom)
+        contigDict[contigNum] = seq
+        origSize.append(len(seq))
+        contigNum += 1
+
+    incontigfile.close()
+
+    return contigNum, nameList, contigDict, origSize
+
+
+def getPath(contigNum, distalPairsfile, nameList):
+    edgeMatrix = EdgeMatrix(contigNum)
+
+    print len(edgeMatrix.edgeArray)
+    try:
+        print len(edgeMatrix.edgeArray[50])
+    except IndexError:
+        pass
+
+    print "processing distal pairs"
+    verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict = processDistalPairsFile(distalPairsfile, edgeMatrix, nameList)
+
+    willVisitList = verticesWithEdges.keys()
+    willVisitList.sort()
+    print "visiting %d vertices" % len(willVisitList)
+
+    print "cleaning up graph of edges with weight 1"
+    verticesToDelete = []
+    for rindex in willVisitList:
+        if rindex not in notSoloDict:
+            cindex = vertexEdges[rindex][0]
+            edgeMatrix.edgeArray[rindex][cindex] = 0
+            edgeMatrix.edgeArray[cindex][rindex] = 0
+            verticesToDelete.append(rindex)
+
+    for vertex in verticesToDelete:
+        willVisitList.remove(vertex)
+
+    print "%d 1-edges zeroed out" % len(verticesToDelete)
+
+    zeroedEdge = 0
+    print "visiting %d vertices" % len(willVisitList)
+
+    leafList = []
+    print "picking top 2 edges per vertex - zero out others"
+    for rindex in willVisitList:
+        vertices = vertexEdges[rindex]
+        rEdges = []
+        for avertex in vertices:
+            if avertex in willVisitList:
+                rEdges.append((edgeMatrix.edgeArray[rindex][avertex], avertex))
+
+        if len(rEdges) > 2:
+            rEdges.sort()
+            rEdges.reverse()
+            zeroedEdge += len(rEdges[2:])
+            for (weight, cindex) in rEdges[2:]:
+                edgeMatrix.edgeArray[rindex][cindex] = 0
+                edgeMatrix.edgeArray[cindex][rindex] = 0
+        elif len(rEdges) == 1:
+            if edgeMatrix.edgeArray[rindex][rEdges[0][1]] > 1:
+                leafList.append(rindex)
+
+    print "zeroed out %d lower-weight edges at vertices with degree > 2" % zeroedEdge
+    pathList, visitedDict = traverseGraph(leafList, edgeMatrix)
+
+    return pathList, edgeSenseDict, visitedDict
+
+
+def traverseGraph(leafList, edgeMatrix):
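+    # Start a walk from each leaf; leaves already absorbed into an earlier
+    # path are skipped as starting points.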
+    pathList = []
+    visitedDict = {}
+    leafList.sort()
+    print "traveling through the graph"
+    for rindex in leafList:
+        if rindex not in visitedDict:
+            path = edgeMatrix.visitLink(rindex)
+            if len(path) > 1:
+                for vertex in path:
+                    visitedDict[vertex] = ""
+
+                print path
+                pathList.append(path)
+
+    return pathList, visitedDict
+
+
+def processDistalPairsFile(distalPairsfilename, edgeMatrix, nameList):
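+    # Each non-comment line of the distal-pairs file (presumably the output of
+    # distalPairs.py) carries the two mate contig names, without their "chr"
+    # prefix, in columns 2 and 5 and their strands in columns 4 and 7; every
+    # pair adds one count to the edge between those two contigs.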
+    contigToRowLookup = {}
+    verticesWithEdges = {}
+    vertexEdges = {}
+    notSoloDict = {}
+    edgeSenseDict = {}
+
+    distalPairs = open(distalPairsfilename)
+    for line in distalPairs:
+        if line[0] == "#":
+            continue
+
+        fields = line.strip().split()
+        contA = "chr%s" % fields[1]
+        try:
+            contig1 = contigToRowLookup[contA]
+        except KeyError:
+            try:
+                contig1 = nameList.index(contA)
+                contigToRowLookup[contA] = contig1
+            except ValueError:
+                print "problem with end1: ", line
+                continue
+
+        sense1 = fields[3]
+
+        contB = "chr%s" % fields[4]
+        try:
+            contig2 = contigToRowLookup[contB]
+        except KeyError:
+            try:
+                contig2 = nameList.index(contB)
+                contigToRowLookup[contB] = contig2
+            except ValueError:
+                print "problem with end2: ", line
+                continue
+
+        sense2 = fields[6]
+
+        edgeMatrix.edgeArray[contig1][contig2] += 1
+        edgeMatrix.edgeArray[contig2][contig1] += 1
+        verticesWithEdges[contig1] = ""
+        verticesWithEdges[contig2] = ""
+        if (contig1, contig2) in edgeSenseDict:
+            edgeSenseDict[contig1, contig2].append((sense1, sense2))
+        elif (contig2, contig1) in edgeSenseDict:
+            edgeSenseDict[contig2, contig1].append((sense2, sense1))
+        else:
+            edgeSenseDict[contig1, contig2] = [(sense1, sense2)]
+
+        if contig1 in vertexEdges:
+            if contig2 not in vertexEdges[contig1]:
+                vertexEdges[contig1].append(contig2)
+        else:
+            vertexEdges[contig1] = [contig2]
+
+        if contig2 in vertexEdges:
+            if contig1 not in vertexEdges[contig2]:
+                vertexEdges[contig2].append(contig1)
+        else:
+            vertexEdges[contig2] = [contig1]
+
+        if edgeMatrix.edgeArray[contig1][contig2] > 1:
+            notSoloDict[contig1] = ""
+            notSoloDict[contig2] = ""
+
+    distalPairs.close()
+    
+    return verticesWithEdges, vertexEdges, notSoloDict, edgeSenseDict
+
+
+class EdgeMatrix:
+    """ Describes a sparse matrix to hold edge data.
+    """
+
+    def __init__(self, dimension):
+        self.dimension = dimension
+        self.edgeArray = zeros((self.dimension, self.dimension), int16)
+
+
+    def visitLink(self, fromVertex, ignoreList=[]):
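+        # Greedily follow edges with weight > 1 out of fromVertex, zeroing each
+        # edge as it is consumed, and return the chain of vertices reached; an
+        # empty list means there was nowhere left to go.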
+        returnPath = [fromVertex]
+        toVertex = []
+        for toindex in xrange(self.dimension):
+            if self.edgeArray[fromVertex][toindex] > 1 and toindex not in ignoreList:
+                toVertex.append(toindex)
+
+        for vertex in toVertex:
+            if sum(self.edgeArray[vertex]) == self.edgeArray[fromVertex][vertex]:
+                self.edgeArray[fromVertex][vertex] = 0
+                self.edgeArray[vertex][fromVertex] = 0
+                return returnPath + [vertex]
+            else:
+                self.edgeArray[fromVertex][vertex] = 0
+                try:
+                    return returnPath + self.visitLink(vertex, returnPath)
+                except IOError:
+                    return returnPath + [vertex]
+        return []
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/rnapath/__init__.py b/rnapath/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/rnapath/processvelvet.py b/rnapath/processvelvet.py
new file mode 100644 (file)
index 0000000..0af43d1
--- /dev/null
@@ -0,0 +1,110 @@
+import sys
+import optparse
+
+print "%prog: version 1.1"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog infile outfile [--prefix contigpref] [--filter pslfile] [--min bp] [--keepcov]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--prefix", dest="contigPrefix")
+    parser.add_option("--filter", dest="filterFileName")
+    parser.add_option("--min", type="int", dest="minSize")
+    parser.add_option("--keepcov", action="store_true", dest="keepCoverage")
+    parser.set_defaults(contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(2)
+
+    infile = args[0]
+    outfile = args[1]
+
+    processvelvet(infile, outfile, options.contigPrefix, options.filterFileName, options.minSize, options.keepCoverage)
+
+
+def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False):
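+    # Rewrite a Velvet contigs.fa: rename ">NODE_..." headers to
+    # <contigPrefix><node number> (optionally keeping the coverage suffix),
+    # drop contigs listed in the PSL filter file or shorter than minSize,
+    # and print summary counts at the end.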
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+    filterList = getFilterList(filterFileName)
+
+    node = {"contigPrefix": contigPrefix,
+            "completeID": "",
+            "currentSeq": ""
+    }
+
+    counts = {"acceptedSize": 0,
+              "nSize": 0,
+              "contigsAccepted": 0,
+              "filteredSize": 0
+    }
+
+    for line in infile:
+        if ">NODE" in line:
+            writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
+            node["completeID"] = line.strip()[1:]
+            node["currentSeq"] = ""
+        else:
+            node["currentSeq"] += line
+
+    writeNode(outfile, node, filterList, counts, minSize, keepCoverage)
+
+    infile.close()
+    outfile.close()
+
+    print "%d contigs accepted" % counts["contigsAccepted"]
+    print "%d bp original" % (counts["acceptedSize"] + counts["filteredSize"])
+    print "%d bp accepted" % counts["acceptedSize"]
+    print "%d bp accepted N" % counts["nSize"]
+    print "%d bp filtered\n" % counts["filteredSize"]
+
+
+def getFilterList(filterFileName=""):
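+    # Collect node names to exclude from column 10 (index 9, the query name)
+    # of a PSL alignment file; a missing or unreadable file yields an empty list.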
+    filterList = []
+
+    if filterFileName:
+        try:
+            filterFile = open(filterFileName)
+        except IOError:
+            return filterList
+
+        for line in filterFile:
+            if "NODE" in line:
+                fields = line.strip().split()
+                try:
+                    exclude = fields[9]
+                except IndexError:
+                    continue
+
+                if exclude not in filterList:
+                    filterList.append(exclude)
+
+        filterFile.close()
+
+    return filterList
+
+
+def writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False):
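+    # Write one contig if it has a real NODE header, is not in the filter list,
+    # and meets minSize; keep running totals of accepted and filtered bases.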
+    completeID = node["completeID"]
+    currentSeq = node["currentSeq"]
+    sequenceLength = len(currentSeq) - currentSeq.count("\n")
+    if len(completeID) > 5 and completeID not in filterList:
+        fields = completeID.split("_")
+        newID = fields[1]
+        if keepCoverage:
+            newID = fields[1] + "_" + fields[-1].strip()
+
+        if sequenceLength >= minSize:
+            outfile.write(">%s%s\n%s" % (node["contigPrefix"], newID, currentSeq))
+            counts["acceptedSize"] += sequenceLength
+            counts["nSize"] += currentSeq.count("N")
+            counts["contigsAccepted"] += 1
+        else:
+            # too short to keep; count it as filtered so the totals add up
+            counts["filteredSize"] += sequenceLength
+    else:
+        counts["filteredSize"] += sequenceLength
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/scatterfields.py b/scatterfields.py
new file mode 100755 (executable)
index 0000000..60649ff
--- /dev/null
@@ -0,0 +1,297 @@
+"""
+    usage: python scatterfields.py infilename xaxisLabel xField yaxisLabel yField outImageName [--xmin xMin] [--ymin yMin]
+                  [--xmax xMax] [--ymax yMax] [--doLogF1] [--doLogF2] [--arcsinh] [--order polyOrder] [--base logBase]
+                  [--markGenes geneFile] [--markfold times] [--noregression] [--large] [--markdiag] [--title text] [--verbose]
+
+           Do a scatter plot of 2 fields from an input file.
+           fields are counted from 0.
+           use [--order polyOrder] to specify polynomial fits > 1
+           Supports very rudimentary compound fields for X value
+           using python's lambda functions (omit the keyword lambda)
+"""
+
+import matplotlib
+matplotlib.use("Agg")
+
+from pylab import *
+import math, cmath
+import sys
+import optparse
+
+alphaVal = 0.5
+
+print "%prog: version 3.1"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = __doc__
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--xmin", type="float", dest="forcexmin")
+    parser.add_option("--ymin", type="float", dest="forceymin")
+    parser.add_option("--xmax", type="float", dest="forcexmax")
+    parser.add_option("--ymax", type="float", dest="forceymax")
+    parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
+    parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
+    parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
+    parser.add_option("--order", type="int", dest="fitOrder")
+    parser.add_option("--base", type="int", dest="base")
+    parser.add_option("--markGenes", dest="markFile")
+    parser.add_option("--markfold", type="float", dest="foldChange")
+    parser.add_option("--noregression", action="store_false", dest="doRegression")
+    parser.add_option("--large", action="store_true", dest="plotLarge")
+    parser.add_option("--markdiag", action="store_true", dest="markDiag")
+    parser.add_option("--title", type="int", dest="figtitle")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.set_defaults(forcexmin=0.0, forceymin=0.0, forcexmax=-1, forceymax=-1, doLogF1=False,
+                        doLogF2=False, doArcsinh=False, fitOrder=1, base=10, markFile=None,
+                        foldChange=None, doRegression=True, plotLarge=False, markDiag=False,
+                        figtitle="", verbose=False)
+
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 6:
+        print usage
+        sys.exit(1)
+
+    infilename = args[0]
+    xaxis = args[1]
+    xField = args[2]
+    yaxis = args[3]
+    yField = int(args[4])
+    outfilename = args[5]
+
+    scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, options.forcexmin,
+                  options.forceymin, options.forcexmax, options.forceymax, options.doLogF1,
+                  options.doLogF2, options.doArcsinh, options.fitOrder, options.base,
+                  options.markFile, options.foldChange, options.doRegression, options.plotLarge,
+                  options.markDiag, options.figtitle, options.verbose)
+
+
+def scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, forcexmin=0.0, forceymin=0.0,
+                  forcexmax=-1, forceymax=-1, doLogF1=False, doLogF2=False, doArcsinh=False, fitOrder=1,
+                  base=10, markFile=None, foldChange=None, doRegression=True, plotLarge=False,
+                  markDiag=False, figtitle="", verbose=False):
+
+    infile = open(infilename)
+    compoundField = False
+    try:
+        xField = int(xField)
+    except:
+        try:
+            compoundOp = "lambda %s" % xField
+            operator = eval(compoundOp)
+            compoundField = True
+            print "compound field %s" % xField
+        except:
+            pass
+
+        if not compoundField:
+            print "expression %s not supported" % xField
+            sys.exit(1)
+
+    markedGenes = []
+    marking = False
+    if markFile is not None:
+        markfile = open(markFile)
+        for line in markfile:
+            try:
+                markedGenes.append(line.strip().split()[0].upper())
+            except:
+                markedGenes.append(line.strip().upper())
+
+        markfile.close()
+        marking = True
+
+    markFold = False
+    if foldChange is not None:
+        markFold = True
+
+    newscores = []
+    oldscores = []
+
+    markednewscores = []
+    markedoldscores = []
+
+    markedfoldnewscores = []
+    markedfoldoldscores = []
+
+    ymax = 0.
+    xmax = 0.
+    for line in infile:
+        fields = line.strip().split()
+        gene = fields[0]
+        try:
+            if compoundField:
+                score = operator(fields)
+            else:
+                score = float(fields[xField])
+
+            newscore = float(fields[yField])
+        except:
+            continue
+
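+        # --markfold flags points whose y/x ratio (or its inverse) exceeds the
+        # requested fold change; zero scores are clamped to 0.03 to avoid
+        # dividing by zero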
+        foldMarkThisScore = False
+        if markFold:
+            tempscore = score
+            if tempscore == 0:
+                tempscore = 0.03
+
+            tempratio = newscore / tempscore
+            if tempratio == 0:
+                tempratio2 = tempscore / 0.03
+            else:
+                tempratio2 = 1. / tempratio
+
+            if tempratio > foldChange or tempratio2 > foldChange:
+                foldMarkThisScore = True
+
+        if doArcsinh:
+            score = abs(cmath.asinh(score))
+        elif doLogF1:
+            try:
+                score = math.log(score, base)
+            except:
+                score = forcexmin
+
+            if score > xmax:
+                xmax = score
+
+        if doArcsinh:
+            newscore = abs(cmath.asinh(newscore))
+        elif doLogF2:
+            try:
+                newscore = math.log(newscore, base)
+            except:
+                newscore = forceymin
+
+            if newscore > ymax:
+                ymax = newscore
+
+        oldscores.append(score)
+        newscores.append(newscore)
+        if foldMarkThisScore:
+            markedfoldoldscores.append(score)
+            markedfoldnewscores.append(newscore)
+            if marking and gene.upper() not in markedGenes:
+                print gene, score, newscore, "unmarked"
+
+            if gene.upper() in markedGenes:
+                print gene, score, newscore, "overfold"
+
+            if verbose:
+                print len(markedfoldoldscores), line.strip()
+
+        if gene.upper() in markedGenes:
+            if not foldMarkThisScore:
+                print gene, score, newscore
+
+            markedoldscores.append(score)
+            markednewscores.append(newscore)
+
+    print score, newscore
+    print fields
+
+    if plotLarge and markFold:
+        plot(oldscores, newscores, "^", markersize=10., color="0.75", alpha=alphaVal)
+    elif plotLarge:
+        plot(oldscores, newscores, "b^", markersize=10., alpha=alphaVal)
+    elif markFold:
+        plot(oldscores, newscores, ",", color="0.75", alpha=alphaVal)
+    else:
+        plot(oldscores, newscores, "b,", alpha=alphaVal)
+
+    if len(markedfoldoldscores) > 0:
+        if plotLarge:
+            plot(markedfoldoldscores, markedfoldnewscores, "b^", markersize=10., alpha=alphaVal)
+        else:
+            plot(markedfoldoldscores, markedfoldnewscores, "b,", alpha=alphaVal)
+
+    if len(markedoldscores) > 0:
+        if plotLarge:
+            plot(markedoldscores, markednewscores, "r^", color="red", markersize=10., alpha=alphaVal)
+        else:
+            plot(markedoldscores, markednewscores, ".", color="red", markersize=4., alpha=alphaVal)
+
+    fitvalues = polyfit(oldscores, newscores, fitOrder)
+    print fitvalues
+    print len(oldscores)
+
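+    # polyfit returns coefficients highest order first; fit quality is reported
+    # below as R**2 = 1 - SSerror/SStotal against the mean of the observed y values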
+    meanObserved = float(sum(newscores)) / len(newscores)
+    predicted = [polyval(fitvalues, x) for x in oldscores]
+
+    SSt = 0.
+    SSe = 0.
+
+    for index in range(len(newscores)):
+        SSt += (newscores[index] - meanObserved) ** 2
+        SSe += (newscores[index] - predicted[index]) ** 2
+
+    rSquared = 1. - SSe / SSt
+    print "R**2 = %f" % rSquared
+
+    oldscores.sort()
+    predicted = [polyval(fitvalues, x) for x in oldscores]
+
+    if doRegression:
+        plot(oldscores, predicted, "-k", linewidth=2)
+
+    if figtitle == "":
+        figtitle = "%s vs %s (R^2: %.2f)" % (yaxis, xaxis, rSquared)
+
+    title(figtitle)
+
+    if markDiag:
+        minVal = forcexmin
+        if forceymin < minVal:
+            minVal = forceymin
+
+        maxVal = xmax
+        if ymax > maxVal:
+            maxVal = ymax
+
+        if forcexmax > maxVal:
+            maxVal = forcexmax
+
+        if forceymax > maxVal:
+            maxVal = forceymax
+
+        plot([minVal, maxVal], [minVal, maxVal], "-g", linewidth=2)
+
+    print forcexmin, forceymin
+
+    if doLogF2:
+        ylabel("log%s(%s)" % (str(base), yaxis))
+    else:
+        ylabel(yaxis)
+
+    if doLogF1:
+        xlabel("log%s(%s)" % (str(base), xaxis))
+    else:
+        xlabel(xaxis)
+
+    if xmax > 0:
+        xlim(forcexmin - 0.05, xmax)
+
+    if ymax > 0:
+        ylim(forceymin - 0.05, ymax)
+
+    if forcexmax > 0 and forceymax > 0:
+        xlim(forcexmin - 0.05, forcexmax)
+        ylim(forceymin - 0.05, forceymax)
+
+    gca().get_xaxis().tick_bottom()
+    gca().get_yaxis().tick_left()
+
+    savefig(outfilename, dpi=100)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/siteintersects.py b/siteintersects.py
new file mode 100755 (executable)
index 0000000..ba0f1cd
--- /dev/null
@@ -0,0 +1,147 @@
+#
+#  siteintersects.py
+#  ENRAGE
+#
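+#  Reports sites from two region files whose midpoints overlap.
+#  A hedged usage sketch, assuming hypothetical site files:
+#      python siteintersects.py sites_rep1.txt sites_rep2.txt common.txt --reject only1.txt only2.txt
+#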
+
+import sys
+
+print "%s: version 2.0" % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 4:
+        print "usage: python %s sitefile1 sitefile2 outfile [--reject rejectfile1 rejectfile2] [--expanded]" % argv[0]
+        sys.exit(1)
+
+    sitefilename1 = argv[1]
+    sitefilename2 = argv[2]
+    outfilename = argv[3]
+
+    doReject = False
+    reject1filename = None
+    reject2filename = None
+    if "--reject" in sys.argv:
+        reject1filename = sys.argv[sys.argv.index("--reject") + 1]
+        reject2filename = sys.argv[sys.argv.index("--reject") + 2]
+        doReject = True
+
+    doExpanded = False
+    if "--expanded" in sys.argv:
+        doExpanded = True
+
+    siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename, reject2filename, doReject, doExpanded)
+
+
+def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+
+    siteDict = {}
+    file1Dict = {}
+
+    infile1count = 0
+    infile = open(sitefilename1)
+    infile.readline()
+    for line in infile:
+        if line[0] == "#":
+            continue
+
+        infile1count += 1
+        fields = line.strip().split()
+        if doExpanded:
+            chrom = fields[1][3:]
+            start = int(fields[2])
+            stop = int(fields[3])
+            rest = fields[4:]
+        else:
+            (chrom, pos) = fields[0].split(":")
+            chrom = chrom[3:]
+            (start, stop) = pos.split("-")
+            start = int(start)
+            stop = int(stop)
+            rest = fields[1:]
+
+        try:
+            siteDict[chrom].append((start, stop, rest))
+        except:
+            siteDict[chrom] = [(start, stop, rest)]
+
+        if doReject:
+            file1Dict[str((chrom, start, stop, rest))] = line
+
+    infile.close()
+
+    print "file1: %d" % infile1count
+
+    infile2count = 0
+    infile = open(sitefilename2)
+    infile.readline()
+
+    commonSites = 0
+    unique2List = []
+    outfile = open(outfilename, "w")
+    for line in infile:
+        if line[0] == "#":
+            continue
+
+        infile2count += 1
+        fields = line.strip().split()
+        if doExpanded:
+            chrom = fields[1][3:]
+            start = int(fields[2])
+            stop = int(fields[3])
+            rest = fields[4:]
+        else:
+            (chrom, pos) = fields[0].split(":")
+            chrom = chrom[3:]
+            (start, stop) = pos.split("-")
+            rest = str(fields[1:])
+
+        start = int(start)
+        stop = int(stop)
+        mid = start + abs(stop - start)/2
+        if chrom not in siteDict:
+            if doReject:
+                unique2List.append(line)
+
+            continue
+
+        twoNotCommon = True
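+        # a file2 site is called common when its midpoint falls within half the
+        # width of a file1 site on the same chromosome, e.g. a file1 site at
+        # 100-200 (mid 150, half-width 50) matches any file2 midpoint in (100, 200)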
+        for (rstart, rstop, rline) in siteDict[chrom]:
+            rsize = abs(rstart - rstop) /2
+            rmid = rstart + abs(rstop - rstart)/2
+            if abs(mid - rmid) < rsize:
+                commonSites += 1
+                if twoNotCommon:
+                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
+                    twoNotCommon = False
+
+                try:
+                    if doReject:
+                        del file1Dict[str((chrom, rstart, rstop, rline))]
+                except:
+                    pass
+
+        if doReject and twoNotCommon:
+            unique2List.append(line)
+
+    outfile.close()
+
+    print "file2: %d" % infile2count
+
+    if doReject:
+        reject1file = open(reject1filename, "w")
+        reject2file = open(reject2filename, "w")
+
+        for key in file1Dict:
+            reject1file.write(file1Dict[key])
+
+        for line in unique2List:
+            reject2file.write(line)
+
+        reject1file.close()
+        reject2file.close()
+
+    print "common sites: %d" % commonSites
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/stallCategory.py b/stallCategory.py
new file mode 100755 (executable)
index 0000000..92cd519
--- /dev/null
@@ -0,0 +1,165 @@
+#
+#  stallCategory.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+import optparse
+
+print "%prog: version 1.1"
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog stalledPercentFile1 stalledPercentFile2 transcriptFile [--out oufile] [--statout statoutfile] [--expression level]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--out", dest="outFileName")
+    parser.add_option("--statout", dest="statOutFileName")
+    parser.add_option("--expression", type="float", dest="expressionLevel")
+    parser.set_defaults(outFileName=None, statOutFileName=None, expressionLevel=0.9)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    infile1 = args[0]
+    infile2 = args[1]
+    transcriptFile = args[2]
+
+    stallCategory(infile1, infile2, transcriptFile, options.outFileName, options.statOutFileName, options.expressionLevel)
+
+
+def stallCategory(inFile1Name, inFile2Name, transcriptFileName, outFileName=None, statOutFileName=None, expressionLevel=0.9):
+
+    infile1 = open(inFile1Name)
+    infile2 = open(inFile2Name)
+    transcriptFile = open(transcriptFileName)
+
+    writeOut = False
+    if outFileName is not None:
+        outfile = open(outFileName, "w")
+        outfile.write("gene\texpression\tratio1\tpromAmount1\ttotal1\trestRPKM1\tratio2\tpromAmount2\ttotal2\trestRPKM2\n")
+        writeOut = True
+
+    statWriteOut = False
+    if statOutFileName is not None:
+        statoutfile = open(statOutFileName, "w")
+        statoutfile.write("ExpressionR1R2Stalled1Stalled2\tCount\n")
+        statWriteOut = True
+
+    dictOne = {}
+    dictTwo = {}
+    expressionDict = {}
+
+    for line in infile1:
+        if "short" in line:
+            continue
+
+        fields = line.strip().split()
+        promAmount = float(fields[4]) + float(fields[5])
+        genelen = float(fields[3])/100
+        total = float(fields[2])
+        if total < 0.1:
+            total = 0.1
+
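+        # restRPKM estimates expression over the rest of the gene: the promoter
+        # fraction is removed from the total and divided by the gene length (in
+        # units of 100 bp) minus 0.6, presumably a 600 bp promoter window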
+        restRPKM = (total * (1. - promAmount/100.))/ (genelen - 0.6)
+        ratio = float(fields[-1])
+        dictOne[fields[1]] = (ratio, promAmount, total, restRPKM)
+
+    for line in infile2:
+        if "short" in line:
+            continue
+
+        fields = line.strip().split()
+        promAmount = float(fields[4]) + float(fields[5])
+        genelen = float(fields[3])/100
+        if promAmount == 0.:
+            promAmount = 0.1
+
+        total = float(fields[2])
+        if total < 0.1:
+            total = 0.1
+
+        restRPKM = (total * (1. - promAmount/100.))/ (genelen - 0.6)
+        ratio = float(fields[-1])
+        dictTwo[fields[1]] = (ratio, promAmount, total, restRPKM)
+
+    for line in transcriptFile:
+        (gene, transc, transcpercell) = line.strip().split()
+        expressionDict[gene] = float(transcpercell)
+
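+    # each gene gets a five-letter category code: E/N for expression above
+    # expressionLevel, Y/N for total1 > 5, Y/N for total2 > 5, then H/L for
+    # ratio1 > 15 and H/L for ratio2 > 15; e.g. "EYYHL" is an expressed gene,
+    # detected in both samples, with a high stalling ratio only in sample 1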
+    categoryList = []
+    categoryDict = {}
+    for atype in ["HH", "HL", "LH", "LL"]:
+        for expression in ["E", "N"]:
+            for cat1 in ["Y", "N"]:
+                for cat2 in ["Y", "N"]:
+                    category = expression + cat1 + cat2 + atype
+                    categoryList.append(category)
+                    categoryDict[category] = []
+
+    for gene in dictOne:
+        if gene not in expressionDict:
+            if writeOut:
+                print "%s is not in expressionDict - skipping" % gene
+
+            continue
+
+        expression = expressionDict[gene]
+        (ratio1, promAmount1, total1, restRPKM1) = dictOne[gene]
+        (ratio2, promAmount2, total2, restRPKM2) = dictTwo[gene]
+
+        if expression > expressionLevel:
+            category = "E"
+        else:
+            category = "N"
+
+        if total1 > 5.0:
+            category += "Y"
+        else:
+            category += "N"
+
+        if total2 > 5.0:
+            category += "Y"
+        else:
+            category += "N"
+
+        if ratio1 > 15:
+            category += "H"
+        else:
+            category += "L"
+
+        if ratio2 > 15:
+            category += "H"
+        else:
+            category += "L"
+
+        categoryDict[category].append(gene)
+        if writeOut:
+            outfile.write("%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category)
+)
+        else:
+            print "%s %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %s" % (gene, expression, ratio1, promAmount1, total1, restRPKM1, ratio2, promAmount2, total2, restRPKM2, category)
+
+    if writeOut:
+        outfile.close()
+
+    for category in categoryList:
+        if statWriteOut:
+            statoutfile.write("%s\t%d\n" % (category, len(categoryDict[category])))
+        else:
+            print "%s %d" % (category, len(categoryDict[category]))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/test/testAnalyzeGO.py b/test/testAnalyzeGO.py
new file mode 100644 (file)
index 0000000..dacf4a2
--- /dev/null
@@ -0,0 +1,84 @@
+'''
+Created on Aug 26, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import analyzego
+
+
+class TestAnalyzeGO(unittest.TestCase):
+    genome = "celegans"
+    prefix = "testGO"
+    inFileName = "testAnayzeGOInput.txt"
+
+    def setUp(self):
+        infile = open(self.inFileName, "w")
+        infile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.inFileName)
+        except OSError:
+            pass
+
+        try:
+            os.remove("%s.gostat" % self.prefix)
+        except OSError:
+            pass
+
+        try:
+            os.remove("%s.gozscore" % self.prefix)
+        except OSError:
+            pass
+
+        try:
+            os.remove("%s.gosig" % self.prefix)
+        except OSError:
+            pass
+
+
+    #TODO: write more tests
+    def testAnalyzeGO(self):
+        geneInfoList = []
+        analyzego.analyzeGO(self.genome, geneInfoList, self.prefix)
+        self.assertRaises(IOError, open, "%s.gostat" % self.prefix, "r")
+        self.assertRaises(IOError, open, "%s.gozscore" % self.prefix, "r")
+        self.assertRaises(IOError, open, "%s.gosig" % self.prefix, "r")
+
+        geneInfoList = ["worm\tgeneID"]
+        analyzego.analyzeGO(self.genome, geneInfoList, self.prefix)
+        statfile = open("%s.gostat" % self.prefix, "r")
+        stats = statfile.readlines()
+        print len(stats)
+
+        statfile.close()
+        scorefile = open("%s.gozscore" % self.prefix, "r")
+        scores = scorefile.readlines()
+        print len(scores)
+
+        scorefile.close()
+        sigfile = open("%s.gosig" % self.prefix, "r")
+        sigs = sigfile.readlines()
+        print len(sigs)
+
+        sigfile.close()
+
+
+    def testMain(self):
+        argv = ["analyzego", self.genome, self.inFileName, self.prefix]
+        analyzego.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestAnalyzeGO))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testChkSNP_input.txt b/test/testChkSNP_input.txt
new file mode 100644 (file)
index 0000000..f9f36de
--- /dev/null
@@ -0,0 +1,3 @@
+# header line
+foo    foo     chr1    691
+foo2   foo2    chr1    81752
diff --git a/test/testChksnp.py b/test/testChksnp.py
new file mode 100644 (file)
index 0000000..3f2e8ae
--- /dev/null
@@ -0,0 +1,199 @@
+'''
+Created on Aug 25, 2010
+
+@author: sau
+'''
+import unittest
+import string
+import os
+from Erange import chksnp
+
+dbPath = "/Users/sau/work/snpdb/hg18"
+
+class TestChksnp(unittest.TestCase):
+    """ First entries from snpDB using select func, name, start, stop from snp where chrom="1" limit 4;
+        unknown|rs10218492|690|691
+        unknown|rs10218493|766|767
+        unknown|rs10218527|789|790
+        unknown|rs28853987|800|801
+
+        Entry from altSnpDB not in snpDB
+        unknown|rs17160650|81751|81752
+    """
+
+    snpDB = "%s/dbSNP128.db" % dbPath
+    altSnpDB = "%s/snp129cDNA.db" % dbPath
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testChkSNPFile(self):
+        inputFileName = "testChkSNP_input.txt"
+        infile = open(inputFileName, "w")
+        infile.write("# header line\n")
+        snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
+        infile.write("%s\n" % snpEntry)
+        snpEntry = string.join(["foo2", "foo2", "chr1", "81752"], "\t")
+        infile.write("%s\n" % snpEntry)
+        infile.close()
+
+        outputFileName = "testChkSNP_output.txt"
+
+        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName)
+        outfile = open(outputFileName, "r")
+        line = outfile.readline()
+        result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
+        self.assertEquals(result, line)
+        result = "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A\n"
+        line = outfile.readline()
+        self.assertEquals(result, line)
+        outfile.close()
+        os.remove(outputFileName)
+
+        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB])
+        outfile = open(outputFileName, "r")
+        line = outfile.readline()
+        result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
+        self.assertEquals(result, line)
+        result = "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown\n"
+        line = outfile.readline()
+        self.assertEquals(result, line)
+        outfile.close()
+
+        os.remove(inputFileName)
+        os.remove(outputFileName)
+
+
+    def testMain(self):
+        inputFileName = "testChkSNP_input.txt"
+        infile = open(inputFileName, "w")
+        infile.write("# header line\n")
+        snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
+        infile.write("%s\n" % snpEntry)
+        snpEntry = string.join(["foo2", "foo2", "chr1", "81752"], "\t")
+        infile.write("%s\n" % snpEntry)
+        infile.close()
+
+        outputFileName = "testChkSNP_output.txt"
+
+        argv = ["chksnp", self.snpDB, inputFileName, outputFileName]
+        chksnp.main(argv)
+        outfile = open(outputFileName, "r")
+        line = outfile.readline()
+        result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
+        self.assertEquals(result, line)
+        result = "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A\n"
+        line = outfile.readline()
+        self.assertEquals(result, line)
+        outfile.close()
+        os.remove(outputFileName)
+
+    def testChkSNP(self):
+        snpPropertiesList = []
+        dbList = [self.snpDB]
+        self.assertEquals({}, chksnp.chkSNP(dbList, snpPropertiesList))
+
+        snpPropertiesList = ["# header line"]
+        snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
+        snpPropertiesList.append(snpEntry)
+        snpEntry = string.join(["foo2", "foo2", "chr1", "81752"], "\t")
+        snpPropertiesList.append(snpEntry)
+        dbList = [self.snpDB, self.altSnpDB]
+        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
+                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"}
+        self.assertEquals(result, chksnp.chkSNP(dbList, snpPropertiesList))
+
+
+    def testGetSNPLocationInfo(self):
+        snpPropertiesList = []
+        snpEntry = string.join(["foo", "foo", "chr1", "20"], "\t")
+        snpPropertiesList.append(snpEntry)
+        snpLocationList, snpDict = chksnp.getSNPLocationInfo(snpPropertiesList)
+        self.assertEquals([("1", 20)], snpLocationList)
+        self.assertEquals({("1", 20): "foo\tfoo\tchr1\t20"}, snpDict)
+
+        snpPropertiesList = ["# header line"]
+        snpEntry = string.join(["foo", "foo", "chr1", "20"], "\t")
+        snpPropertiesList.append(snpEntry)
+        snpLocationList, snpDict = chksnp.getSNPLocationInfo(snpPropertiesList)
+        self.assertEquals([("1", 20)], snpLocationList)
+        self.assertEquals({("1", 20): "foo\tfoo\tchr1\t20"}, snpDict)
+
+
+    def testDoNotProcessLine(self):
+        self.assertTrue(chksnp.doNotProcessLine("#anything"))
+        self.assertFalse(chksnp.doNotProcessLine("line to process"))
+
+
+    def testAnnotateSNPFromDB(self):
+        snpLocationList = [("1", 691), ("1", 81752)]
+        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
+                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
+        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
+                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A"}
+        self.assertEquals(result, chksnp.annotateSNPFromDB(snpLocationList, snpDict, self.snpDB))
+
+        snpLocationList = [("1", 691), ("1", 81752)]
+        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
+                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
+        result = {("1", 691): "foo\tfoo\tchr1\t691\tN\\A\tN\\A",
+                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"}
+        self.assertEquals(result, chksnp.annotateSNPFromDB(snpLocationList, snpDict, self.altSnpDB))
+
+
+    def testAnnotateSNPFromDBList(self):
+        snpLocationList = []
+        snpDict = {}
+        dbList = [self.snpDB]
+        self.assertEquals({}, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))
+
+        snpLocationList = [("1", 21)]
+        snpDict = {("1", 21): "foo\tfoo\tchr1\t21"}
+        dbList = [self.snpDB]
+        result = {("1", 21): "foo\tfoo\tchr1\t21\tN\\A\tN\\A"}
+        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))
+
+        snpLocationList = [("1", 21)]
+        snpDict = {("1", 21): "foo\tfoo\tchr1\t21"}
+        dbList = [self.snpDB]
+        result = {("1", 21): "foo\tfoo\tchr1\t21\tN\\A\tN\\A"}
+        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=10000))
+
+        snpLocationList = [("1", 691)]
+        snpDict = {("1", 691): "foo\tfoo\tchr1\t691"}
+        dbList = [self.snpDB]
+        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown"}
+        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))
+
+        snpLocationList = [("1", 691), ("1", 81752)]
+        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
+                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
+        dbList = [self.snpDB]
+        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
+                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\tN\\A\tN\\A"}
+        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))
+
+        snpLocationList = [("1", 691), ("1", 81752)]
+        snpDict = {("1", 691): "foo\tfoo\tchr1\t691",
+                   ("1", 81752): "foo2\tfoo2\tchr1\t81752"}
+        dbList = [self.snpDB, self.altSnpDB]
+        result = {("1", 691): "foo\tfoo\tchr1\t691\trs10218492\tunknown",
+                  ("1", 81752): "foo2\tfoo2\tchr1\t81752\trs17160650\tunknown"}
+        self.assertEquals(result, chksnp.annotateSNPFromDBList(snpLocationList, snpDict, dbList))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestChksnp))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testCommoncode.py b/test/testCommoncode.py
new file mode 100644 (file)
index 0000000..1ea4f80
--- /dev/null
@@ -0,0 +1,555 @@
+'''
+Created on Aug 30, 2010
+
+@author: sau
+'''
+import unittest
+import os
+import string
+from array import array
+from Erange import commoncode
+from cistematic.genomes import Genome
+
+
+class TestCommoncode(unittest.TestCase):
+    logFile = "testLogFile"
+    celegansChroms = ["I", "II", "III", "IV", "V", "X", "MtDNA"]
+    genome = Genome("celegans")
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.logFile)
+        except OSError:
+            pass
+
+
+    def testGetReverseComplement(self):
+        self.assertEquals("T", commoncode.getReverseComplement("A"))
+        self.assertEquals("A", commoncode.getReverseComplement("T"))
+        self.assertEquals("C", commoncode.getReverseComplement("G"))
+        self.assertEquals("G", commoncode.getReverseComplement("C"))
+        self.assertEquals("N", commoncode.getReverseComplement("N"))
+        self.assertRaises(KeyError, commoncode.getReverseComplement, "")
+        self.assertRaises(KeyError, commoncode.getReverseComplement, "B")
+
+
+    def testCountDuplicatesInList(self):
+        testList = []
+        self.assertEquals([], commoncode.countDuplicatesInList(testList))
+
+        testList = [0, 1]
+        result = [(0, 1), (1, 1)]
+        self.assertEquals(result, commoncode.countDuplicatesInList(testList))
+
+        testList = [0, 1, 1]
+        result = [(0, 1), (1, 2)]
+        self.assertEquals(result, commoncode.countDuplicatesInList(testList))
+
+        testList = [0, 1, 2, 1]
+        result = [(0, 1), (1, 2), (2, 1)]
+        self.assertEquals(result, commoncode.countDuplicatesInList(testList))
+
+
+    def testWriteLog(self):
+        messenger = "testMessenger"
+        message = "testMessage"
+
+        commoncode.writeLog(self.logFile, messenger, message)
+        file = open(self.logFile)
+        line = file.readline()
+        fields = line.split()
+        self.assertEquals(fields[2], "[%s]" % messenger)
+        self.assertEquals(fields[3], message)
+        line = file.readline()
+        self.assertEquals("", line)
+
+        messenger2 = "testMessenger2"
+        message2 = "testMessage2"
+
+        commoncode.writeLog(self.logFile, messenger2, message2)
+        file = open(self.logFile)
+        line = file.readline()
+        fields = line.split()
+        self.assertEquals(fields[2], "[%s]" % messenger)
+        self.assertEquals(fields[3], message)
+        line = file.readline()
+        fields = line.split()
+        self.assertEquals(fields[2], "[%s]" % messenger2)
+        self.assertEquals(fields[3], message2)
+        line = file.readline()
+        self.assertEquals("", line)
+
+        os.remove(self.logFile)
+
+        commoncode.writeLog(self.logFile, messenger, message)
+        file = open(self.logFile)
+        line = file.readline()
+        fields = line.split()
+        self.assertEquals(fields[2], "[%s]" % messenger)
+        self.assertEquals(fields[3], message)
+        line = file.readline()
+        self.assertEquals("", line)
+
+        os.remove(self.logFile)
+
+        commoncode.writeLog(self.logFile, "", message)
+        file = open(self.logFile)
+        line = file.readline()
+        fields = line.split()
+        self.assertEquals(fields[2], "[]")
+        self.assertEquals(fields[3], message)
+        line = file.readline()
+        self.assertEquals("", line)
+
+        os.remove(self.logFile)
+
+        commoncode.writeLog(self.logFile, "", "")
+        file = open(self.logFile)
+        line = file.readline()
+        fields = line.split()
+        self.assertEquals(fields[2], "[]")
+        self.assertEquals(3, len(fields))
+        line = file.readline()
+        self.assertEquals("", line)
+
+
+    def testGetMergedRegions(self):
+        testfile = open("regionTestFile", "w")
+        regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
+        testfile.write(regionEntry)
+        testfile.close()
+        result = {"1": [(10, 20, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegions("regionTestFile"))
+        os.remove("regionTestFile")
+
+
+    def testGetMergedRegionsFromList(self):
+        self.assertEquals({}, commoncode.getMergedRegionsFromList([]))
+
+        regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
+        regionList = [regionEntry]
+        result = {"1": [(10, 20, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+        result = {"1": [(5, 25, 20)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5))
+        result = {"1": [(12, 18, 6)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2))
+        result = {"chr1": [(10, 20, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True))
+
+        regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
+        regionList = [regionEntry]
+        result = {"1": [(10, 20, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2))
+
+        regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
+        regionList = [regionEntry]
+        regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [(10, 40, 30)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+        result = {"1": [(10, 20, 10), (15, 40, 25)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False))
+        result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True))
+
+        regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
+        regionList = [regionEntry]
+        regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2))
+
+        regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
+        regionList = [regionEntry]
+        regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+        result = {"1": [(10, 2040, 2030)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000))
+        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5))
+        result = {"1": [(2030, 2040, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12))
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1))
+
+        regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
+        regionList = [regionEntry]
+        regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
+        regionList.append(regionEntry)
+        result = {"2": [(15, 40, 25)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+        self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
+        self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
+        self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
+
+        regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
+        regionList = [regionEntry]
+        result = {"1": [(10, 20, 10)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
+        result = {"1": [(10, 20, 10, 3, 40)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        result = {"1": [("1", 10, 20, 10, 3, 40)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)],
+                  "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)]
+        }
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False))
+
+        regionList = ["# comment"]
+        regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [(10, 20, 10, 3, 40)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        regionList = ["# pvalue"]
+        regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
+        regionList.append(regionEntry)
+        result = {"1": [(10, 20, 10, 3, 40)]}
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        regionList = ["# readShift"]
+        regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
+        regionList.append(regionEntry)
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        regionList = ["# pvalue readShift"]
+        regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
+        regionList.append(regionEntry)
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        # Test fails - the header line is required when there are fields after the peak, which is not ideal
+        #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
+
+
+    def testRegionsOverlap(self):
+        self.assertTrue(commoncode.regionsOverlap(100, 200, 1, 300))
+        self.assertTrue(commoncode.regionsOverlap(100, 200, 150, 300))
+        self.assertTrue(commoncode.regionsOverlap(100, 500, 1, 300))
+        self.assertTrue(commoncode.regionsOverlap(100, 200, 110, 160))
+
+        self.assertFalse(commoncode.regionsOverlap(100, 200, 250, 300))
+        self.assertFalse(commoncode.regionsOverlap(100, 200, 1, 60))
+
+        self.assertFalse(commoncode.regionsOverlap(-200, -100, 1, 300))
+        self.assertFalse(commoncode.regionsOverlap(100, 200, -300, -1))
+
+        self.assertTrue(commoncode.regionsOverlap(-200, -100, -300, -1))
+
+        self.assertTrue(commoncode.regionsOverlap(-100, -200, -300, -1))
+        self.assertTrue(commoncode.regionsOverlap(-200, -100, -1, -300))
+        self.assertTrue(commoncode.regionsOverlap(-100, -200, -1, -300))
+
+
+    def testRegionsAreWithinDistance(self):
+        self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 40, 50, 30))
+        self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 1, 5, 5))
+        self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 25, 50, 10))
+        self.assertTrue(commoncode.regionsAreWithinDistance(10, 20, 1, 5, 5))
+
+        self.assertFalse(commoncode.regionsAreWithinDistance(10, 20, 100, 150, 5))
+        self.assertFalse(commoncode.regionsAreWithinDistance(100, 200, 10, 15, 5))
+
+        self.assertTrue(commoncode.regionsAreWithinDistance(20, 10, 30, 150, 10))
+        self.assertFalse(commoncode.regionsAreWithinDistance(20, 10, 100, 150, 5))
+        self.assertFalse(commoncode.regionsAreWithinDistance(10, 20, 150, 100, 5))
+
+
+    #TODO: write test
+    def testFindPeak(self):
+        hitList = []
+        result = ([], 0.0, array("f"), 0.0)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
+
+        hitList= [[4, "+", 0.5]]
+        result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
+        result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True))
+        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto"))
+        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
+
+        hitList= [[4, "+", 0.5]]
+        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
+
+        hitList= [[4, "+", 0.5]]
+        result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
+        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
+        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True))
+
+
+    #TODO: write test
+    def testGetBestShiftForRegion(self):
+        hitList = [[14, "-", 1.0], [16, "-", 1.0], [24, "+", 1.0],  [26, "+", 10.0]]
+        self.assertEquals(74, commoncode.getBestShiftForRegion(hitList, 0, 100))
+        self.assertEquals(16, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=30))
+        self.assertEquals(0, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=10))
+
+
+    #TODO: write test
+    def testGetFeaturesByChromDict(self):
+        firstFeatures = {"I": (4123, 4219, "Y74C9A.3", "R", "3UTR"),
+                         "II": (1866, 1910, "2L52.1", "F", "CDS"),
+                         "III": (1270, 1506, "cTel54X.1", "R", "CDS"),
+                         "IV": (694, 1064, "Y38C1AB.4", "F", "CDS"),
+                         "V": (1479, 1578, "cTel3X.1", "F", "CDS"),
+                         "X": (3622, 4099, "CE7X_3.1", "F", "CDS"),
+                         "MtDNA": (112, 543, "MTCE.3", "F", "CDS")
+        }
+        featureDict = commoncode.getFeaturesByChromDict(self.genome)
+        for chrom in featureDict.keys():
+            self.assertTrue(chrom in self.celegansChroms)
+            self.assertEquals(firstFeatures[chrom], featureDict[chrom][0])
+
+        restrictList = ["almost certainly not a value feature"]
+        featureDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList)
+        self.assertEquals({}, featureDict)
+
+        restrictList = ["Y74C9A.3"]
+        featureDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList)
+        self.assertEquals(["I"], featureDict.keys())
+        featureDict, complementDict = commoncode.getFeaturesByChromDict(self.genome, restrictList=restrictList, regionComplement=True)
+        result = {"I": [(0, 4123, "nonExon1", "F", "nonExon"),
+                        (4219, 4220, "nonExon2", "F", "nonExon"),
+                        (4357, 5194, "nonExon3", "F", "nonExon"),
+                        (5295, 6036, "nonExon4", "F", "nonExon"),
+                        (6326, 9726, "nonExon5", "F", "nonExon"),
+                        (9845, 10094, "nonExon6", "F", "nonExon"),
+                        (10147, 10148, "nonExon7", "F", "nonExon"),
+                        (10231, 250000000, "nonExon8", "F", "nonExon")]
+        }
+        self.assertEquals(result, complementDict)
+
+        regionDict = {"I": [("new feature", 100, 150, 50)]}
+        featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
+        result = (100, 150, "new feature", "+", "custom")
+        self.assertEquals(result, featureDict["I"][0])
+
+
+    def testGetLocusByChromDict(self):
+        firstLoci = {"I": (4123, 10231, "Y74C9A.3", 6108),
+                     "II": (1866, 4662, "2L52.1", 2796),
+                     "III": (1270, 2916, "cTel54X.1", 1646),
+                     "IV": (694, 14925, "Y38C1AB.4", 14231),
+                     "V": (1479, 3038, "cTel3X.1", 1559),
+                     "X": (3622, 7153, "CE7X_3.1", 3531),
+                     "MtDNA": (112, 548, "MTCE.3", 436)
+        }
+
+        self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, useCDS=False))
+        self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=1, downstream=1, useCDS=False))
+        self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=-1, downstream=-1, useCDS=False, lengthCDS=1))
+        self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstreamSpanTSS=True, lengthCDS=1))
+        self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, downstream=1, lengthCDS=1))
+        self.assertEquals({}, commoncode.getLocusByChromDict(self.genome, upstream=1, lengthCDS=-1))
+
+        locusDict = commoncode.getLocusByChromDict(self.genome)
+        for chrom in locusDict.keys():
+            self.assertTrue(chrom in self.celegansChroms)
+            self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
+
+        regionDict = {"I": [("new region", 100, 150, 50)]}
+        locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
+        self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
+        locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
+        self.assertEquals((100, 150, "new region", 50, "+"), locusDict["I"][0])
+
+        # Long Test
+        #locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, useCDS=False, upstream=100)
+        #self.assertEquals((150, 250, "new region", 100), locusDict["I"][0])
+
+        # Long Test
+        #locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, useCDS=False, downstream=10)
+        #self.assertEquals((90, 100, "new region", 10), locusDict["I"][0])
+
+
+    def testComputeRegionBins(self):
+        regionsByChromDict = {}
+        hitDict = {}
+        bins = 4
+        readlen = 10
+        result = ({}, {})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        result = ({"regionID": [0.0, 0.0, 0.0, 0.0]}, {})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[1, "+", 1.0]]}
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+                              "2": [("regionID2", 1, 1000, 1000, "F")]
+        }
+        hitDict = {"1": [[1, "+", 1.0]],
+                   "2": [[1, "+", 1.0]]
+        }
+        result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5]]}
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [15, "+", 1.0]]}
+        result = ({"regionID": [2.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [200, "+", 2.0]]}
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[1, "+", 1.0]]}
+        regionList = ["regionID"]
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[1, "+", 1.0]]}
+        regionList = ["empty region"]
+        result = ({"empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+                              "2": [("regionID2", 1, 1000, 1000, "F")]
+        }
+        hitDict = {"1": [[1, "+", 1.0]],
+                   "2": [[1, "+", 1.0]]
+        }
+        regionList = ["regionID", "regionID2"]
+        result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+                              "2": [("regionID2", 1, 1000, 1000, "F")]
+        }
+        hitDict = {"1": [[1, "+", 1.0]],
+                   "2": [[1, "+", 1.0]]
+        }
+        regionList = ["empty region", "regionID2"]
+        result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
+                              "2": [("regionID2", 1, 1000, 1000, "F")]
+        }
+        hitDict = {"1": [[1, "+", 1.0]],
+                   "2": [[1, "+", 1.0]]
+        }
+        regionList = ["regionID2"]
+        result = ({"regionID2": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[1, "+", 1.0]]}
+        result = ({"regionID": [2.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, normalizedTag=2.0))
+
+        regionsByChromDict = {"1": [(1, 100, "regionID", 100, "F")]}
+        hitDict = {"1": [[1, "+", 1.0]]}
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, defaultRegionFormat=False))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0]]}
+        fixedFirstBin = 20
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0]]}
+        fixedFirstBin = 5
+        result = ({"regionID": [0.0, 1.0, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        fixedFirstBin = 20
+        result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[80, "+", 1.0], [85, "+", 0.5]]}
+        fixedFirstBin = 5
+        result = ({"regionID": [0.0, 1.5, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        binLength = 25
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        binLength = 50
+        result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        binLength = 15
+        result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
+        hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+        binLength = 15
+        result = ({"regionID": [1.0, 0.0, 0.7, 0.5]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+        regionsByChromDict = {"1": [("regionID", 1, 100, 100, "R")]}
+        hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+        result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
+        result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
+        fixedFirstBin = 10
+        result = ({"regionID": [0.0, 2.2, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+        fixedFirstBin = 20
+        result = ({"regionID": [0.5, 1.7, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
+        binLength = 50
+        result = ({"regionID": [0.5, 1.7, 0.0, 0.0]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+        binLength = 10
+        result = ({"regionID": [0.0, 0.5, 0.0, 1.7]}, {"regionID": 100})
+        self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestCommoncode))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testErange.py b/test/testErange.py
new file mode 100644 (file)
index 0000000..d9392ff
--- /dev/null
@@ -0,0 +1,66 @@
+'''
+Runs all unit tests for Erange.
+Test discovery will eventually be handled by unittest in Python 2.7+;
+test suites are used until then.
+
+Created on Sep 8, 2010
+
+@author: sau
+'''
+
+import sys
+import unittest
+import testAnalyzeGO
+import testChksnp
+import testCommoncode
+import testGeneMrnaCounts
+#import testGetFasta
+import testGetNovelSNPs
+import testGetSNPGeneInfo
+import testGetSNPs
+import testMakeBamFromRds
+import testmakebedfromrds
+#import testMakeGraphs
+import testMakeRdsFromBam
+import testMakeSNPTrack
+import testMarkLinkers
+import testPeaksToRegion
+import testProcessVelvet
+import testReadDataset
+import testRnaAToIFilter
+import testRnaEditing
+import testRNAPATH
+import testTranscripts
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    suite = unittest.TestSuite()
+    suite.addTest(testAnalyzeGO.suite())
+    suite.addTest(testChksnp.suite())
+    suite.addTest(testCommoncode.suite())
+    suite.addTest(testGeneMrnaCounts.suite())
+    #suite.addTest(testGetFasta.suite())
+    suite.addTest(testGetNovelSNPs.suite())
+    suite.addTest(testGetSNPGeneInfo.suite())
+    suite.addTest(testGetSNPs.suite())
+    suite.addTest(testMakeBamFromRds.suite())
+    suite.addTest(testmakebedfromrds.suite())
+    #suite.addTest(testMakeGraphs.suite())
+    suite.addTest(testMakeRdsFromBam.suite())
+    suite.addTest(testMakeSNPTrack.suite())
+    suite.addTest(testMarkLinkers.suite())
+    suite.addTest(testPeaksToRegion.suite())
+    suite.addTest(testProcessVelvet.suite())
+    suite.addTest(testReadDataset.suite())
+    suite.addTest(testRnaAToIFilter.suite())
+    suite.addTest(testRnaEditing.suite())
+    suite.addTest(testRNAPATH.suite())
+    #suite.addTest(testTranscripts.suite())
+
+    unittest.TextTestRunner(verbosity=2).run(suite)
+
+if __name__ == '__main__':
+    main(sys.argv)
\ No newline at end of file
diff --git a/test/testGeneMrnaCounts.py b/test/testGeneMrnaCounts.py
new file mode 100644 (file)
index 0000000..62f1649
--- /dev/null
@@ -0,0 +1,220 @@
+'''
+Created on Aug 19, 2010
+
+@author: sau
+
+Located feature 728439 by:
+    from Erange.commoncode import getFeaturesByChromDict
+    genome = Genome(self.genomeName)
+    featuresByChromDict = getFeaturesByChromDict(genome)
+    print featuresByChromDict["1"][:3]
+
+'''
+import unittest
+import os
+from Erange import geneMrnaCounts
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+from Erange.commoncode import readDataset
+
+
+class TestGeneMrnaCounts(unittest.TestCase):
+    idb = geneinfoDB(cache=True)
+    testDBName = "testRDS.rds"
+    genomeName = "hsapiens"
+    outfilename = "testGeneMrnaCounts.txt"
+
+    def setUp(self):
+        self.rds = readDataset(self.testDBName, initialize=True, datasetType="RNA", verbose=False)
+
+
+    def tearDown(self):
+        del(self.rds)
+        os.remove(self.testDBName)
+
+
+    def testGeneMrnaCounts(self):
+        geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename)
+        outfile = open(self.outfilename, "r")
+        for line in outfile:
+            fields = line.split("\t")
+            self.assertEquals("0\n", fields[2])
+
+        outfile.close()
+        os.remove(self.outfilename)
+
+        rdsEntryList = [("testRead", "chr1", 18700, 18800, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename)
+        possibleCounts = ["0\n", "1\n"]
+        outfile = open(self.outfilename, "r")
+        for line in outfile:
+            fields = line.split("\t")
+            self.assertTrue(fields[2] in possibleCounts)
+
+        outfile.close()
+        os.remove(self.outfilename)
+
+        geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename,
+                                      markGID=True, trackStrand=True)
+        
+        possibleCounts = ["0\n", "1\n"]
+        outfile = open(self.outfilename, "r")
+        for line in outfile:
+            fields = line.split("\t")
+            self.assertTrue(fields[2] in possibleCounts)
+
+        outfile.close()
+        os.remove(self.outfilename)
+        reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+        self.assertEquals("728439", reads["1"][0]["flag"])
+
+        geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename,
+                                      countFeats=True, markGID=True, cachePages=150000)
+
+        possibleCounts = ["0\n", "1\n"]
+        outfile = open(self.outfilename, "r")
+        for line in outfile:
+            fields = line.split("\t")
+            self.assertTrue(fields[2] in possibleCounts)
+
+        outfile.close()
+        os.remove(self.outfilename)
+        reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+        self.assertEquals("728439", reads["1"][0]["flag"])
+
+
+    def testCountFeatures(self):
+        testDict = {}
+        self.assertEquals(0, geneMrnaCounts.countFeatures(testDict))
+
+        testDict = {"chr1": []}
+        self.assertEquals(0, geneMrnaCounts.countFeatures(testDict))
+
+        #TODO: This is likely not the result we want
+        testDict = {"chr1": "not a list"}
+        self.assertEquals(10, geneMrnaCounts.countFeatures(testDict))
+
+        testDict = {"chr1": 10}
+        self.assertEquals(0, geneMrnaCounts.countFeatures(testDict))
+
+        testDict = {"chr1": 10,
+                    "chr2": ["f1"]}
+        self.assertEquals(1, geneMrnaCounts.countFeatures(testDict))
+
+        testDict = {"chr1": ["f1", "f2"]}
+        self.assertEquals(2, geneMrnaCounts.countFeatures(testDict))
+
+        testDict = {"chr1": ["f1", "f2"],
+                    "chr2": []}
+        self.assertEquals(2, geneMrnaCounts.countFeatures(testDict))
+
+        testDict = {"chr1": ["f1", "f2"],
+                    "chr2": ["f1"]}
+        self.assertEquals(3, geneMrnaCounts.countFeatures(testDict))
+
+
+    def testGetGeneSymbol(self):
+        # Case: Null/None inputs
+        gid = ""
+        searchGID = False
+        geneInfoDict = {}
+        idb = None
+        genomeName = ""
+        geneAnnotDict = {}
+        self.assertEquals("LOC", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+        # Case: symbol is in geneInfoDict
+        gid = "1"
+        searchGID = False
+        geneInfoDict = {"1": [["gene1", "wrong name"], ["wrong name 2"]]}
+        idb = None
+        genomeName = "test"
+        geneAnnotDict = {("test", "1"): ["wrong name 3"]}
+        self.assertEquals("gene1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+        # Case: symbol not in geneInfoDict, is in geneAnnotDict
+        gid = "1"
+        searchGID = False
+        geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]}
+        idb = None
+        genomeName = "test"
+        geneAnnotDict = {("test", "1"): ["gene1"]}
+        self.assertEquals("gene1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+        # Case: symbol not in geneInfoDict or geneAnnotDict - non-null/None inputs
+        gid = "1"
+        searchGID = False
+        geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]}
+        idb = None
+        genomeName = "test"
+        geneAnnotDict = {("test", "0"): ["wrong name 3"]}
+        self.assertEquals("LOC1", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+        # Case: using search, gid not in idb
+        gid = "almostCertainlyNotInTheIDB"
+        searchGID = True
+        geneInfoDict = {"0": [["wrong name"], ["wrong name 2"]]}
+        idb = self.idb
+        genomeName = "human"
+        geneAnnotDict = {("human", "0"): ["wrong name 3"]}
+        self.assertEquals("LOCalmostCertainlyNotInTheIDB", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+        # Case: using search
+        # sql to get gid: select gID from gene_info where genome="human" and locustag !="-" and locustag != symbol limit 5;
+        gid = "RP11-177A2.3"
+        searchGID = True
+        geneInfoDict = {"27": [["correct"], ["wrong name 2"]]}
+        idb = self.idb
+        genomeName = "human"
+        geneAnnotDict = {("human", "0"): ["wrong name 3"]}
+        self.assertEquals("correct", geneMrnaCounts.getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict))
+
+
+    def testWriteOutputFile(self):
+        genome = Genome(self.genomeName)
+        gidList = ["RP11-177A2.3"]
+        gidCount = {"RP11-177A2.3": 1}
+        geneMrnaCounts.writeOutputFile(self.outfilename, genome, gidList, gidCount, searchGID=False)
+
+        outfile = open(self.outfilename, "r")
+        line = outfile.readline()
+        result = "RP11-177A2.3\tLOCRP11-177A2.3\t1\n"
+        self.assertEquals(result, line)
+        outfile.close()
+        os.remove(self.outfilename)
+
+        genome = Genome("hsapiens")
+        gidList = ["RP11-177A2.3"]
+        gidCount = {"something else": 1}
+        geneMrnaCounts.writeOutputFile(self.outfilename, genome, gidList, gidCount, searchGID=False)
+
+        outfile = open(self.outfilename, "r")
+        line = outfile.readline()
+        result = "RP11-177A2.3\tLOCRP11-177A2.3\t0\n"
+        self.assertEquals(result, line)
+        outfile.close()
+        os.remove(self.outfilename)
+
+    def testMain(self):
+        argv = ["geneMRNACounts", self.genomeName, self.testDBName, self.outfilename]
+        geneMrnaCounts.main(argv)
+        outfile = open(self.outfilename, "r")
+        for line in outfile:
+            fields = line.split("\t")
+            self.assertEquals("0\n", fields[2])
+
+        outfile.close()
+        os.remove(self.outfilename)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestGeneMrnaCounts))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testGetFasta.py b/test/testGetFasta.py
new file mode 100644 (file)
index 0000000..cb71685
--- /dev/null
@@ -0,0 +1,231 @@
+'''
+Created on Aug 27, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import getfasta
+#from Erange import ReadDataset
+from Erange.commoncode import readDataset
+
+testDBName = "testRDS.rds"
+
+
+class TestGetFasta(unittest.TestCase):
+
+
+    def setUp(self):
+        self.regionDict = {}
+        self.minHitThresh = -1
+        self.maxsize = 3000
+        self.outfilename = "testFileForTestGetFasta.fa"
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.outfilename)
+        except OSError:
+            print "fasta file does not exist"
+
+        try:
+            os.remove(testDBName)
+        except OSError:
+            print "RDS file does not exist"
+
+
+    def testGetDefaultRegion(self):
+        self.assertEquals({}, getfasta.getDefaultRegion(self.regionDict, self.maxsize))
+
+        regionDict = {"1": [],
+                      "2": []
+        }
+        result = {"2": [],
+                  "1": []
+        }
+        self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10)],
+                      "2": []
+        }
+        result = {"2": [],
+                  "1": [{"start": 10, "length": 10, "topPos": [-1]}]
+        }
+        self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10)],
+                      "2": [(11, 21, 11)]
+        }
+        result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}],
+                  "1": [{"start": 10, "length": 10, "topPos": [-1]}]
+        }
+        self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10), (100, 4000, 3900)],
+                      "2": [(11, 21, 11)]
+        }
+        result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}],
+                  "1": [{"start": 10, "length": 10, "topPos": [-1]}]
+        }
+        self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10), (100, 4000, 3900), (50, 60, 10)],
+                      "2": [(11, 21, 11)]
+        }
+        result = {"2": [{"start": 11, "length": 11, "topPos": [-1]}],
+                  "1": [{"start": 10, "length": 10, "topPos": [-1]},
+                        {"start": 50, "length": 10, "topPos": [-1]}]
+        }
+        self.assertEquals(result, getfasta.getDefaultRegion(regionDict, self.maxsize))
+
+
+    def testGetRegionUsingPeaks(self):
+        self.assertEquals({}, getfasta.getRegionUsingPeaks(self.regionDict, self.minHitThresh, self.maxsize))
+
+        regionDict = {"1": [],
+                      "2": []
+        }
+        result = {"2": [],
+                  "1": []
+        }
+        self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10, 15, 1)],
+                      "2": []
+        }
+        result = {"2": [],
+                  "1": [{"start": 10, "length": 10, "topPos": [5]}]
+        }
+        self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+        result = {"2": [],
+                  "1": []
+        }
+        self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, 3, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10, 15, 1)],
+                      "2": [(11, 21, 11, 18, 1)]
+        }
+        result = {"2": [{"start": 11, "length": 11, "topPos": [7]}],
+                  "1": [{"start": 10, "length": 10, "topPos": [5]}]
+        }
+        self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10, 15, 1), (100, 4000, 3900, 111, 1)],
+                      "2": [(11, 21, 11, 18, 1)]
+        }
+        result = {"2": [{"start": 11, "length": 11, "topPos": [7]}],
+                  "1": [{"start": 10, "length": 10, "topPos": [5]}]
+        }
+        self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+        regionDict = {"1": [(10, 20, 10, 15, 1), (100, 4000, 3900, 111, 1), (50, 60, 10, 59, 1)],
+                      "2": [(11, 21, 11, 18, 1)]
+        }
+        result = {"2": [{"start": 11, "length": 11, "topPos": [7]}],
+                  "1": [{"start": 10, "length": 10, "topPos": [5]},
+                        {"start": 50, "length": 10, "topPos": [9]}]
+        }
+        self.assertEquals(result, getfasta.getRegionUsingPeaks(regionDict, self.minHitThresh, self.maxsize))
+
+
+    #TODO: write test.  This does not seem to make sense: we always return a "topPos" of range(rlen).
+    # Need to check whether the issue is in commoncode.findPeak, which contains a lot of
+    # questionable logic.
+    def testGetRegionUsingRDS(self):
+        rds = readDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
+        rds.insertMetadata([("readsize", "100")])
+        rdsEntryList = [("testRead", "chr1", 10, 100, "+", 1.0, "", "")]
+        rds.insertUniqs(rdsEntryList)
+        self.assertEquals({}, getfasta.getRegionUsingRDS(self.regionDict, rds, self.minHitThresh, self.maxsize))
+
+        regionDict = {"1": [],
+                      "2": []
+        }
+        result = {"2": [],
+                  "1": []
+        }
+        self.assertEquals(result, getfasta.getRegionUsingRDS(regionDict, rds, self.minHitThresh, self.maxsize))
+
+        # Ack with a capital ACK.
+        regionDict = {"1": [(1, 600, 5)],
+                      "2": []
+        }
+        result = {"1": [{"start": 1, "length": 5, "topPos": [0, 1, 2, 3, 4]}],
+                  "2": []
+        }
+        self.assertEquals(result, getfasta.getRegionUsingRDS(regionDict, rds, self.minHitThresh, self.maxsize))
+
+        del(rds)
+
+
+    def testWriteFastaFile(self):
+        ncregions = {}
+        getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+        for line in open(self.outfilename):
+            self.assertEquals("", line)
+
+        ncregions = {"1": [],
+                     "2": []
+        }
+        getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+        for line in open(self.outfilename):
+            self.assertEquals("", line)
+
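+        # writeFastaFile appears to write roughly 100 bp of sequence centered on
+        # start + topPos, so start=12000 with topPos=[6] yields the header
+        # chr1:11956-12057 below.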
+        ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]}],
+                     "2": []
+        }
+        getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+        fastaFile = open(self.outfilename)
+        self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+        self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+        self.assertEquals("", fastaFile.readline())
+
+        ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]}],
+                     "2": [{"start": 18000, "length": 50, "topPos": [30]}]
+        }
+        getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+        fastaFile = open(self.outfilename)
+        self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+        self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+        self.assertEquals(">chr2:17980-18081\n", fastaFile.readline())
+        self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline())
+        self.assertEquals("", fastaFile.readline())
+
+        ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6, 20]}],
+                     "2": [{"start": 18000, "length": 50, "topPos": [30]}]
+        }
+        getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+        fastaFile = open(self.outfilename)
+        self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+        self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+        self.assertEquals(">chr2:17980-18081\n", fastaFile.readline())
+        self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline())
+        self.assertEquals("", fastaFile.readline())
+
+        ncregions = {"1": [{"start": 12000, "length": 50, "topPos": [6]},
+                           {"start": 15000, "length": 50, "topPos": [2]}
+                          ],
+                     "2": [{"start": 18000, "length": 50, "topPos": [30]}]
+        }
+        getfasta.writeFastaFile(ncregions, "hsapiens", self.outfilename)
+        fastaFile = open(self.outfilename)
+        self.assertEquals(">chr1:11956-12057\n", fastaFile.readline())
+        self.assertEquals("tcatagtcccctggccccattaatggattctgggatagacatgaggaccaagccaggTGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGA\n", fastaFile.readline())
+        self.assertEquals(">chr1:14952-15053\n", fastaFile.readline())
+        self.assertEquals("AGTGAATGAGGGAAAGGGCAGGGCCCGGGACTGGGGAATCTGTAGGGTCAATGGAGGAGTTCAGAGAAGGTGCAACATTTCTGACCCCCTACAAGGTGCTT\n", fastaFile.readline())
+        self.assertEquals(">chr2:17980-18081\n", fastaFile.readline())
+        self.assertEquals("ATCATTTCAAGGATGCTTTGAGGGTAAAAAGAATGATCAATTGTGAAGCAGTGAATTGTGCTGCCAGGCACAATTCATTGGGTAATAGAAAGCTTCATTTA\n", fastaFile.readline())
+        self.assertEquals("", fastaFile.readline())
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestGetFasta))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testGetNovelSNPs.py b/test/testGetNovelSNPs.py
new file mode 100644 (file)
index 0000000..93865d5
--- /dev/null
@@ -0,0 +1,33 @@
+'''
+Created on Aug 26, 2010
+
+@author: sau
+'''
+import unittest
+
+
+class TestGetNovelSNPs(unittest.TestCase):
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testName(self):
+        pass
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestGetNovelSNPs))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testGetSNPGeneInfo.py b/test/testGetSNPGeneInfo.py
new file mode 100644 (file)
index 0000000..ed33674
--- /dev/null
@@ -0,0 +1,131 @@
+'''
+Created on Aug 26, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import getSNPGeneInfo
+
+
+class TestGetSNPGeneInfo(unittest.TestCase):
+
+
+    def setUp(self):
+        self.geneDict = {}
+        self.snpDict = {}
+        self.rpkmDict = {}
+        self.withSense = False
+
+
+    def tearDown(self):
+        pass
+
+
+    def testDoNotProcessLine(self):
+        self.assertTrue(getSNPGeneInfo.doNotProcessLine("#anything"))
+        self.assertFalse(getSNPGeneInfo.doNotProcessLine("line to process"))
+
+
+    def testGetSNPGeneInfoList(self):
+        geneInfoList = getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)
+        self.assertEquals([], geneInfoList)
+
+        badGeneDict = {"badEntry": "foo"}
+        self.assertRaises(ValueError, getSNPGeneInfo.getSNPGeneInfoList, badGeneDict, self.snpDict, self.rpkmDict, self.withSense)
+
+        self.geneDict[("gene1", "ID1")] = {"position": [("1", 1)], "sense": "+"}
+        self.assertRaises(KeyError, getSNPGeneInfo.getSNPGeneInfoList, self.geneDict, self.snpDict, self.rpkmDict, self.withSense)
+
+        self.snpDict[("1", 1)] = "chr1\tpos 1\n"
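+        # With no entry for "ID1" in rpkmDict yet, the rpkm field falls back to "N\A".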
+        result = [{"symbol": "gene1",
+                   "rpkm": "N\\A",
+                   "geneID": "ID1",
+                   "snpDescription": "chr1\tpos 1" }
+        ]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+        self.rpkmDict["ID1"] = 300
+        result = [{"symbol": "gene1",
+                   "rpkm": "300",
+                   "geneID": "ID1",
+                   "snpDescription": "chr1\tpos 1" }
+        ]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+        self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 1)], "sense": "+"}
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+        result = [{"symbol": "gene1",
+                   "sense": "+",
+                   "rpkm": "300",
+                   "geneID": "ID1",
+                   "snpDescription": "chr1\tpos 1" }
+        ]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, True))
+
+        self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 10)], "sense": "+"}
+        self.snpDict[("1", 10)] = "chr1\tpos 10\n"
+        result = [{"symbol": "gene1",
+                   "rpkm": "300",
+                   "geneID": "ID1",
+                   "snpDescription": "chr1\tpos 10" },
+                  {"symbol": "gene1",
+                   "rpkm": "300",
+                   "geneID": "ID1",
+                   "snpDescription": "chr1\tpos 1" }
+        ]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneInfoList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+
+    #TODO: write test
+    def testGetSNPGeneInfo(self):
+        pass
+
+
+    def testGetSNPGeneOutputList(self):
+        geneOutputList = getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense)
+        self.assertEquals([], geneOutputList)
+
+        badGeneDict = {"badEntry": "foo"}
+        self.assertRaises(ValueError, getSNPGeneInfo.getSNPGeneOutputList, badGeneDict, self.snpDict, self.rpkmDict, self.withSense)
+
+        self.geneDict[("gene1", "ID1")] = {"position": [("1", 1)], "sense": "+"}
+        self.assertRaises(KeyError, getSNPGeneInfo.getSNPGeneOutputList, self.geneDict, self.snpDict, self.rpkmDict, self.withSense)
+
+        self.snpDict[("1", 1)] = "chr1\tpos 1\n"
+        result = ["chr1\tpos 1\tgene1\tID1\tN\\A"]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+        self.rpkmDict["ID1"] = 300
+        result = ["chr1\tpos 1\tgene1\tID1\t300"]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+        self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 1)], "sense": "+"}
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+        result = ["chr1\tpos 1\tgene1\tID1\t300\t+"]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, True))
+
+        self.geneDict[("gene1", "ID1")] = {"position": [("1", 1), ("1", 10)], "sense": "+"}
+        self.snpDict[("1", 10)] = "chr1\tpos 10\n"
+        result = ["chr1\tpos 10\tgene1\tID1\t300",
+                  "chr1\tpos 1\tgene1\tID1\t300"
+        ]
+        self.assertEquals(result, getSNPGeneInfo.getSNPGeneOutputList(self.geneDict, self.snpDict, self.rpkmDict, self.withSense))
+
+
+    #TODO: write test
+    def testWriteSNPGeneInfo(self):
+        pass
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestGetSNPGeneInfo))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testGetSNPs.py b/test/testGetSNPs.py
new file mode 100644 (file)
index 0000000..68ef8c0
--- /dev/null
@@ -0,0 +1,84 @@
+'''
+Created on Jun 4, 2010
+
+@author: sau
+'''
+import os, unittest
+from Erange.commoncode import readDataset
+from Erange import getSNPs
+
+
+class TestGetSNPs(unittest.TestCase):
+
+    def setUp(self):
+        self.rdsDNA = readDataset("testDNARDSForUnitTests.rds", True, "DNA", verbose=True)
+
+        uniqueInsertList = [("uniqueID1", "chr1", 10, 20, "+", 1.0, "", ""),
+                            ("uniqueID2", "chr1", 100, 200, "+", 1.0, "", ""),
+                            ("uniqueID3", "chr1", 1000, 2000, "+", 1.0, "", "G10A")]
+
+        multiInsertList = [("multiID1", "chr2", 1010, 1020, "+", 0.5, "", ""),
+                           ("multiID1", "chr2", 1010, 1020, "+", 0.5, "", ""),
+                           ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""),
+                           ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""),
+                           ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", ""),
+                           ("multiID2", "chr2", 10100, 10200, "+", 0.25, "", "")]
+
+        self.rdsDNA.insertUniqs(uniqueInsertList)
+        self.rdsDNA.insertMulti(multiInsertList)
+
+
+    def tearDown(self):
+        os.remove("./testDNARDSForUnitTests.rds")
+        self.rdsDNA = None
+
+
+    def testGetMatchDict(self):
+        uniqueTestDict = getSNPs.getMatchDict(self.rdsDNA, "chr1", withSplices=False)
+
+        self.assertEqual(uniqueTestDict[10][0], 20, "incorrect result for unique chr position 10")
+        self.assertEqual(uniqueTestDict[100][0], 200, "incorrect result for unique chr position 100")
+        self.assertEqual(uniqueTestDict[1000][0], 2000, "incorrect result for unique chr position 1000")
+
+        self.assertRaises(KeyError, getSNPs.getMatchDict, self.rdsDNA, "chr2", withSplices=False)
+
+
+    def testGetMismatchDict(self):
+        mismatchDict = getSNPs.getMismatchDict(self.rdsDNA, "chr1")
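+        # The unique read spanning 1000-2000 carries the mismatch tag "G10A"; it is
+        # apparently reported as an "A-G" change at position 1009 (read start + offset - 1),
+        # with "back" recording the read start.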
+        result = {1009: {"totalBaseDict": {"A-G": 1},
+                         "uniqueReadCount": 1,
+                         "uniqBaseDict": {"A-G": 1},
+                         "back": "1000:A-G", "totalCount": 1
+                         }
+        }
+        self.assertEquals(result, mismatchDict)
+
+
+    #TODO: write unit test
+    def testGetSNPs(self):
+        pass
+
+
+    #TODO: write unit test
+    def testWriteSNPsToFile(self):
+        pass
+
+
+    def testDoNotProcessChromosome(self):
+        self.assertFalse(getSNPs.doNotProcessChromosome(True, "chr1"))
+        self.assertFalse(getSNPs.doNotProcessChromosome(False, "chr1"))
+        self.assertFalse(getSNPs.doNotProcessChromosome(False, "badName"))
+        self.assertTrue(getSNPs.doNotProcessChromosome(True, "badName"))
+        self.assertTrue(getSNPs.doNotProcessChromosome(True, ""))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestGetSNPs))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeBamFromRds.py b/test/testMakeBamFromRds.py
new file mode 100644 (file)
index 0000000..8c0df53
--- /dev/null
@@ -0,0 +1,38 @@
+'''
+Created on Jun 4, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import MakeBamFromRds
+
+
+class TestMakeBamFromRds(unittest.TestCase):
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testGetMismatches(self):
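+        # getMismatches appears to convert ERANGE-style entries ("A3G" = reference A at
+        # offset 3 read as G) into an MD-like string keeping the offset and reference base.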
+        mismatchString = "3A10T"
+        self.assertEqual(mismatchString, MakeBamFromRds.getMismatches("A3G, T10A"))
+
+        mismatchString = ""
+        self.assertEqual(mismatchString, MakeBamFromRds.getMismatches(""))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeBamFromRds))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeGraphs.py b/test/testMakeGraphs.py
new file mode 100644 (file)
index 0000000..567e5d2
--- /dev/null
@@ -0,0 +1,83 @@
+'''
+Created on Jul 28, 2010
+
+@author: sau
+'''
+
+import os, unittest
+from Erange import makeGraphs
+
+testFileName = "/tmp/testEdgeFileForUnitTests.txt"
+
+class TestMakeGraphs(unittest.TestCase):
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testGetEdges(self):
+        nodeList = []
+        self.assertEquals({}, makeGraphs.getEdges(nodeList))
+
+        nodeEntry = "ex_node1\tex_node2\t1"
+        nodeList.append(nodeEntry)
+        result = {"ex_node1": [("ex_node2", 1)],
+                  "ex_node2": [("ex_node1", 1)]}
+        self.assertEquals(result, makeGraphs.getEdges(nodeList))
+
+        nodeEntry = "ex_node1\tex_node3\t2"
+        nodeList.append(nodeEntry)
+        result = {"ex_node1": [("ex_node2", 1), ("ex_node3", 2)],
+                  "ex_node2": [("ex_node1", 1)],
+                  "ex_node3": [("ex_node1", 2)]
+        }
+        self.assertEquals(result, makeGraphs.getEdges(nodeList))
+
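+        # shorten=True apparently strips everything up to and including the first
+        # underscore from node names ("ex_node1" -> "node1"); names without an
+        # underscore, such as "ex:node1" below, are left unchanged.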
+        result = {"node1": [("node2", 1), ("node3", 2)],
+                  "node2": [("node1", 1)],
+                  "node3": [("node1", 2)]
+        }
+        self.assertEquals(result, makeGraphs.getEdges(nodeList, shorten=True))
+
+        nodeEntry = "ex:node1\tex:node2\t1"
+        nodeList = [nodeEntry]
+        result = {"ex:node1": [("ex:node2", 1)],
+                  "ex:node2": [("ex:node1", 1)]}
+        self.assertEquals(result, makeGraphs.getEdges(nodeList, shorten=True))
+
+        nodeEntry = "badLine"
+        nodeList = [nodeEntry]
+        self.assertEquals({}, makeGraphs.getEdges(nodeList))
+        nodeEntry = "node1\tnode2\t1"
+        nodeList.append(nodeEntry)
+        result = {"node1": [("node2", 1)],
+                  "node2": [("node1", 1)]}
+        self.assertEquals(result, makeGraphs.getEdges(nodeList))
+
+
+    def testGetEdgesFromFile(self):
+        self.edgeFile = open(testFileName, "w")
+        self.edgeFile.write("node1\tnode2\t1")
+        self.edgeFile.close()
+
+        result = {"node1": [("node2", 1)],
+                  "node2": [("node1", 1)]}
+        self.assertEquals(result, makeGraphs.getEdgesFromFile(testFileName))
+
+        os.remove(testFileName)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeGraphs))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeRdsFromBam.py b/test/testMakeRdsFromBam.py
new file mode 100644 (file)
index 0000000..4b4da1a
--- /dev/null
@@ -0,0 +1,66 @@
+'''
+Created on Jun 10, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import MakeRdsFromBam
+
+class TestMakeRdsFromBam(unittest.TestCase):
+
+
+    def testGetSpliceBounds(self):
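+        # The cigar tuples follow pysam's (operation, length) convention; the middle
+        # (3, 6) entry is presumably the N (skipped region) operation, so the read from
+        # 0 to 10 splits into a left piece ending at 2 and a right piece starting at 8.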
+        start, startR, stopL, stopR = MakeRdsFromBam.getSpliceBounds(0, 10, [(1,2), (3,6), (1,2)])
+
+        self.assertEqual(start, 0, "incorrect start position for 262")
+        self.assertEqual(startR, 8, "incorrect right start position for 262")
+        self.assertEqual(stopL, 2, "incorrect left stop position for 262")
+        self.assertEqual(stopR, 10, "incorrect right stop position for 262")
+
+
+    def testGetMismatches(self):
+        querySequence = "GATTACA"
+        
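+        # Here "2A" is an MD-style tag: two matching bases then a reference A; the read
+        # base at that position in GATTACA is T, giving "A3T" (1-based position).  On the
+        # minus strand both bases are complemented, and "^" deletion segments are skipped.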
+        resultString = "A3T"
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", querySequence, "+"))
+        resultString = "T3A"
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", querySequence, "-"))
+        resultString = "T7A"
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("6T", querySequence, "+"))
+
+        resultString = "A3T,T7A"
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A3T0", querySequence, "+"))
+
+        resultString = ""
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^T", querySequence, "+"))
+
+        resultString = "T5A"
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^TT2T", querySequence, "+"))
+        resultString = "A5T"
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2^TT2T", querySequence, "-"))
+
+        resultString = "A3N"
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A", "", "+"))
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("2A"))
+
+        resultString = ""
+        self.assertEquals(resultString, MakeRdsFromBam.getMismatches("badMismatchTagData", querySequence, "+"))
+
+
+    def testIsSpliceEntry(self):
+        self.assertTrue(MakeRdsFromBam.isSpliceEntry([(1,6), (3, 4), (1, 2)]))
+        self.assertFalse(MakeRdsFromBam.isSpliceEntry([(1,6), (2, 4), (1, 2)]))
+        self.assertFalse(MakeRdsFromBam.isSpliceEntry([]))
+        self.assertFalse(MakeRdsFromBam.isSpliceEntry(""))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeRdsFromBam))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeSNPTrack.py b/test/testMakeSNPTrack.py
new file mode 100644 (file)
index 0000000..b52b546
--- /dev/null
@@ -0,0 +1,85 @@
+'''
+Created on Aug 25, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import makeSNPtrack
+
+
+class TestMakeSNPTrack(unittest.TestCase):
+
+    baseColor = {"A": "200, 0, 255",
+                 "T": "200, 0, 255",
+                 "C": "200, 0, 255",
+                 "G": "200, 0, 255"
+    }
+
+    specialColors = {"A-G": "255, 0, 0",
+                     "T-C": "0, 0, 255"
+    }
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testGetHeader(self):
+        track = "test track"
+        header = "track name=%s description=%s visibility=2 itemRgb=\"On\"\n" % (track, track)
+        self.assertEquals(header, makeSNPtrack.getHeader(track))
+
+
+    def testDoNotProcessLine(self):
+        self.assertTrue(makeSNPtrack.doNotProcessLine("#anything"))
+        self.assertFalse(makeSNPtrack.doNotProcessLine("line to process"))
+
+
+    def testGetBedOutputLine(self):
+        chromosome = "chr1"
+        readStart = 10
+        readStop = 11
+        readName = "A"
+        score = "0"
+        sense = "+"
+        color = self.baseColor[readName]
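+        # getBedOutputLine appears to take the chromosome from field 2, the 1-based
+        # position from field 3 (written as a pos-1..pos BED interval) and the observed
+        # base from field 7 of the SNP properties list.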
+        snpPropertiesList = ["0", "1", chromosome, 11, "4", "5", "6", readName]
+        outline = "%s\t%d\t%d\t%s\t%s\t%s\t-\t-\t\t%s\n" % (chromosome, readStart, readStop, readName, score, sense, color)
+        self.assertEquals(outline, makeSNPtrack.getBedOutputLine(snpPropertiesList))
+
+        snpPropertiesList = ["0", "1", chromosome, 11, "4", "5", "6"]
+        self.assertRaises(IndexError, makeSNPtrack.getBedOutputLine, snpPropertiesList)
+
+        snpPropertiesList = []
+        self.assertRaises(IndexError, makeSNPtrack.getBedOutputLine, snpPropertiesList)
+
+        snpPropertiesList = ["0", "1", chromosome, "some string", "4", "5", "6", readName]
+        self.assertRaises(ValueError, makeSNPtrack.getBedOutputLine, snpPropertiesList)
+
+
+    def testGetSNPColor(self):
+        for base in self.baseColor.keys():
+            self.assertEquals(self.baseColor[base], makeSNPtrack.getSNPColor(base))
+
+        for base in self.specialColors.keys():
+            self.assertEquals(self.specialColors[base], makeSNPtrack.getSNPColor(base))
+
+        defaultColor = "200, 0, 255"
+        self.assertEquals(defaultColor, makeSNPtrack.getSNPColor(""))
+        self.assertEquals(defaultColor, makeSNPtrack.getSNPColor("V"))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeSNPTrack))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMarkLinkers.py b/test/testMarkLinkers.py
new file mode 100644 (file)
index 0000000..775b2e0
--- /dev/null
@@ -0,0 +1,142 @@
+'''
+Created on Sep 15, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange.chiapet import markLinkers
+
+
+class TestMarkLinkers(unittest.TestCase):
+    linkerFileName = "/Users/sau/Eclipse/erange/source/Erange/chiapet/linkers.fa"
+    inFileName = "linkerTestIn.txt"
+    outFileName = "linkerTestOut.txt"
+
+    def setUp(self):
+        infile = open(self.inFileName, "w")
+        infile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.inFileName)
+        except OSError:
+            pass
+
+        try:
+            os.remove(self.outFileName)
+        except OSError:
+            pass
+
+
+    def testMarkLinkers(self):
+        markLinkers.markLinkers(self.linkerFileName, self.inFileName, self.outFileName)
+        output = open(self.outFileName)
+        for line in output:
+            self.assertEquals("", line)
+
+        output.close()
+        os.remove(self.outFileName)
+
+        infile = open(self.inFileName, "w")
+        print >> infile, ""
+        print >> infile, "@Linker1"
+        print >> infile, "........................GTTGGATAAGATATCGCGG....."
+        print >> infile, "@NoLinker"
+        print >> infile, "GATTACA.GATTACA.GATTACA.GATTACA.GATTACA.GATTACA."
+        print >> infile, "@Linker2"
+        print >> infile, "........................GTTGGAATGTATATCGCGG....."
+        print >> infile, "@Linker1Short"
+        print >> infile, "..............GTTGGAATGTATATCGCGG..............."
+        print >> infile, "@Linker2Short"
+        print >> infile, "..............GTTGGAATGTATATCGCGG..............."
+        infile.close()
+
+        markLinkers.markLinkers(self.linkerFileName, self.inFileName, self.outFileName)
+        output = open(self.outFileName)
+        self.assertEquals(">L1_Linker1\n", output.readline())
+        self.assertEquals("....................\n", output.readline())
+        self.assertEquals(">NA_NoLinker\n", output.readline())
+        self.assertEquals("GATTACA.GATTACA.GATT\n", output.readline())
+        self.assertEquals(">NA_NoLinker\n", output.readline())
+        self.assertEquals("GATTACA.GATTACA.GATT\n", output.readline())
+        self.assertEquals(">NA_Linker2\n", output.readline())
+        self.assertEquals("....................\n", output.readline())
+        self.assertEquals(">L2_Linker2\n", output.readline())
+        self.assertEquals("....................\n", output.readline())
+        self.assertEquals(">NA_Linker1Short\n", output.readline())
+        self.assertEquals("..............GTTGGA\n", output.readline())
+        self.assertEquals(">NA_Linker1Short\n", output.readline())
+        self.assertEquals("..............GTTGGA\n", output.readline())
+        self.assertEquals(">NA_Linker2Short\n", output.readline())
+        self.assertEquals("..............GTTGGA\n", output.readline())
+        self.assertEquals(">NA_Linker2Short\n", output.readline())
+        self.assertEquals("..............GTTGGA\n", output.readline())
+
+        output.close()
+        #TODO: Check that we really do want to output the same line
+        #multiple times when neither linker is detected.
+        #See if there is a real reason downstream for doing it this way,
+        #or if downstream code is simply working around a bug introduced
+        #at this stage of the analysis.
+
+
+    def testGetLinkerInformation(self):
+        linkerDict, linkerList = markLinkers.getLinkerInformation([])
+        resultDict = {}
+        resultList = []
+        self.assertEquals(resultDict, linkerDict)
+        self.assertEquals(resultList, linkerList)
+
+        linkerData = [">linker_b.1",
+                      "GTTGGATAAGATATCGCGG",
+                      ">linker_b.2",
+                      "GTTGGAATGTATATCGCGG"
+        ]
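+        # Only the first 10 bases of each linker sequence appear to be kept as the
+        # linker signature.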
+        linkerDict, linkerList = markLinkers.getLinkerInformation(linkerData)
+        resultDict = {"linker_b.1": "GTTGGATAAG",
+                      "linker_b.2": "GTTGGAATGT"
+        }
+        resultList = ["linker_b.1", "linker_b.2"]
+        self.assertEquals(resultDict, linkerDict)
+        self.assertEquals(resultList, linkerList)
+
+
+    def testGetLinkerInformationFromFile(self):
+        linkerDict, linkerList = markLinkers.getLinkerInformationFromFile("bad file name")
+        resultDict = {}
+        resultList = []
+        self.assertEquals(resultDict, linkerDict)
+        self.assertEquals(resultList, linkerList)
+
+        linkerDict, linkerList = markLinkers.getLinkerInformationFromFile(self.linkerFileName)
+        resultDict = {"linker_b.1": "GTTGGATAAG",
+                      "linker_b.2": "GTTGGAATGT"
+        }
+        resultList = ["linker_b.1", "linker_b.2"]
+        self.assertEquals(resultDict, linkerDict)
+        self.assertEquals(resultList, linkerList)
+
+
+    def testMain(self):
+        argv = ["markLinkers", self.linkerFileName, self.inFileName, self.outFileName]
+        markLinkers.main(argv)
+        output = open(self.outFileName)
+        for line in output:
+            self.assertEquals("", line)
+
+        output.close()
+        os.remove(self.outFileName)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMarkLinkers))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testPeaksToRegion.py b/test/testPeaksToRegion.py
new file mode 100644 (file)
index 0000000..dd16d86
--- /dev/null
@@ -0,0 +1,79 @@
+'''
+Created on Oct 4, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import peakstoregion
+
+inFileName = "testPeaksToRegionInFile.txt"
+outFileName = "testPeaksToRegionOutFile.txt"
+
+
+class TestPeaksToRegion(unittest.TestCase):
+
+
+    def setUp(self):
+        self.inFile = open(inFileName, "w")
+        self.inFile.write("stuff\tpeak1\tchr1\t1000\t1.3\n")
+        self.inFile.write("stuff\tpeak2\tchr1\t800\t9.7\n")
+        self.inFile.write("stuff\tpeak3\tchr2\t1000\t3.0\n")
+        self.inFile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(outFileName)
+        except OSError:
+            pass
+
+        try:
+            os.remove(inFileName)
+        except OSError:
+            pass
+
+
+    def testPeaksToRegion(self):
+        peakstoregion.peakstoregion(inFileName, outFileName)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
+        self.assertEquals(3, len(results))
+        self.assertEquals("peak1\tchr1\t500\t1500\t1.3\n", results[0])
+        self.assertEquals("peak2\tchr1\t300\t1300\t9.7\n", results[1])
+        self.assertEquals("peak3\tchr2\t500\t1500\t3.0\n", results[2])
+
+
+    def testMain(self):
+        argv = ["peakstoregion", inFileName, outFileName]
+        peakstoregion.main(argv)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
+        self.assertEquals(3, len(results))
+        self.assertEquals("peak1\tchr1\t500\t1500\t1.3\n", results[0])
+        self.assertEquals("peak2\tchr1\t300\t1300\t9.7\n", results[1])
+        self.assertEquals("peak3\tchr2\t500\t1500\t3.0\n", results[2])
+
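+        # The extra arguments presumably set the region half-width (600 instead of the
+        # default 500 used above) and the input columns holding the peak name,
+        # chromosome, position and score.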
+        argv = ["peakstoregion", inFileName, outFileName, 600, 2, 3, 1, -1]
+        peakstoregion.main(argv)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
+        self.assertEquals(3, len(results))
+        self.assertEquals("peak1\tchr1\t400\t1600\t1.3\n", results[0])
+        self.assertEquals("peak2\tchr1\t200\t1400\t9.7\n", results[1])
+        self.assertEquals("peak3\tchr2\t400\t1600\t3.0\n", results[2])
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestPeaksToRegion))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testProcessVelvet.py b/test/testProcessVelvet.py
new file mode 100644 (file)
index 0000000..99ef5a3
--- /dev/null
@@ -0,0 +1,236 @@
+'''
+Created on Sep 15, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange.rnapath import processvelvet
+
+
+class TestProcessVelvet(unittest.TestCase):
+    inFileName = "testProcessVelvetIn.txt"
+    filterFileName = "testProcessVelvetFilter.txt"
+    outFileName = "testProcessVelvetOut.txt"
+
+
+    def setUp(self):
+        infile = open(self.inFileName, "w")
+        infile.close()
+        filter = open(self.filterFileName, "w")
+        filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODE1-1_0\n")
+        filter.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.inFileName)
+        except OSError:
+            pass
+
+        try:
+            os.remove(self.filterFileName)
+        except OSError:
+            pass
+
+        try:
+            os.remove(self.outFileName)
+        except OSError:
+            pass
+
+
+    def testProcessVelvet(self):
+        processvelvet.processvelvet(self.inFileName, self.outFileName)
+        outfile = open(self.outFileName)
+        for line in outfile:
+            self.assertEquals("", line)
+
+        os.remove(self.outFileName)
+
+        infile = open(self.inFileName, "w")
+        print >> infile, ">NODE1-1_0"
+        print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        infile.close()
+
+        processvelvet.processvelvet(self.inFileName, self.outFileName)
+        outfile = open(self.outFileName)
+        self.assertEquals(">chr0\n", outfile.readline())
+        self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        os.remove(self.outFileName)
+        processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName)
+        outfile = open(self.outFileName)
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        os.remove(self.outFileName)
+
+        infile = open(self.inFileName, "w")
+        print >> infile, ">NODE1-1_1"
+        print >> infile, "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+        print >> infile, ">NODE1-1_0"
+        print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        infile.close()
+
+        processvelvet.processvelvet(self.inFileName, self.outFileName)
+        outfile = open(self.outFileName)
+        self.assertEquals(">chr1\n", outfile.readline())
+        self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+        self.assertEquals(">chr0\n", outfile.readline())
+        self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        os.remove(self.outFileName)
+        processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName)
+        outfile = open(self.outFileName)
+        self.assertEquals(">chr1\n", outfile.readline())
+        self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        os.remove(self.outFileName)
+
+        infile = open(self.inFileName, "w")
+        print >> infile, ">NODE1-1_1"
+        print >> infile, "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+        print >> infile, ">NODE1-1_0"
+        print >> infile, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        print >> infile, ">NODE1-1_2"
+        print >> infile, "GATTACA"
+        infile.close()
+
+        processvelvet.processvelvet(self.inFileName, self.outFileName)
+        outfile = open(self.outFileName)
+        self.assertEquals(">chr1\n", outfile.readline())
+        self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+        self.assertEquals(">chr0\n", outfile.readline())
+        self.assertEquals("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", outfile.readline())
+        self.assertEquals(">chr2\n", outfile.readline())
+        self.assertEquals("GATTACA\n", outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        os.remove(self.outFileName)
+        processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName)
+        outfile = open(self.outFileName)
+        self.assertEquals(">chr1\n", outfile.readline())
+        self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+        self.assertEquals(">chr2\n", outfile.readline())
+        self.assertEquals("GATTACA\n", outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        os.remove(self.outFileName)
+        processvelvet.processvelvet(self.inFileName, self.outFileName, filterFileName=self.filterFileName, minSize=10)
+        outfile = open(self.outFileName)
+        self.assertEquals(">chr1\n", outfile.readline())
+        self.assertEquals("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n", outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+
+
+    def testGetFilterList(self):
+        self.assertEquals([], processvelvet.getFilterList())
+        self.assertEquals(["NODE1-1_0"], processvelvet.getFilterList(self.filterFileName))
+        self.assertEquals([], processvelvet.getFilterList("whatfile?"))
+
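+        # getFilterList apparently collects the tenth tab-separated field of each line
+        # when that field starts with "NODE", ignoring other lines and duplicates.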
+        filter = open(self.filterFileName, "a")
+        filter.write("some fields without the key trigger string\n")
+        filter.close()
+        self.assertEquals(["NODE1-1_0"], processvelvet.getFilterList(self.filterFileName))
+
+        filter = open(self.filterFileName, "a")
+        filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODEAnything\n")
+        filter.close()
+        self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName))
+
+        filter = open(self.filterFileName, "a")
+        filter.write("0\tNODEWrongField\n")
+        filter.close()
+        self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName))
+
+        filter = open(self.filterFileName, "a")
+        filter.write("0\t1\t2\t3\t4\t5\t6\t7\t8\tNODEAnything\n")
+        filter.close()
+        self.assertEquals(["NODE1-1_0", "NODEAnything"], processvelvet.getFilterList(self.filterFileName))
+
+
+    def testWriteNode(self):
+        node = {"contigPrefix": "chr",
+                "completeID": "",
+                "currentSeq": ""
+        }
+
+        counts = {"acceptedSize": 0,
+                  "nSize": 0,
+                  "contigsAccepted": 0,
+                  "filteredSize": 0
+        }
+
+        filterList = []
+
+        outfile = open(self.outFileName, "w")
+        processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+        outfile.close()
+        results = open(self.outFileName)
+        self.assertEquals("", results.readline())
+        results.close()
+        os.remove(self.outFileName)
+
+        node["completeID"] = "<5"
+        node["currentSeq"] = "GATTACA\n"
+        outfile = open(self.outFileName, "w")
+        processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+        self.assertEquals(counts["filteredSize"], 7)
+        counts["filteredSize"] = 0
+        outfile.close()
+        results = open(self.outFileName)
+        self.assertEquals("", results.readline())
+        results.close()
+        os.remove(self.outFileName)
+
+        node["completeID"] = "NODE1_1"
+        node["currentSeq"] = "GATTACA\n"
+        filterList = ["NODE1_1"]
+        outfile = open(self.outFileName, "w")
+        processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+        self.assertEquals(counts["filteredSize"], 7)
+        counts["filteredSize"] = 0
+        outfile.close()
+        results = open(self.outFileName)
+        self.assertEquals("", results.readline())
+        results.close()
+        os.remove(self.outFileName)
+
+        node["completeID"] = "NODE1_1"
+        node["currentSeq"] = "GATTACA\n"
+        filterList = []
+        outfile = open(self.outFileName, "w")
+        processvelvet.writeNode(outfile, node, filterList, counts, minSize=0, keepCoverage=False)
+        self.assertEquals(counts["acceptedSize"], 7)
+        outfile.close()
+        results = open(self.outFileName)
+        self.assertEquals(">chr1\n", results.readline())
+        self.assertEquals("GATTACA\n", results.readline())
+        self.assertEquals("", results.readline())
+        results.close()
+        os.remove(self.outFileName)
+
+
+    def testMain(self):
+        argv = ["processVelvet", self.inFileName, self.outFileName]
+        processvelvet.main(argv)
+        outfile = open(self.outFileName)
+        for line in outfile:
+            self.assertEquals("", line)
+
+        os.remove(self.outFileName)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestProcessVelvet))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testRNAPATH.py b/test/testRNAPATH.py
new file mode 100644 (file)
index 0000000..e217ff1
--- /dev/null
@@ -0,0 +1,210 @@
+'''
+Created on Sep 10, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange.rnapath import RNAPATH
+
+compDict = {"A": "T",
+            "T": "A",
+            "G": "C",
+            "C": "G",
+            "S": "S",
+            "W": "W",
+            "R": "Y",
+            "Y": "R",
+            "M": "K",
+            "K": "M",
+            "H": "D",
+            "D": "H",
+            "B": "V",
+            "V": "B",
+            "N": "N",
+            "a": "t",
+            "t": "a",
+            "g": "c",
+            "c": "g",
+            "n": "n",
+            "z": "z"
+}
+
+
+class TestRNAPATH(unittest.TestCase):
+    incontigfilename = "contigIn.txt"
+    distalPairsfile = "distalPair.txt"
+    outpathfilename = "rnapathOut.txt"
+    outcontigfilename = "contigOut.txt"
+
+    def setUp(self):
+        inContigs = open(self.incontigfilename, "w")
+        inContigs.close()
+
+        distal = open(self.distalPairsfile, "w")
+        distal.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.incontigfilename)
+        except OSError:
+            pass
+
+        try:
+            os.remove(self.distalPairsfile)
+        except OSError:
+            pass
+
+        try:
+            os.remove(self.outpathfilename)
+        except OSError:
+            pass
+
+        try:
+            os.remove(self.outcontigfilename)
+        except OSError:
+            pass
+
+
+    def testCompNT(self):
+        for nt in compDict.keys():
+            self.assertEquals(compDict[nt], RNAPATH.compNT(nt))
+
+        self.assertEquals("N", RNAPATH.compNT("5"))
+        self.assertEquals("N", RNAPATH.compNT("anything"))
+
+
+    def testComplement(self):
+        self.assertEquals("", RNAPATH.complement(""))
+        for nt in compDict.keys():
+            self.assertEquals(compDict[nt], RNAPATH.complement(nt))
+
+        self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA"))
+        self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA", 7))
+        self.assertEquals("TGTAATC", RNAPATH.complement("GATTACA", -75632))
+        self.assertEquals("TGTA", RNAPATH.complement("GATTACA", 4))
+
+        #TODO: do we want this return when length > seqlength?  This is
+        # the current behavior and it seems very wrong: we only N-fill
+        # after going more than seqlength in the negative direction
+        self.assertEquals("TGTAATCTG", RNAPATH.complement("GATTACA", 9))
+        self.assertEquals("TGTAATCTGTAATCNNNNN", RNAPATH.complement("GATTACA", 19))
+
+    #TODO: write test
+    def testRnaPath(self):
+        RNAPATH.rnaPath(self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename)
+        outfile = open(self.outpathfilename)
+        self.assertTrue("#settings:" in outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        outcontig = open(self.outcontigfilename)
+        self.assertEquals(0, len(outcontig.readlines()))
+        outcontig.close()
+
+        #infile = open(self.incontigfilename, "w")
+        #infile.write(">chr1 stuff\n")
+        #infile.write("GATTACA\n")
+        #infile.close()
+        #RNAPATH.rnaPath(self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename)
+        #outfile = open(self.outpathfilename)
+        #self.assertTrue("#settings:" in outfile.readline())
+        #self.assertEquals("", outfile.readline())
+        #outfile.close()
+
+
+    #TODO: write test
+    def testGetPath(self):
+        pass
+
+
+    #TODO: write test
+    def testTraverseGraph(self):
+        leafList = []
+        edgeMatrix = RNAPATH.EdgeMatrix(0)
+        pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix)
+        self.assertEquals([], pathList)
+        self.assertEquals({}, visitedDict)
+
+        leafList = [1]
+        edgeMatrix = RNAPATH.EdgeMatrix(3)
+        edgeMatrix.edgeArray[2][1] = 3
+        edgeMatrix.edgeArray[1][2] = 3
+        pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix)
+        self.assertEquals([ [1, 2] ], pathList)
+        self.assertEquals({1: "", 2: ""}, visitedDict)
+
+        leafList = [1, 2]
+        edgeMatrix = RNAPATH.EdgeMatrix(3)
+        edgeMatrix.edgeArray[2][1] = 3
+        edgeMatrix.edgeArray[1][2] = 3
+        pathList, visitedDict = RNAPATH.traverseGraph(leafList, edgeMatrix)
+        self.assertEquals([ [1, 2] ], pathList)
+        self.assertEquals({1: "", 2: ""}, visitedDict)
+
+
+    #TODO: write test
+    def testGetContigsFromFile(self):
+        contigNum, nameList, contigDict, origSize = RNAPATH.getContigsFromFile(self.incontigfilename)
+        self.assertEquals(0, contigNum)
+        self.assertEquals([], nameList)
+        self.assertEquals({}, contigDict)
+        self.assertEquals([], origSize)
+
+
+    #TODO: check for boundary condition and special cases
+    def testEdgeMatrix(self):
+        edgeMatrix = RNAPATH.EdgeMatrix(0)
+        result = "[]"
+        self.assertEquals(result, str(edgeMatrix.edgeArray))
+
+        edgeMatrix = RNAPATH.EdgeMatrix(3)
+        result = "[[0 0 0]\n [0 0 0]\n [0 0 0]]"
+        self.assertEquals(result, str(edgeMatrix.edgeArray))
+        self.assertEquals([], edgeMatrix.visitLink(0))
+
+        edgeMatrix.edgeArray[0][1] = 1
+        self.assertEquals([], edgeMatrix.visitLink(0))
+
+        edgeMatrix.edgeArray[0][1] = 2
+        result = [0]
+        self.assertEquals(result, edgeMatrix.visitLink(0))
+
+        edgeMatrix.edgeArray[2][1] = 2
+        result = []
+        self.assertEquals(result, edgeMatrix.visitLink(0))
+        edgeMatrix.edgeArray[2][1] = 2
+        result = []
+        self.assertEquals(result, edgeMatrix.visitLink(1))
+        edgeMatrix.edgeArray[2][1] = 2
+        result = [2]
+        self.assertEquals(result, edgeMatrix.visitLink(2))
+
+        edgeMatrix.edgeArray[2][1] = 3
+        edgeMatrix.edgeArray[1][2] = 3
+        result = [1, 2]
+        self.assertEquals(result, edgeMatrix.visitLink(1))
+
+
+    def testMain(self):
+        argv = ["RNAPATH", self.incontigfilename, self.distalPairsfile, self.outpathfilename, self.outcontigfilename]
+        RNAPATH.main(argv)
+        outfile = open(self.outpathfilename)
+        self.assertTrue("#settings:" in outfile.readline())
+        self.assertEquals("", outfile.readline())
+        outfile.close()
+        outcontig = open(self.outcontigfilename)
+        self.assertEquals(0, len(outcontig.readlines()))
+        outcontig.close()
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestRNAPATH))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testReadDataset.py b/test/testReadDataset.py
new file mode 100644 (file)
index 0000000..3ac8f54
--- /dev/null
@@ -0,0 +1,935 @@
+'''
+Created on Jul 21, 2010
+
+@author: sau
+'''
+import unittest
+import os
+import sqlite3 as sqlite
+from Erange import ReadDataset
+
+testDBName = "testRDS.rds"
+rnaTestDBName = "testRDSRNA.rds"
+
+class TestReadDataset(unittest.TestCase):
+
+
+    def setUp(self):
+        self.rds = ReadDataset.ReadDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
+        self.rnaRds = ReadDataset.ReadDataset(rnaTestDBName, initialize=True, datasetType="RNA", verbose=False)
+
+
+    def tearDown(self):
+        del(self.rds)
+        os.remove(testDBName)
+        del(self.rnaRds)
+        os.remove(rnaTestDBName)
+
+
+    #TODO: rename and integrate
+    def testZeeNewStuff(self):
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", ""),
+                        ("dup start", "chr1", 1, 150, "+", 1.0, "", ""),
+                        ("new read", "chr1", 80, 100, "+", 1.0, "", ""),
+                        ("testRead", "chr2", 201, 400, "+", 1.0, "", ""),
+                        ("dup start", "chr2", 201, 450, "+", 1.0, "", ""),
+                        ("new read", "chr2", 280, 400, "+", 1.0, "", ""),
+                        ("three up", "chr3", 1, 80, "+", 1.0, "", ""),
+                        ("three two", "chr3", 201, 230, "+", 1.0, "", "")
+        ]
+        self.rds.insertUniqs(rdsEntryList)
+        dbcon = sqlite.connect(testDBName)
+        sql = dbcon.cursor()
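+        # The raw SQL below mirrors the grouping that getReadsDict(combine5p=True)
+        # is expected to perform: reads sharing the same (chrom, start) 5' position
+        # collapse into a single entry (see the reads dict assertion at the end)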
+        sql.execute("select chrom,start from uniqs group by chrom,start having ( count(start) > 1 and count(chrom) > 1)")
+        result = [("chr1", 1), ("chr2", 201)]
+        for eachEntry in sql.fetchall():
+            self.assertTrue(eachEntry in result)
+
+        sql.execute("select chrom,start from uniqs group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+        result = [("chr1", 80), ("chr2", 280), ("chr3", 1), ("chr3", 201)]
+        for eachEntry in sql.fetchall():
+            self.assertTrue(eachEntry in result)
+
+        sql.execute("select * from uniqs group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union select * from uniqs group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+        result = [(2, "dup start", "chr1", 1, 150, "+", 1.0, "", ""),
+                  (3, "new read", "chr1", 80, 100, "+", 1.0, "", ""),
+                  (5, "dup start", "chr2", 201, 450, "+", 1.0, "", ""),
+                  (6, "new read", "chr2", 280, 400, "+", 1.0, "", ""),
+                  (7, "three up", "chr3", 1, 80, "+", 1.0, "", ""),
+                  (8, "three two", "chr3", 201, 230, "+", 1.0, "", "")
+        ]
+        for eachEntry in sql.fetchall():
+            self.assertTrue(eachEntry in result)
+
+        sql.execute("select chrom,start from uniqs where start > 100 group by chrom,start having ( count(start) > 1 and count(chrom) > 1) order by chrom,start")
+        result = [("chr2", 201)]
+        for eachEntry in sql.fetchall():
+            self.assertTrue(eachEntry in result)
+
+
+        rdsEntryList = [("testMultiRead", "chr1", 1, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 1, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr2", 80, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr2", 1, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr2", 5000, 25000, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr3", 1, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr3", 70, 500, "+", 0.5, "", "")
+        ]
+        self.rds.insertMulti(rdsEntryList)
+        sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) > 1 and count(chrom) > 1)")
+        result = [("chr1", 1), ("chr2", 201), ("chr3", 1)]
+        for eachEntry in sql.fetchall():
+            self.assertTrue(eachEntry in result)
+
+        sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+        result = [("chr1", 80),
+                  ("chr2", 1), ("chr2", 80), ("chr2", 280), ("chr2", 5000),
+                  ("chr3", 70), ("chr3", 201)
+        ]
+        for eachEntry in sql.fetchall():
+            self.assertTrue(eachEntry in result)
+
+        sql.execute("select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union select chrom,start from (select chrom,start from uniqs union all select chrom,start from multi) group by chrom,start having ( count(start) = 1 and count(chrom) = 1)")
+        result = sql.fetchall()
+        result = [("chr1", 1), ("chr1", 80),
+                  ("chr2", 1), ("chr2", 80), ("chr2", 201), ("chr2", 280), ("chr2", 5000),
+                  ("chr3", 1), ("chr3", 70), ("chr3", 201)
+        ]
+        for eachEntry in sql.fetchall():
+            self.assertTrue(eachEntry in result)
+
+        result = {"1": [{"start": 1, "sense": "+"}, {"start": 80, "sense": "+"}],
+                  "3": [{"start": 1, "sense": "+"}, {"start": 70, "sense": "+"}, {"start": 201, "sense": "+"}],
+                  "2": [{"start": 1, "sense": "+"}, {"start": 80, "sense": "+"}, {"start": 201, "sense": "+"}, {"start": 280, "sense": "+"}, {"start": 5000, "sense": "+"}]
+        }
+        self.assertEquals(result, self.rds.getReadsDict(combine5p=True, doMulti=True))
+
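+        #TODO: replace this debug print with an assertion once the expected
+        #      weighted, combined output has been pinned down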
+        print self.rds.getReadsDict(combine5p=True, doMulti=True, withWeight=True)
+
+    def testReadDatasetBuiltIns(self):
+        # Initialize an existing rds file
+        self.assertRaises(sqlite.OperationalError, ReadDataset.ReadDataset, testDBName, initialize=True, datasetType="DNA", verbose=True)
+        self.assertEquals(0, len(self.rds))
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(1, len(self.rds))
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+        self.assertEquals(2, len(self.rds))
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.assertRaises(sqlite.OperationalError, self.rds.insertSplices, rdsEntryList)
+        self.rnaRds.insertSplices(rdsEntryList)
+        self.assertEquals(2, len(self.rds))
+        self.assertEquals(1, len(self.rnaRds))
+
+
+    def testInsertUniqs(self):
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(1, len(self.rds))
+
+        rdsEntryList = [("testRead2", "chr1", 200, 300, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(2, len(self.rds))
+
+
+    def testInsertMulti(self):
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+        self.assertEquals(1, len(self.rds))
+
+
+    def testInsertSplices(self):
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.assertRaises(sqlite.OperationalError, self.rds.insertSplices, rdsEntryList)
+        self.rnaRds.insertSplices(rdsEntryList)
+        self.assertEquals(0, len(self.rds))
+        self.assertEquals(1, len(self.rnaRds))
+
+
+    def testGetChromosomes(self):
+        result = []
+        self.assertEqual(result, self.rds.getChromosomes(table="uniqs", fullChrom=True))
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        result = ["chr1"]
+        self.assertEqual(result, self.rds.getChromosomes(table="uniqs", fullChrom=True))
+
+        self.assertRaises(sqlite.OperationalError, self.rds.getChromosomes, table="badTableName")
+
+
+    #TODO: write unit test
+    def testAttachDB(self):
+        pass
+
+
+    #TODO: write unit test
+    def testDetachDB(self):
+        pass
+
+
+    #TODO: write unit test
+    def testImportFromDB(self):
+        pass
+
+
+    def testGetTables(self):
+        result = ["metadata", "uniqs", "multi"]
+        self.assertEquals(result, self.rds.getTables())
+
+        result = ["metadata", "uniqs", "multi", "splices"]
+        self.assertEquals(result, self.rnaRds.getTables())
+
+
+    def testHasIndex(self):
+        self.assertFalse(self.rds.hasIndex())
+        self.rds.buildIndex()
+        self.assertTrue(self.rds.hasIndex())
+
+
+    def testGetMetadata(self):
+        returnDict = self.rds.getMetadata()
+        self.assertTrue(returnDict.has_key("rdsVersion"))
+        self.assertEquals(returnDict["dataType"], "DNA")
+
+        result = {"dataType": "RNA"}
+        self.assertEquals(result, self.rnaRds.getMetadata("dataType"))
+
+        result = {}
+        self.assertEquals(result, self.rds.getMetadata("badMetaDataName"))
+
+
+    def testGetReadSize(self):
+        self.assertRaises(ReadDataset.ReadDatasetError, self.rds.getReadSize)
+
+        self.rds.insertMetadata([("readsize", "100")])
+        self.assertEquals(100, self.rds.getReadSize())
+
+        self.rds.updateMetadata("readsize", 100)
+        self.assertEquals(100, self.rds.getReadSize())
+
+        self.rds.updateMetadata("readsize", "100 import")
+        self.assertEquals(100, self.rds.getReadSize())
+
+        self.rds.updateMetadata("readsize", "badReadSize")
+        self.assertRaises(ValueError, self.rds.getReadSize)
+
+
+    def testGetDefaultCacheSize(self):
+        self.assertEquals(100000, self.rds.getDefaultCacheSize())
+
+
+    def testGetMaxCoordinate(self):
+        self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr1"))
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1"))
+        self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr2"))
+        self.assertEquals(0, self.rnaRds.getMaxCoordinate("chr1", doUniqs=False))
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rnaRds.insertMulti(rdsEntryList)
+        self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1"))
+        self.assertEquals(101, self.rnaRds.getMaxCoordinate("chr1", doMulti=True))
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        self.assertEquals(1, self.rnaRds.getMaxCoordinate("chr1"))
+        self.assertEquals(101, self.rnaRds.getMaxCoordinate("chr1", doMulti=True))
+        self.assertEquals(1150, self.rnaRds.getMaxCoordinate("chr1", doSplices=True))
+
+
+    def testGetReadsDict(self):
+        self.assertEquals({}, self.rds.getReadsDict())
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        reads = self.rds.getReadsDict()
+        self.assertEquals(1, len(reads))
+        self.assertEquals(1, len(reads["1"]))
+        read = reads["1"][0]
+        self.assertEquals(["start", "sense"], read.keys())
+        self.assertEquals(1, read["start"])
+        self.assertEquals("+", read["sense"])
+
+        reads = self.rds.getReadsDict(bothEnds=True, noSense=False, fullChrom=True,
+                          withWeight=True, withFlag=True, withMismatch=True, withID=True,
+                          withChrom=True, readIDDict=True)
+        self.assertEquals(1, len(reads))
+        self.assertEquals(1, len(reads["testRead"]))
+        read = reads["testRead"][0]
+        self.assertEquals(["readID", "weight", "stop", "mismatch","start", "flag","sense", "chrom"], read.keys())
+        self.assertEquals("testRead", read["readID"])
+        self.assertEquals(1.0, read["weight"])
+        self.assertEquals(100, read["stop"])
+        self.assertEquals("", read["mismatch"])
+        self.assertEquals(1, read["start"])
+        self.assertEquals("", read["flag"])
+        self.assertEquals("+", read["sense"])
+        self.assertEquals("chr1", read["chrom"])
+
+        self.assertEquals({}, self.rds.getReadsDict(hasMismatch=True))
+        self.assertEquals({}, self.rds.getReadsDict(strand="-"))
+        self.assertEquals(1, len(self.rds.getReadsDict(strand="+")))
+
+        rdsEntryList = [("testRead2", "chr1", 201, 300, "-", 1.0, "A", "G22A")]
+        self.rds.insertUniqs(rdsEntryList)
+        reads = self.rds.getReadsDict()
+        self.assertEquals(1, len(reads))
+        reads = self.rds.getReadsDict()
+        self.assertEquals(2, len(reads["1"]))
+        read = reads["1"][1]
+        self.assertEquals(201, read["start"])
+        reads = self.rds.getReadsDict(strand="+")
+        self.assertEquals(1, len(reads))
+        read = reads["1"][0]
+        self.assertEquals("+", read["sense"])
+        reads = self.rds.getReadsDict(strand="-")
+        self.assertEquals(1, len(reads))
+        reads = self.rds.getReadsDict(start=199)
+        self.assertEquals(1, len(reads["1"]))
+        reads = self.rds.getReadsDict(hasMismatch=True)
+        self.assertEquals(1, len(reads["1"]))
+
+        rdsEntryList = [("testMultiRead", "chr2", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr2", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+        reads = self.rds.getReadsDict()
+        self.assertEquals(1, len(reads))
+        reads = self.rds.getReadsDict(doMulti=True)
+        self.assertEquals(2, len(reads))
+        reads = self.rds.getReadsDict(doUniqs=False, doMulti=True)
+        self.assertFalse(reads.has_key("1"))
+
+
+    def testGetSplicesDict(self):
+        self.assertRaises(sqlite.OperationalError, self.rds.getSplicesDict)
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        reads = self.rnaRds.getSplicesDict()
+        self.assertEquals(1, len(reads))
+        self.assertEquals(1, len(reads["1"]))
+        read = reads["1"][0]
+        result = ["startR", "stopL", "sense", "startL", "stopR"]
+        self.assertEquals(result, read.keys())
+        self.assertEquals(1000, read["startL"])
+        self.assertEquals("+", read["sense"])
+        reads = self.rnaRds.getSplicesDict(splitRead=True)
+        self.assertEquals(2, len(reads["1"]))
+        self.assertEquals(1000, reads["1"][0]["startL"])
+        self.assertFalse(reads["1"][0].has_key("startR"))
+        self.assertFalse(reads["1"][0].has_key("stopR"))
+        self.assertEquals(1150, reads["1"][1]["startR"])
+        self.assertFalse(reads["1"][1].has_key("startL"))
+        self.assertFalse(reads["1"][1].has_key("stopL"))
+        self.assertEquals(reads["1"][0]["sense"], reads["1"][1]["sense"])
+
+        reads = self.rnaRds.getSplicesDict(noSense=False, fullChrom=True,
+                          withWeight=True, withFlag=True, withMismatch=True, withID=True,
+                          withChrom=True, readIDDict=True)
+        self.assertEquals(1, len(reads))
+        self.assertEquals(1, len(reads["testSpliceRead"]))
+        read = reads["testSpliceRead"][0]
+        result = ["readID", "weight", "startR", "mismatch","stopR", "stopL", "flag", "startL", "sense", "chrom"]
+        self.assertEquals(result, read.keys())
+        self.assertEquals("testSpliceRead", read["readID"])
+        self.assertEquals(1.0, read["weight"])
+        self.assertEquals(1150, read["startR"])
+        self.assertEquals("", read["mismatch"])
+        self.assertEquals(1200, read["stopR"])
+        self.assertEquals(1100, read["stopL"])
+        self.assertEquals("", read["flag"])
+        self.assertEquals(1000, read["startL"])
+        self.assertEquals("+", read["sense"])
+        self.assertEquals("chr1", read["chrom"])
+
+        self.assertEquals({}, self.rnaRds.getSplicesDict(hasMismatch=True))
+        self.assertEquals({}, self.rnaRds.getSplicesDict(strand="-"))
+        self.assertEquals(1, len(self.rnaRds.getSplicesDict(strand="+")))
+
+        rdsEntryList = [("testSpliceRead2", "chr1", 2000, 2100, 2150, 2200, "-", 1.0, "A", "G20T")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        reads = self.rnaRds.getSplicesDict()
+        self.assertEquals(1, len(reads))
+        reads = self.rnaRds.getSplicesDict()
+        self.assertEquals(2, len(reads["1"]))
+        read = reads["1"][1]
+        self.assertEquals(2000, read["startL"])
+        reads = self.rnaRds.getSplicesDict(strand="+")
+        self.assertEquals(1, len(reads))
+        read = reads["1"][0]
+        self.assertEquals("+", read["sense"])
+        reads = self.rnaRds.getSplicesDict(strand="-")
+        self.assertEquals(1, len(reads))
+        reads = self.rnaRds.getSplicesDict(start=1199)
+        self.assertEquals(1, len(reads["1"]))
+        reads = self.rnaRds.getSplicesDict(hasMismatch=True)
+        self.assertEquals(1, len(reads["1"]))
+
+        rdsEntryList = [("testSpliceRead3", "chr2", 2000, 2100, 2150, 2200, "-", 1.0, "A", "G20T")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        reads = self.rnaRds.getSplicesDict()
+        self.assertEquals(2, len(reads))
+        self.assertEquals(2, len(reads["1"]))
+        self.assertEquals(1, len(reads["2"]))
+        reads = self.rnaRds.getSplicesDict(withID=True, chrom="chr2")
+        self.assertFalse(reads.has_key("1"))
+        self.assertEquals("testSpliceRead3", reads["2"][0]["readID"])
+
+
+    def testGetCounts(self):
+        self.assertEquals(0, self.rds.getCounts())
+        self.assertEquals((0, 0, 0), self.rds.getCounts(multi=True, reportCombined=False))
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(1, self.rds.getCounts())
+        self.assertEquals((1, 0, 0), self.rds.getCounts(multi=True, reportCombined=False))
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+        self.assertEquals(2, self.rds.getCounts(multi=True))
+        self.assertEquals((1, 1, 0), self.rds.getCounts(multi=True, reportCombined=False))
+
+        self.assertEquals(1, self.rds.getCounts(chrom="chr1"))
+        self.assertEquals(0, self.rds.getCounts(chrom="chr2"))
+        self.assertEquals(1, self.rds.getCounts(rmin=1))
+        self.assertEquals(1, self.rds.getCounts(rmin=1, rmax=1000))
+        self.assertEquals(1, self.rds.getCounts(rmax=1000))
+        self.assertEquals(0, self.rds.getCounts(rmin=1000))
+        self.assertEquals(0, self.rds.getCounts(rmax=0))
+        self.assertEquals(1, self.rds.getCounts(sense="+"))
+        self.assertEquals(0, self.rds.getCounts(sense="-"))
+
+        self.assertEquals(0, self.rnaRds.getCounts())
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        self.assertEquals(1, self.rnaRds.getCounts(splices=True))
+
+
+    def testGetTotalCounts(self):
+        self.assertEquals(0, self.rds.getTotalCounts())
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(1, self.rds.getTotalCounts())
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+
+        self.assertEquals(2, self.rds.getTotalCounts())
+        self.assertEquals(2, self.rds.getTotalCounts(chrom="chr1"))
+        self.assertEquals(0, self.rds.getTotalCounts(chrom="chr2"))
+        self.assertEquals(2, self.rds.getTotalCounts(rmin=1))
+        self.assertEquals(2, self.rds.getTotalCounts(rmax=1000))
+        self.assertEquals(1, self.rds.getTotalCounts(rmin=101, rmax=1000))
+        self.assertEquals(1, self.rds.getTotalCounts(rmin=1, rmax=100))
+        self.assertEquals(0, self.rds.getTotalCounts(rmin=1000))
+        self.assertEquals(0, self.rds.getTotalCounts(rmax=0))
+
+
+    def testGetTableEntryCount(self):
+        table = "uniqs"
+        self.assertEquals(0, self.rds.getTableEntryCount(table))
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(1, self.rds.getTableEntryCount(table))
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+
+        self.assertEquals(1, self.rds.getTableEntryCount(table))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, chrom="chr1"))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, chrom="chr2"))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=1))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, rmax=1000))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=101, rmax=1000))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=1000))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, rmax=0))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, restrict=" sense ='+' "))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, restrict=" sense ='-' "))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, distinct=True))
+
+        table="multi"
+        self.assertEquals(1, self.rds.getTableEntryCount(table))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, chrom="chr1"))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, chrom="chr2"))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=1))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, rmax=1000))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, rmin=101, rmax=1000))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, rmin=1000))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, rmax=0))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, restrict=" sense ='+' "))
+        self.assertEquals(0, self.rds.getTableEntryCount(table, restrict=" sense ='-' "))
+        self.assertEquals(1, self.rds.getTableEntryCount(table, distinct=True))
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        table="splices"
+        self.assertEquals(1, self.rnaRds.getTableEntryCount(table))
+        self.assertEquals(1, self.rnaRds.getTableEntryCount(table, chrom="chr1"))
+        self.assertEquals(0, self.rnaRds.getTableEntryCount(table, chrom="chr2"))
+        self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmin=1, startField="startL"))
+        self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, rmin=1)
+        self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmax=2000, startField="startL"))
+        self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, rmax=2000)
+        self.assertEquals(0, self.rnaRds.getTableEntryCount(table, rmax=999, startField="startL"))
+        self.assertEquals(1, self.rnaRds.getTableEntryCount(table, rmin=1000, startField="startL"))
+        self.assertEquals(0, self.rnaRds.getTableEntryCount(table, rmax=0, startField="startL"))
+        self.assertEquals(1, self.rnaRds.getTableEntryCount(table, restrict=" sense ='+' "))
+        self.assertEquals(0, self.rnaRds.getTableEntryCount(table, restrict=" sense ='-' "))
+        self.assertEquals(1, self.rnaRds.getTableEntryCount(table, distinct=True, startField="startL"))
+        self.assertRaises(sqlite.OperationalError, self.rnaRds.getTableEntryCount, table, distinct=True)
+
+
+    def testGetUniqsCount(self):
+        self.assertEquals(0, self.rds.getUniqsCount())
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(1, self.rds.getUniqsCount())
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+
+        self.assertEquals(1, self.rds.getUniqsCount())
+        self.assertEquals(1, self.rds.getUniqsCount(chrom="chr1"))
+        self.assertEquals(0, self.rds.getUniqsCount(chrom="chr2"))
+        self.assertEquals(1, self.rds.getUniqsCount(rmin=1))
+        self.assertEquals(1, self.rds.getUniqsCount(rmax=1000))
+        self.assertEquals(0, self.rds.getUniqsCount(rmin=101, rmax=1000))
+        self.assertEquals(0, self.rds.getUniqsCount(rmin=1000))
+        self.assertEquals(0, self.rds.getUniqsCount(rmax=0))
+        self.assertEquals(1, self.rds.getUniqsCount(restrict=" sense ='+' "))
+        self.assertEquals(0, self.rds.getUniqsCount(restrict=" sense ='-' "))
+        self.assertEquals(1, self.rds.getUniqsCount(distinct=True))
+
+
+    def testGetSplicesCount(self):
+        self.assertEquals(0, self.rnaRds.getSplicesCount())
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        self.assertEquals(0, self.rnaRds.getSplicesCount())
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rnaRds.insertMulti(rdsEntryList)
+        self.assertEquals(0, self.rnaRds.getSplicesCount())
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+
+        self.assertEquals(1, self.rnaRds.getSplicesCount())
+        self.assertEquals(1, self.rnaRds.getSplicesCount(chrom="chr1"))
+        self.assertEquals(0, self.rnaRds.getSplicesCount(chrom="chr2"))
+        self.assertEquals(1, self.rnaRds.getSplicesCount(rmin=1))
+        self.assertEquals(1, self.rnaRds.getSplicesCount(rmax=2000))
+        self.assertEquals(0, self.rnaRds.getSplicesCount(rmax=999))
+        self.assertEquals(1, self.rnaRds.getSplicesCount(rmin=1000))
+        self.assertEquals(0, self.rnaRds.getSplicesCount(rmax=0))
+        self.assertEquals(1, self.rnaRds.getSplicesCount(restrict=" sense ='+' "))
+        self.assertEquals(0, self.rnaRds.getSplicesCount(restrict=" sense ='-' "))
+        self.assertEquals(1, self.rnaRds.getSplicesCount(distinct=True))
+
+
+    def testGetMultiCount(self):
+        self.assertEquals(0, self.rds.getMultiCount())
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.assertEquals(0, self.rds.getMultiCount())
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rds.insertMulti(rdsEntryList)
+
+        self.assertEquals(1, self.rds.getMultiCount())
+        self.assertEquals(1, self.rds.getMultiCount(chrom="chr1"))
+        self.assertEquals(0, self.rds.getMultiCount(chrom="chr2"))
+        self.assertEquals(1, self.rds.getMultiCount(rmin=1))
+        self.assertEquals(1, self.rds.getMultiCount(rmax=1000))
+        self.assertEquals(0, self.rds.getMultiCount(rmin=1, rmax=100))
+        self.assertEquals(0, self.rds.getMultiCount(rmin=1000))
+        self.assertEquals(0, self.rds.getMultiCount(rmax=0))
+        self.assertEquals(1, self.rds.getMultiCount(restrict=" sense ='+' "))
+        self.assertEquals(0, self.rds.getMultiCount(restrict=" sense ='-' "))
+        self.assertEquals(1, self.rds.getMultiCount(distinct=True))
+
+
+    def testGetReadIDs(self):
+        self.assertEquals([], self.rnaRds.getReadIDs())
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        result = ["testRead"]
+        self.assertEquals(result, self.rnaRds.getReadIDs())
+
+        rdsEntryList = [("testMultiRead", "chr1", 101, 200, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 101, 200, "+", 0.5, "", "")]
+        self.rnaRds.insertMulti(rdsEntryList)
+        result = ["testRead"]
+        self.assertEquals(result, self.rnaRds.getReadIDs())
+        result = ["testMultiRead", "testRead"]
+        self.assertEquals(result, self.rnaRds.getReadIDs(multi=True))
+        
+        rdsEntryList = [("testRead2", "chr1", 201, 300, "+", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        result = ["testRead", "testRead2"]
+        self.assertEquals(result, self.rnaRds.getReadIDs())
+        result = ["testRead"]
+        self.assertEquals(result, self.rnaRds.getReadIDs(limit=1))
+        result = ["testMultiRead"]
+        self.assertEquals(result, self.rnaRds.getReadIDs(multi=True, limit=1))
+
+        rdsEntryList = [("testPair/1", "chr1", 301, 400, "+", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        result = ["testPair", "testRead", "testRead2"]
+        self.assertEquals(result, self.rnaRds.getReadIDs(paired=True))
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        result = ["testSpliceRead"]
+        self.assertEquals(result, self.rnaRds.getReadIDs(uniqs=False, splices=True))
+        result = ["testPair/1", "testRead", "testRead2", "testSpliceRead"]
+        self.assertEquals(result, self.rnaRds.getReadIDs(splices=True))
+
+
+    def testGetMismatches(self):
+        self.assertRaises(ReadDataset.ReadDatasetError, self.rds.getMismatches)
+        self.rds.insertMetadata([("readsize", "5")])
+
+        rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        result = {"chr1": []}
+        self.assertEquals(result, self.rds.getMismatches())
+
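+        # the mismatch string "C3T" encodes reference base C at read offset 3 read
+        # as T; getMismatches appears to report [read start, start + offset - 1,
+        # observed base, reference base] per mismatch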
+        rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "C3T")]
+        self.rds.insertUniqs(rdsEntryList)
+        result = {"chr1": [[1, 3, "T", "C"]]}
+        self.assertEquals(result, self.rds.getMismatches())
+        result = {"chr2": []}
+        self.assertEquals(result, self.rds.getMismatches(mischrom="chr2"))
+
+        rdsEntryList = [("testRead", "chr1", 10, 15, "+", 1.0, "", "C3T")]
+        self.rds.insertUniqs(rdsEntryList)
+        result = {"chr1": [[1, 3, "T", "C"], [10, 12, "T", "C"]]}
+        self.assertEquals(result, self.rds.getMismatches())
+
+        rdsEntryList = [("testRead", "chr2", 10, 15, "+", 1.0, "", "C3T")]
+        self.rds.insertUniqs(rdsEntryList)
+        result = {"chr1": [[1, 3, "T", "C"], [10, 12, "T", "C"]],
+                  "chr2": [[10, 12, "T", "C"]]}
+        self.assertEquals(result, self.rds.getMismatches())
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "C41T")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        self.rnaRds.insertMetadata([("readsize", "150")])
+        result = {"chr1": [[1000, 1040, "T", "C"]]}
+        #TODO: This test case fails.  If there are only splice entries for a chromosome it shouldn't
+        #      be necessary to specify the chromosome.
+        #self.assertEquals(result, self.rnaRds.getMismatches())
+        self.assertEquals(result, self.rnaRds.getMismatches(mischrom="chr1"))
+
+
+    #TODO: needs fixing up
+    def testGetChromProfile(self):
+        chromProfile = self.rds.getChromProfile("chr1")
+        result = []
+        self.assertEquals(result, chromProfile.tolist())
+
+        rdsEntryList = [("testRead", "chr1", 1, 5, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        chromProfile = self.rds.getChromProfile("chr1")
+        result = []
+        self.assertEquals(result, chromProfile.tolist())
+
+        self.rds.insertMetadata([("readsize", "5")])
+        chromProfile = self.rds.getChromProfile("chr1")
+        result = [0.0, 1.0, 1.0, 1.0, 1.0]
+        self.assertEquals(result, chromProfile.tolist())
+
+        rdsEntryList = [("testRead2", "chr1", 7, 11, "+", 1.0, "", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        # This doesn't seem to make sense: the default behavior is to only get the first readlen bases
+        chromProfile = self.rds.getChromProfile("chr1")
+        result = [0.0, 1.0, 1.0, 1.0, 1.0]
+        self.assertEquals(result, chromProfile.tolist())
+
+        # As it stands this doesn't seem right either.  Getting an IndexError at currentpos 5.
+        chromProfile = self.rds.getChromProfile("chr1", cstop=11)
+        result = [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        self.assertEquals(result, chromProfile.tolist())
+
+
+    def testInsertMetadata(self):
+        result = {}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+        self.rds.insertMetadata([("testMeta", "100")])
+        result = {"testMeta": "100"}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+        self.rds.insertMetadata([("testMeta", "200")])
+        result = {"testMeta:2": "200", "testMeta": "100"}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+
+    def testUpdateMetadata(self):
+        result = {}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+        self.rds.insertMetadata([("testMeta", "100")])
+        result = {"testMeta": "100"}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+        self.rds.updateMetadata("testMeta", "200")
+        result = {"testMeta": "200"}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+        self.rds.updateMetadata("testMeta", "300", "200")
+        result = {"testMeta": "300"}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+        self.rds.updateMetadata("testMeta", "200", "200")
+        result = {"testMeta": "300"}
+        self.assertEquals(result, self.rds.getMetadata("testMeta"))
+
+
+    def testFlagReads(self):
+        readData = self.rnaRds.getReadsDict(withFlag=True)
+        self.assertEquals({}, readData)
+
+        rdsEntryList = [("testRead", "chr1", 1, 100, "+", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        result = [""]
+        flags = self.getRDSFlags("1", self.rnaRds)
+        self.assertEquals(result, flags)
+
+        regions = [()]
+        self.assertRaises(sqlite.ProgrammingError, self.rnaRds.flagReads, regions)
+
+        regions = [("test", "chr1", "0", "1000")]
+        self.rnaRds.flagReads(regions)
+        result = ["test"]
+        flags = self.getRDSFlags("1", self.rnaRds)
+        self.assertEquals(result, flags)
+
+        regions = [("test2", "chr1", "600", "1000")]
+        self.rnaRds.flagReads(regions)
+        result = ["test"]
+        flags = self.getRDSFlags("1", self.rnaRds)
+        self.assertEquals(result, flags)
+
+        rdsEntryList = [("testRead2", "chr1", 101, 200, "+", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        regions = [("test2", "chr1", "101", "1000")]
+        self.rnaRds.flagReads(regions)
+        result = ["test", "test2"]
+        flags = self.getRDSFlags("1", self.rnaRds)
+        self.assertEquals(result, flags)
+
+        rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "", ""),
+                        ("testMultiRead", "chr1", 201, 300, "+", 0.5, "", "")]
+        self.rnaRds.insertMulti(rdsEntryList)
+        regions = [("test", "chr1", "0", "1000")]
+        self.rnaRds.flagReads(regions)
+        result = ["test", "test", "", ""]
+        flags = self.getRDSFlags("1", self.rnaRds, doMulti=True)
+        self.assertEquals(result, flags)
+
+        regions = [("multi", "chr1", "1", "1000")]
+        self.rnaRds.flagReads(regions, uniqs=False, multi=True)
+        result = ["test", "test", "multi", "multi"]
+        flags = self.getRDSFlags("1", self.rnaRds, doMulti=True)
+        self.assertEquals(result, flags)
+
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+        regions = [("test", "chr1", "0", "1500")]
+        self.rnaRds.flagReads(regions)
+        result = ["", "test", "test", "multi", "multi"]
+        flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+        self.assertEquals(result, flags)
+
+        regions = [("splice", "chr1", "1", "1500")]
+        self.rnaRds.flagReads(regions, uniqs=False, multi=False, splices=True)
+        result = [" L:splice R:splice", "test", "test", "multi", "multi"]
+        flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+        self.assertEquals(result, flags)
+
+        rdsEntryList = [("testNegSense", "chr1", 301, 400, "-", 1.0, "", "")]
+        self.rnaRds.insertUniqs(rdsEntryList)
+        regions = [("test", "chr1", "0", "1500", "+")]
+        self.rnaRds.flagReads(regions, sense="anythingBut'Both'")
+        result = ["test", "test", ""]
+        flags = self.getRDSFlags("1", self.rnaRds)
+        self.assertEquals(result, flags)
+
+        regions = [("neg", "chr1", "0", "1500", "-")]
+        self.rnaRds.flagReads(regions, sense="anythingBut'Both'")
+        result = ["test", "test", "neg"]
+        flags = self.getRDSFlags("1", self.rnaRds)
+        self.assertEquals(result, flags)
+
+
+    def getRDSFlags(self, chromosome, rds, doMulti=False, splice=False):
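+        # helper: collect the flag field of every read on the given chromosome;
+        # with splice=True the splice flags are listed first, followed by the
+        # flags of the non-splice reads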
+        if splice:
+            readData = rds.getSplicesDict(withFlag=True)
+        else:
+            readData = rds.getReadsDict(withFlag=True, doMulti=doMulti)
+
+        flags = []
+        for read in readData[chromosome]:
+            flags.append(read["flag"])
+
+        if splice:
+            nonSplice = self.getRDSFlags(chromosome, rds, doMulti, splice=False)
+            for flag in nonSplice:
+                flags.append(flag)
+
+        return flags
+
+
+    def testSetFlags(self):
+        rdsEntryList = [("test", "chr1", 1, 100, "+", 1.0, "uniq", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.rnaRds.insertUniqs(rdsEntryList)
+        rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""),
+                        ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")]
+        self.rnaRds.insertMulti(rdsEntryList)
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "splice", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+
+        result = ["reset"]
+        self.rds.setFlags("reset")
+        flags = self.getRDSFlags("1", self.rds)
+        self.assertEquals(result, flags)
+
+        result = ["splice", "uniq", "resetMulti", "resetMulti"]
+        self.rnaRds.setFlags("resetMulti", uniqs=False, splices=False)
+        flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+        self.assertEquals(result, flags)
+
+        result = ["resetAll", "resetAll", "resetAll", "resetAll"]
+        self.rnaRds.setFlags("resetAll")
+        flags = self.getRDSFlags("1", self.rnaRds, doMulti=True, splice=True)
+        self.assertEquals(result, flags)
+
+
+    def testResetFlags(self):
+        rdsEntryList = [("test", "chr1", 1, 100, "+", 1.0, "uniq", "")]
+        self.rds.insertUniqs(rdsEntryList)
+        self.rnaRds.insertUniqs(rdsEntryList)
+        rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""),
+                        ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")]
+        self.rnaRds.insertMulti(rdsEntryList)
+        rdsEntryList = [("testSpliceRead", "chr1", 1000, 1100, 1150, 1200, "+", 1.0, "splice", "")]
+        self.rnaRds.insertSplices(rdsEntryList)
+
+        self.rds.resetFlags()
+        result = [""]
+        flags = self.getRDSFlags("1", self.rds)
+        self.assertEquals(result, flags)
+
+        self.rnaRds.resetFlags()
+        result = ["", "", ""]
+        flags = self.getRDSFlags("1", self.rnaRds, doMulti=True)
+        self.assertEquals(result, flags)
+
+        self.rnaRds.resetFlags()
+        result = ["", ""]
+        flags = self.getRDSFlags("1", self.rnaRds, splice=True)
+        self.assertEquals(result, flags)
+
+
+    def testReweighMultireads(self):
+        rdsEntryList = [("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", ""),
+                        ("testMultiRead", "chr1", 201, 300, "+", 0.5, "multi", "")]
+        self.rds.insertMulti(rdsEntryList)
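+        # reweighMultireads appears to take (new weight, chrom, start, readID)
+        # tuples identifying the multireads to update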
+        readData = ("0.25", "chr1", "201", "testMultiRead")
+        self.rds.reweighMultireads([readData])
+        readDict = self.rds.getReadsDict(withWeight=True, doMulti=True)
+        read = readDict["1"][0]
+        self.assertEquals(0.25, read["weight"])
+
+
+    #TODO: write unit test
+    def testSetSynchronousPragma(self):
+        pass
+
+
+    #TODO: write unit test
+    def testSetDBcache(self):
+        pass
+
+
+    #TODO: write unit test
+    def testExecute(self):
+        pass
+
+
+    #TODO: write unit test
+    def testExecuteCommit(self):
+        pass
+
+
+    def testBuildIndex(self):
+        self.assertFalse(self.rds.hasIndex())
+        self.rds.buildIndex()
+        self.assertTrue(self.rds.hasIndex())
+
+
+    def testDropIndex(self):
+        self.assertFalse(self.rds.hasIndex())
+        self.rds.buildIndex()
+        self.assertTrue(self.rds.hasIndex())
+        self.rds.dropIndex()
+        self.assertFalse(self.rds.hasIndex())
+
+        self.assertFalse(self.rnaRds.hasIndex())
+        self.rnaRds.buildIndex()
+        self.assertTrue(self.rnaRds.hasIndex())
+        self.rnaRds.dropIndex()
+        self.assertFalse(self.rnaRds.hasIndex())
+
+
+    #TODO: write unit test
+    def testMemSync(self):
+        pass
+
+
+    #TODO: write unit test
+    def testCopyDBEntriesToMemory(self):
+        pass
+
+
+    #TODO: write unit test
+    def testCopySpliceDBEntriesToMemory(self):
+        pass
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestReadDataset))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testRnaAToIFilter.py b/test/testRnaAToIFilter.py
new file mode 100644 (file)
index 0000000..d2fdfa5
--- /dev/null
@@ -0,0 +1,84 @@
+'''
+Created on Aug 25, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import rnaAToIFilter
+
+
+class TestRnaAToIFilter(unittest.TestCase):
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testRnaAToIFilter(self):
+        snpPropertiesList = []
+        self.assertEquals([], rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
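+        # whitespace- or tab-separated SNP records: field 7 holds the base change
+        # and field 13 the strand; only A-G changes on "F" and T-C changes on "R"
+        # (the two orientations of A-to-I editing) should pass the filter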
+        snpPropertiesList = ["0 1 2 3 4 5 6 7 8 9 10 11 12 13"]
+        result = []
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"]
+        result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"]
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0\t1\t2\t3\t4\t5\t6\tA-G\t8\t9\t10\t11\t12\tF"]
+        result = ["0\t1\t2\t3\t4\t5\t6\tA-G\t8\t9\t10\t11\t12\tF"]
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 R"]
+        result = []
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"]
+        result = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"]
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0 1 2 3 4 5 6 T-C 8 9 10 11 12 F"]
+        result = []
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+                             "0 1 2 3 4 5 6 7 8 9 10 11 12 13"
+        ]
+        result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"]
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+                             "0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"
+        ]
+        result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+                  "0 1 2 3 4 5 6 T-C 8 9 10 11 12 R"
+        ]
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+                             "0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"
+        ]
+        result = ["0 1 2 3 4 5 6 A-G 8 9 10 11 12 F",
+                  "0 1 2 3 4 5 6 A-G 8 9 10 11 12 F"
+        ]
+        self.assertEquals(result, rnaAToIFilter.rnaAToIFilter(snpPropertiesList))
+
+        snpPropertiesList = ["invalid entry"]
+        self.assertRaises(IndexError, rnaAToIFilter.rnaAToIFilter, snpPropertiesList)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestRnaAToIFilter))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testRnaAToIFilter']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testRnaEditing.py b/test/testRnaEditing.py
new file mode 100644 (file)
index 0000000..5eb4a0d
--- /dev/null
@@ -0,0 +1,49 @@
+'''
+Created on Aug 23, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import rnaEditing
+
+
+class TestRnaEditing(unittest.TestCase):
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testGetGenesWithMultipleSNPs(self):
+        snpList = []
+        self.assertEquals([], rnaEditing.getGenesWithMultipleSNPs(snpList))
+
+        snpList = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp1"],
+                   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp2"],
+                   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp3"]
+        ]
+
+        result = ["snp3", "snp2", "snp1"]
+        self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList))
+        result = []
+        self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList, minCount=2))
+
+        snpList.append([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, "snp3"])
+        result = ["snp3"]
+        self.assertEquals(result, rnaEditing.getGenesWithMultipleSNPs(snpList, minCount=2))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestRnaEditing))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testTranscripts.py b/test/testTranscripts.py
new file mode 100644 (file)
index 0000000..cf401a7
--- /dev/null
@@ -0,0 +1,98 @@
+'''
+Created on Oct 4, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from Erange import transcripts
+
+inFileName = "testTranscriptsInFile.txt"
+outFileName = "testTranscriptsOutFile.txt"
+
+
+class TestTranscripts(unittest.TestCase):
+
+
+    def setUp(self):
+        self.inFile = open(inFileName, "w")
+        self.inFile.write("line1\t3.5\n")
+        self.inFile.write("line2\t1.5\n")
+        self.inFile.write("line3\tpadding\t3.5\n")
+        self.inFile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(outFileName)
+        except OSError:
+            pass
+
+        try:
+            os.remove(inFileName)
+        except OSError:
+            pass
+
+
+    def testTranscripts(self):
+        transcripts.transcripts(inFileName, outFileName)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
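+        # the expected values appear consistent with transcripts = RPKM * transcriptome
+        # size (default apparently 2e5) and transcripts/cell = transcripts / (cells *
+        # efficiency) with defaults around 1e6 cells and 0.3 efficiency; inferred from
+        # these numbers, not checked against transcripts.py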
+        self.assertEquals(3, len(results))
+        self.assertEquals("line1\t700000.0\t2.3\n", results[0])
+        self.assertEquals("line2\t300000.0\t1.0\n", results[1])
+        self.assertEquals("line3\t700000.0\t2.3\n", results[2])
+
+    def testMain(self):
+        argv = ["transcripts.py", inFileName, outFileName]
+        transcripts.main(argv)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
+        self.assertEquals(3, len(results))
+        self.assertEquals("line1\t700000.0\t2.3\n", results[0])
+        self.assertEquals("line2\t300000.0\t1.0\n", results[1])
+        self.assertEquals("line3\t700000.0\t2.3\n", results[2])
+
+        argv = ["transcripts.py", inFileName, outFileName, "--transcriptome", "400000"]
+        transcripts.main(argv)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
+        self.assertEquals(3, len(results))
+        self.assertEquals("line1\t1400000.0\t4.7\n", results[0])
+        self.assertEquals("line2\t600000.0\t2.0\n", results[1])
+        self.assertEquals("line3\t1400000.0\t4.7\n", results[2])
+
+        argv = ["transcripts.py", inFileName, outFileName, "--cells", "5e5"]
+        transcripts.main(argv)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
+        self.assertEquals(3, len(results))
+        self.assertEquals("line1\t700000.0\t4.7\n", results[0])
+        self.assertEquals("line2\t300000.0\t2.0\n", results[1])
+        self.assertEquals("line3\t700000.0\t4.7\n", results[2])
+
+        argv = ["transcripts.py", inFileName, outFileName, "--efficiency", "0.15"]
+        transcripts.main(argv)
+        output = open(outFileName)
+        results = output.readlines()
+        output.close()
+        self.assertEquals(3, len(results))
+        self.assertEquals("line1\t700000.0\t4.7\n", results[0])
+        self.assertEquals("line2\t300000.0\t2.0\n", results[1])
+        self.assertEquals("line3\t700000.0\t4.7\n", results[2])
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestTranscripts))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testmakebedfromrds.py b/test/testmakebedfromrds.py
new file mode 100644 (file)
index 0000000..f11ccd6
--- /dev/null
@@ -0,0 +1,170 @@
+'''
+Created on Jun 4, 2010
+
+@author: sau
+'''
+import unittest
+from Erange import makebedfromrds
+
+
+class TestMakeBedFromRds(unittest.TestCase):
+
+    def testGetSenseColor(self):
+        senseColor = makebedfromrds.getSenseColor('+', .5)
+        self.assertEqual(senseColor, makebedfromrds.MULTI_PLUS_COLOR, "incorrect color for low weight and plus sense color")
+
+        senseColor = makebedfromrds.getSenseColor('-', .5)
+        self.assertEqual(senseColor, makebedfromrds.MULTI_MINUS_COLOR, "incorrect color for low weight and non-plus sense")
+
+        senseColor = makebedfromrds.getSenseColor('+', 5)
+        self.assertEqual(senseColor, makebedfromrds.PLUS_COLOR, "incorrect color for high weight and plus sense")
+
+        senseColor = makebedfromrds.getSenseColor('-', 5)
+        self.assertEqual(senseColor, makebedfromrds.MINUS_COLOR, "incorrect color for high weight and non-plus sense")
+
+
+    def testGetMultiSenseColor(self):
+        senseColor = makebedfromrds.getMultiSenseColor('+')
+        self.assertEqual(senseColor, makebedfromrds.MULTI_PLUS_COLOR, "incorrect color for plus sense")
+
+        senseColor = makebedfromrds.getMultiSenseColor('-')
+        self.assertEqual(senseColor, makebedfromrds.MULTI_MINUS_COLOR, "incorrect color for non-plus sense")
+
+
+    def testGetSingleSenseColor(self):
+        senseColor = makebedfromrds.getSingleSenseColor('+')
+        self.assertEqual(senseColor, makebedfromrds.PLUS_COLOR, "incorrect color for plus sense")
+
+        senseColor = makebedfromrds.getSingleSenseColor('-')
+        self.assertEqual(senseColor, makebedfromrds.MINUS_COLOR, "incorrect color for non-plus sense")
+
+
+    def testGetReadSizes(self):
+        numPieces = 3
+        startList = [0, 1, 2]
+        stopList = [3, 4, 5]
+        readSizes = makebedfromrds.getReadSizes(numPieces, startList, stopList)
+        self.assertEqual(readSizes, "3,3,3", "incorrect read size list")
+        
+        readSizes = makebedfromrds.getReadSizes(1, startList, stopList)
+        self.assertEquals(readSizes, "3", "incorrect read size list for numPieces=1")
+        
+        self.assertRaises(IndexError, makebedfromrds.getReadSizes, numPieces, [], stopList)
+        self.assertRaises(IndexError, makebedfromrds.getReadSizes, numPieces, startList, [])
+        self.assertRaises(IndexError, makebedfromrds.getReadSizes, 4, startList, stopList)
+
+
+    def testGetReadCoords(self):
+        numPieces = 3
+        startList = [0, 1, 2]
+        readCoords = makebedfromrds.getReadCoords(numPieces, startList)
+        self.assertEqual(readCoords, "0,1,2", "incorrect read coords list")
+        
+        readCoords = makebedfromrds.getReadCoords(1, startList)
+        self.assertEqual(readCoords, "0", "incorrect read coords list for numPieces=1")
+        
+        self.assertRaises(IndexError, makebedfromrds.getReadCoords, numPieces, [])
+        self.assertRaises(IndexError, makebedfromrds.getReadCoords, 4, startList)
+
+
+    def testGetSpliceColor(self):
+        lpart = 1
+        rpart = 2
+        leftweight = 0.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+        self.assertEqual(aColor, makebedfromrds.SPLICE_COLOR, "incorrect first color for hacktype 1 splice")
+        self.assertEqual(bColor, makebedfromrds.SPLICE_COLOR, "incorrect second color for hacktype 1 splice")
+        
+        lpart = 0
+        rpart = 0
+        leftweight = 1.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+        self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 left unique")
+        self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 left unique")
+        
+        lpart = 0
+        rpart = 0
+        leftweight = 0.0
+        rightweight = 1.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+        self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 right unique")
+        self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 right unique")
+        
+        lpart = 0
+        rpart = 0
+        leftweight = 1.0
+        rightweight = 1.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+        self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for hacktype 1 left and right unique")
+        self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for hacktype 1 left and right unique")
+        
+        lpart = 0
+        rpart = 0
+        leftweight = 0.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+        self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for hacktype 1 multi")
+        self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for hacktype 1 multi")
+        
+        lpart = 1
+        rpart = 1
+        leftweight = 0.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
+        self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for hacktype 1 lpart + rpart = 2")
+        self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for hacktype 1 lpart + rpart = 2")
+        
+        lpart = 2
+        rpart = 0
+        leftweight = 0.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+        self.assertEqual(aColor, makebedfromrds.SPLICE_COLOR, "incorrect first color for left splice")
+        self.assertEqual(bColor, makebedfromrds.SPLICE_COLOR, "incorrect second color for left splice")
+        
+        lpart = 0
+        rpart = 0
+        leftweight = 1.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+        self.assertEqual(aColor, makebedfromrds.UNIQUE_COLOR, "incorrect first color for left unique")
+        self.assertEqual(bColor, makebedfromrds.UNIQUE_COLOR, "incorrect second color for left unique")
+        
+        lpart = 0
+        rpart = 0
+        leftweight = 0.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+        self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for multi splice")
+        self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for multi splice")
+        
+        lpart = 1
+        rpart = 0
+        leftweight = 0.0
+        rightweight = 0.0
+        aColor, bColor = makebedfromrds.getSpliceColor(lpart, rpart, leftweight, rightweight)
+        self.assertEqual(aColor, makebedfromrds.MULTI_COLOR, "incorrect first color for lpart = 1 multi splice")
+        self.assertEqual(bColor, makebedfromrds.MULTI_COLOR, "incorrect second color for lpart = 1 multi splice")
+
+
+    def testDoNotOutputChromosome(self):
+        self.assertTrue(makebedfromrds.doNotOutputChromosome("chrM", True), "chrM is output when enforceChr=True")
+        self.assertTrue(makebedfromrds.doNotOutputChromosome("chrM", False), "chrM is output when enforceChr=False")
+        self.assertFalse(makebedfromrds.doNotOutputChromosome("chrAny", True), "chr is not output when enforceChr=True")
+        self.assertFalse(makebedfromrds.doNotOutputChromosome("chrAny", False), "chr is not output when enforceChr=False")
+        self.assertTrue(makebedfromrds.doNotOutputChromosome("Bad", True), "bad name chr is output when enforceChr=True")
+        self.assertFalse(makebedfromrds.doNotOutputChromosome("Bad", False), "bad name chr is not output when enforceChr=True")
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeBedFromRds))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/transcripts.py b/transcripts.py
new file mode 100755 (executable)
index 0000000..53b6aea
--- /dev/null
@@ -0,0 +1,53 @@
+#
+#  transcripts.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 1/25/08.
+#
+""" usage: python %s rpkmFile outFile [--transcriptome size] [--cells count] [--efficiency fraction]
+           where transcriptome size is in Gbp, cell count is in arbitrary units and efficiency is a fraction
+"""
+
+import sys, optparse
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "%prog: version 3.0"
+    usage = "usage: python %prog rpkmFile outFile [options]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--transcriptome", type="float", dest="tSize",
+                      help="transcriptome size in Gbp [default 200000.0]")
+    parser.add_option("--cells", type="float", dest="cellCount",
+                      help="arbitrary units [default 1e6]")
+    parser.add_option("--efficiency", type="float", dest="efficiency",
+                      help="fraction [default 0.3]")
+    parser.set_defaults(tSize=200000.0, cellCount=1e6, efficiency=0.3)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
+        sys.exit(1)
+
+    infile = args[0]
+    outfile = args[1]
+    
+    transcripts(infile, outfile, options.tSize, options.cellCount, options.efficiency)
+
+
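+# Illustrative numbers (taken from the unit tests for this module): with the defaults
+# tSize=200000.0, cellCount=1e6 and efficiency=0.3, an RPKM of 3.5 yields
+# 3.5 * 200000.0 = 700000.0 transcripts and 700000.0 / 1e6 / 0.3 = 2.3 transcripts per cell.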
+def transcripts(infilename, outfilename, tSize=200000, cellCount=1e6, efficiency=0.3):
+    infile = open(infilename)
+    outfile = open(outfilename, "w")
+    for line in infile:
+        fields = line.strip().split()
+        rpkm = float(fields[-1])
+        transcripts = rpkm * tSize
+        transPerCell = transcripts / cellCount / efficiency
+        outfile.write("%s\t%.1f\t%.1f\n" % (fields[0], transcripts, transPerCell))
+    infile.close()
+    outfile.close()
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/trimreads.py b/trimreads.py
new file mode 100755 (executable)
index 0000000..d246e15
--- /dev/null
@@ -0,0 +1,115 @@
+#
+#  trimreads.py
+#  ENRAGE
+#
+#  Created by Ali Mortazavi on 8/12/08.
+#
+
+import sys, optparse
+from cistematic.core import complement
+
+print "%prog: version 2.1"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %prog length infile outfile [--fastq] [--fromback] [--paired] [--flip] [--filter maxN]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--fastq", action="store_true", dest="fastq")
+    parser.add_option("--fromback", action="store_true", dest="fromBack")
+    parser.add_option("--paired", action="store_true", dest="paired")
+    parser.add_option("--flip", action="store_true", dest="flipseq")
+    parser.add_option("--filter", type="int", dest="maxN")
+    parser.set_defaults(fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        print "\t where paired fragments are separated by a : when given the -paired flag" 
+        sys.exit(1)
+
+    length = int(args[0])
+    infile = args[1]
+    outfile = args[2]
+
+    trimreads(length, infile, outfile, options.fastq, options.fromBack, options.paired, options.flipseq, options.maxN)
+
+
+def trimreads(length, inFileName, outFileName, fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None):
+    infile = open(inFileName)
+    outfile = open(outFileName, "w")
+
+    if paired:
+        pairedlength = 2 * length
+    index = 0
+
+    if fromBack:
+        length = -1 * length
+
+    filtering = False
+    if maxN is not None:
+        filtering = True
+        print "filtering out reads with more than %d Ns" % maxN
+    else:
+        maxN = 2
+
+    print "trimming reads from %s to %d bp and saving them in %s" % (inFileName, length, outFileName)
+    
+    filtered = 0
+    header = ""
+    for line in infile:
+        line = line.strip()
+        if len(line) == 0:
+            continue
+
+        firstChar = line[0]
+        if (not fastq and firstChar == ">") or (fastq and firstChar in ["@", "+"]): 
+            header = line + "\n"
+        else:
+            if filtering:
+                if line.count("N") > maxN:
+                    filtered += 1
+                    continue
+
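+            # seq2 is the first `length` bases and seq1 the remainder; with --fromback, length is
+            # negative, so seq1 becomes the last bases and seq2 the remainder. Single-end output
+            # keeps seq2 (or seq1 with --fromback); paired output joins them as "seq1:seq2".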
+            seq1 = line[length:]
+            seq2 = line[:length]
+            if flipseq:
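+                # Complement each piece via cistematic's complement(); if complement() raises
+                # (e.g. on unexpected characters), the piece is written out unchanged.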
+                try:
+                    seq1 = complement(seq1)
+                except:
+                    pass
+
+                try:
+                    seq2 = complement(seq2)
+                except:
+                    pass
+
+            if paired:
+                if len(line) < pairedlength:
+                    continue
+
+                outfile.write("%s%s:%s\n" % (header, seq1, seq2))
+            else:
+                if fromBack:
+                    outfile.write("%s%s\n" % (header, seq1))
+                else:
+                    outfile.write("%s%s\n" % (header, seq2))
+
+            index += 1
+            if index % 1000000 == 0:
+                print ".",
+
+            sys.stdout.flush()
+
+    outfile.close()
+    print "returned %d reads" % index
+    if filtering:
+        print "%d additional reads filtered" % filtered
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/utrChanges.py b/utrChanges.py
new file mode 100755 (executable)
index 0000000..d95d18c
--- /dev/null
@@ -0,0 +1,91 @@
+#
+#  utrChanges.py
+#  ENRAGE
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+import sys
+from commoncode import getMergedRegions, getLocusByChromDict
+from cistematic.genomes import Genome
+
+print "%s: version 1.3" % sys.argv[0]
+
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    if len(argv) < 4:
+        print "usage: python %s genome acceptedfile outfile" % argv[0]
+        sys.exit(1)
+
+    genome = argv[1]
+    acceptfile =  argv[2]
+    outfile = argv[3]
+
+    utrChanges(genome, acceptfile, outfile)
+
+
+def utrChanges(genome, acceptfile, outFileName):
+    acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+    outfile = open(outFileName, "w")
+
+    hg = Genome(genome)
+
+    origLocusByChromDict = getLocusByChromDict(hg, keepSense = True)
+    newLocusByChromDict = getLocusByChromDict(hg, additionalRegionsDict = acceptDict, keepSense = True)
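+    # origLocusByChromDict holds the annotated gene loci; newLocusByChromDict holds the loci
+    # recomputed with the accepted regions merged in, so any boundary shift between the two
+    # marks a candidate UTR extension.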
+
+    new3utr = 0
+    new5utr = 0
+    changedGene = 0
+
+    for chrom in origLocusByChromDict:
+        for (gstart, gstop, gid, glen, sense) in origLocusByChromDict[chrom]:   
+            for (newstart, newstop, newgid, newlen, newsense) in newLocusByChromDict[chrom]:
+                if gid == newgid:
+                    changedBoundary = False
+                    new3p = "F"
+                    new5p = "F"
+                    if newstart < gstart:
+                        if sense == "R":
+                            new3utr += 1
+                            new3p = "T"
+                            changedBoundary = True
+                        elif sense == "F":
+                            new5utr += 1
+                            new5p = "T"
+                            changedBoundary = True
+                        else:
+                            print sense
+
+                    if newstop > gstop:
+                        if sense == "R":
+                            new5utr += 1
+                            new5p = "T"
+                            changedBoundary = True
+                        elif sense == "F":
+                            new3utr += 1
+                            new3p = "T"
+                            changedBoundary = True
+                        else:
+                            print sense
+
+                    if changedBoundary:
+                        changedGene += 1
+                        outfile.write("%s\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\t%s\n" % (gid, chrom, gstart, gstop, sense, chrom, newstart, newstop, new5p, new3p))
+
+                    continue
+
+    outfile.close()
+    print "%d new 5'utr" % new5utr
+    print "%d new 3'utr" % new3utr
+    print "%s affected genes" % changedGene
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
diff --git a/weighMultireads.py b/weighMultireads.py
new file mode 100755 (executable)
index 0000000..ed27edf
--- /dev/null
@@ -0,0 +1,300 @@
+#
+#  weighMultireads.py
+#  ENRAGE
+#
+
+#  Created by Ali Mortazavi on 10/02/08.
+#
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+from commoncode import readDataset
+import sys, time, string, optparse
+
+print "%prog: version 3.1"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--radius", type="int", dest="radius")
+    parser.add_option("--noradius", action="store_false", dest="doRadius")
+    parser.add_option("--usePairs", type="int", dest="pairDist")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 1:
+        print usage
+        sys.exit(1)
+
+    rdsfile = args[0]
+
+    weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages)
+
+
+def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
+
+    if radius is not None:
+        doRadius = True
+    else:
+        radius = 100
+
+    usePairs = False
+    tooFar = 0
+    if pairDist is not None:
+        usePairs = True
+        tooFar = pairDist * 10
+
+    doCache = False
+    if cachePages is not None:
+        doCache = True
+    else:
+        cachePages = 1
+
+    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    readlen = RDS.getReadSize()
+    halfreadlen = readlen / 2
+
+    if cachePages > RDS.getDefaultCacheSize():
+        RDS.setDBcache(cachePages)
+
+    if verbose:
+        print time.ctime()
+
+    multiIDs = RDS.getReadIDs(uniqs=False,multi=True)
+    if verbose:
+        print "got multiIDs ", time.ctime()
+
+    fixedPair = 0
+    fixedReads = []
+    if usePairs:
+        print "doing pairs with pairDist = %d" % pairDist
+        uidDict = {}
+        midDict = {}
+        jointList = []
+        bothMultiList = []
+        mainIDList = []
+        guDict = {}
+        muDict = {}
+
+        if RDS.dataType == "RNA":
+            uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=True)
+        else:
+            uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=False)
+
+        if verbose:
+            print "got uniqIDs ", time.ctime()
+
+        for readID in uniqIDs:
+            (mainID, pairID) = readID.split("/")
+            try:
+                uidDict[mainID].append(pairID)
+            except:
+                uidDict[mainID] = [pairID]
+                mainIDList.append(mainID)
+
+        if verbose:
+            print "uidDict all ", len(uidDict), time.ctime()
+
+        for mainID in mainIDList:
+            if len(uidDict[mainID]) == 2:
+                del uidDict[mainID]
+
+        if verbose:
+            print "uidDict first candidates ", len(uidDict), time.ctime()
+
+        for readID in multiIDs:
+            (frontID, multiplicity) = readID.split("::")
+            (mainID, pairID) = frontID.split("/")
+            try:
+                if pairID not in midDict[mainID]:
+                    midDict[mainID].append(pairID)
+            except:
+                midDict[mainID] = [pairID]
+
+        if verbose:
+            print "all multis ", len(midDict), time.ctime()
+
+        mainIDList = uidDict.keys()
+        for mainID in mainIDList:
+            if mainID not in midDict:
+                del uidDict[mainID]
+
+        if verbose:
+            print "uidDict actual candidates ", len(uidDict), time.ctime()
+
+        for readID in midDict:
+            listLen = len(midDict[readID])
+            if listLen == 1:
+                if readID in uidDict:
+                    jointList.append(readID)
+            elif listLen == 2:
+                bothMultiList.append(readID)
+
+        if verbose:
+            print "joint ", len(jointList), time.ctime()
+            print "bothMulti ", len(bothMultiList), time.ctime()
+
+        del uidDict
+        del midDict
+        del mainIDList
+        del uniqIDs
+
+        uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
+        if verbose:
+            print "got uniq dict ", len(uniqDict), time.ctime()
+
+        if RDS.dataType == "RNA":
+            spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
+            if verbose:
+                print "got splice dict ", len(spliceDict), time.ctime()
+
+        for readID in jointList:
+            try:
+                guDict[readID] = uniqDict[readID][0]
+            except:
+                if RDS.dataType == "RNA":
+                    guDict[readID] = spliceDict[readID][0]
+
+        del uniqDict
+        if RDS.dataType == "RNA":
+            del spliceDict
+        if verbose:
+            print "guDict actual ", len(guDict), time.ctime()
+
+        multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
+        if verbose:
+            print "got multi dict ", len(multiDict), time.ctime()
+
+        for readID in jointList:
+            muDict[readID] = multiDict[readID]
+
+        for readID in bothMultiList:
+            muDict[readID] = multiDict[readID]
+
+        del multiDict
+        if verbose:
+            print "muDict actual ", len(muDict), time.ctime()
+
+        RDS.setSynchronousPragma("OFF")
+        for readID in jointList:
+            try:
+                (ustart, uchrom, upair) = guDict[readID]
+                ustop = ustart + readlen
+            except:
+                (ustart, lstop, rstart, ustop, uchrom, upair) = guDict[readID]
+
+            muList = muDict[readID]
+            muLen = len(muList)
+            bestMatch = [tooFar] * muLen
+            found = False
+            for index in range(muLen):
+                (mstart, mchrom, mpair) = muList[index]
+                if uchrom != mchrom:
+                    continue
+
+                if abs(mstart - ustart) < pairDist:
+                    bestMatch[index] = abs(mstart - ustart)
+                    found = True
+                elif abs(mstart - ustop) < pairDist:
+                    bestMatch[index] = abs(mstart - ustop)
+                    found = True
+
+            if found:
+                theMatch = -1
+                theDist = tooFar
+                reweighList = []
+                for index in range(muLen):
+                    if theDist > bestMatch[index]:
+                        theMatch = index
+                        theDist = bestMatch[index]
+
+                theID = string.join([readID, mpair], "/")
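+                # The candidate nearest the unique mate takes almost all of the weight
+                # (1 - (muLen - 1)/(100 * muLen)); every other candidate keeps 1/(100 * muLen),
+                # so the weights still sum to 1.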
+                for index in range(muLen):
+                    if index == theMatch:
+                        score = 1 - (muLen - 1) / (100. * (muLen))
+                    else:
+                        score = 1 / (100. * muLen)
+
+                    start = muList[index][0]
+                    chrom = "chr%s" % muList[index][1]
+                    reweighList.append((round(score,3), chrom, start, theID))
+
+                if theMatch >= 0:
+                    RDS.reweighMultireads(reweighList)
+                    fixedPair += 1
+                    if verbose and fixedPair % 10000 == 1:
+                        print "fixed %d" % fixedPair
+                        print guDict[readID]
+                        print muDict[readID]
+                        print reweighList
+
+                    fixedReads.append(theID)
+
+        RDS.setSynchronousPragma("ON")
+
+        del guDict
+        del muDict
+        print "fixed %d pairs" % fixedPair
+        print time.ctime()
+
+    skippedReads = 0
+    if doRadius:
+        print "doing uniq read radius with radius = %d" % radius
+        multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
+        print "got multiDict"
+        RDS.setSynchronousPragma("OFF")
+        rindex = 0
+        for readID in multiIDs:
+            theID = readID
+            if theID in fixedReads:
+                skippedReads += 1
+                continue
+
+            if "::" in readID:
+                (readID, multiplicity) = readID.split("::")
+
+            scores = []
+            coords = []
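+            # Score each candidate position by the number of unique reads falling within
+            # +/- radius of the read midpoint (plus a pseudocount of 1), then redistribute
+            # the multiread weight in proportion to those scores.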
+            for read in multiDict[readID]:
+                (start, weight, rID, chrom) = read
+                achrom = "chr%s" % chrom
+                regionStart = start + halfreadlen - radius
+                regionStop = start + halfreadlen + radius 
+                uniqs = RDS.getCounts(achrom, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
+                scores.append(uniqs + 1)
+                coords.append((achrom, start, theID))
+
+            total = float(sum(scores))
+            reweighList = []
+            for index in range(len(scores)):
+                reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
+
+            RDS.reweighMultireads(reweighList)
+            rindex += 1
+            if rindex % 10000 == 0:
+                print rindex
+
+        RDS.setSynchronousPragma("ON")
+        if verbose:
+            print "skipped ", skippedReads
+
+        print "reweighted ", rindex
+
+    if doCache:
+        RDS.saveCacheDB(rdsfile)
+
+    if verbose:
+        print "finished", time.ctime()
+    
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file