erange version 4.0a dev release

author Sean Upchurch <sau@caltech.edu>

Tue, 30 Nov 2010 22:26:37 +0000 (14:26 -0800)

committer Sean Upchurch <sau@caltech.edu>

Tue, 30 Nov 2010 22:26:37 +0000 (14:26 -0800)
author Sean Upchurch <sau@caltech.edu>
Tue, 30 Nov 2010 22:26:37 +0000 (14:26 -0800)
committer Sean Upchurch <sau@caltech.edu>
Tue, 30 Nov 2010 22:26:37 +0000 (14:26 -0800)
diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py

index 935a04e4bfe973ba9250dbca1c90f12f03c17d67..bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59 100644 (file)
--- a/MakeBamFromRds.py
+++ b/MakeBamFromRds.py
@@ -17,37 +17,24 @@ import sys
  import re
  import optparse
  import random
+import string
  import pysam
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    verstring = "MakeBamFromRds: version 1.0"
+    verstring = "makeBamFromRds: version 1.0"
      print verstring
  
      doPairs = False
      
      usage = "usage: python %prog rdsFile bamFile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
-    parser.add_option("--nomulti", action="store_false", dest="withMulti")
-    parser.add_option("--splices", action="store_true", dest="doSplices")
-    parser.add_option("--flag", dest="withFlag")
-    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
-    parser.add_option("--pairs", action="store_true", dest="doPairs")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
-    parser.add_option("--chrom", action="append", dest="chromList")
-    parser.add_option("--fasta", dest="fastaFileName")
-    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
-                        doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
-                        doCache=False, cachePages=100000, fastaFileName="",
-                        chromList=[])
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -67,6 +54,40 @@ def main(argv=None):
                       options.chromList, options.fastaFileName)
  
  
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", action="store_true", dest="doPairs")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    parser.add_option("--fasta", dest="fastaFileName")
+
+    configParser = getConfigParser()
+    section = "MakeBamFromRds"
+    withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
+    enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
+
+    parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
+                        doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
+                        doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
+                        chromList=[])
+
+    return parser
+
+
  def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
                       doSplices=False, doPairs=False, withFlag="",
                       useFlagLike=False, enforceChr=False, allChrom=True,
@@ -77,7 +98,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
          sys.exit(1)
  
      print "\nsample:"
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
  
      if cachePages > RDS.getDefaultCacheSize():
          RDS.setDBcache(cachePages)
@@ -120,42 +141,43 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
      outfile = pysam.Samfile(outfilename, "wb", header=header)
  
      totalWrites = 0
-    noncanonicalSplices = 0
+    noncanonicalSpliceCount = 0
      for chrom in chromList:
          index = 0
          print "chromosome %s" % (chrom)
          if withUniqs or withMulti:
              hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
-                                       withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
-                                       flagLike=useFlagLike, entryDict=True)
+                                       doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
+                                       flagLike=useFlagLike, withMismatch=True)
  
              for read in hitDict[chrom]:
                  writeBAMEntry(outfile, chrom, read, readlength)
                  index += 1
  
          if doSplices:
-            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
+            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
              index += numSpliceReadsWritten
-            noncanonicalSplices += noncanonical
+            noncanonicalSpliceCount += noncanonical
  
          print index
          totalWrites += index
  
      outfile.close()
      print "%d total reads written" % totalWrites
-    print "%d non-canonical splices" % noncanonicalSplices
+    print "%d non-canonical splices" % noncanonicalSpliceCount
  
  
-def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
+def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
      index = 0
      noncanonicalSplices = 0
-    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
+    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
+                                    withMismatch=True)
      if chrom not in spliceDict:
          pass
      else:
          for read in spliceDict[chrom]:
              if fastaSequenceDict.has_key(chrom):
-                read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
+                read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
                  noncanonicalSplices += noncanonical
  
              writeBAMEntry(outfile, chrom, read, readlength)
@@ -165,28 +187,37 @@ def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, f
  
  
  def writeBAMEntry(outfile, chrom, outputDict, readlength):
+    """ We need to subtract 1 from the position because rds is 1 based and
+        most of the rest of the entire world is 0 based.
+    """
      tagList = []
      alignedRead = pysam.AlignedRead()
-    alignedRead.qname = outputDict["readID"]
+    try:
+        (readID, pairID) = outputDict["readID"].split("/")
+        paired = True
+    except ValueError:
+        readID = outputDict["readID"]
+        paired = False
+
+    alignedRead.qname = readID
      if outputDict["sense"] == "-":
          alignedRead.is_reverse = True
  
      alignedRead.rname = outfile.references.index(chrom)
  
      if outputDict.has_key("startL"):
-        startL = outputDict["startL"]
-        stopL = outputDict["stopL"]
-        startR = outputDict["startR"]
-        stopR = outputDict["stopR"]
+        startL = outputDict["startL"] - 1
+        stopL = outputDict["stopL"] - 1
+        startR = outputDict["startR"] - 1
+        stopR = outputDict["stopR"] - 1
          alignedRead.pos = startL
-        alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
-        tagList.append(("XS", outputDict["sense"]))
+        alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
+        tagList.append(("XS", str(outputDict["sense"])))
      else:
-        alignedRead.pos = outputDict["start"]
+        alignedRead.pos = outputDict["start"] - 1
          alignedRead.cigar = [(0, readlength)]
  
-    if outputDict.has_key("pairID"):
-        pairID = outputDict["pairID"]
+    if paired:
          if pairID == "1":
              alignedRead.is_read1 = True
              alignedRead.is_proper_pair = True
@@ -199,20 +230,22 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength):
      if outputDict.has_key("mismatch"):
          mismatchTag = getMismatches(outputDict["mismatch"])
          if mismatchTag:
-            tagList.append(("MD", mismatchTag))
-    
+            tagList.append(("MD", str(mismatchTag)))
+
      if tagList:
-        alignedRead.tags = tagList
+        alignedRead.tags = tuple(tagList)
  
      outfile.write(alignedRead)
  
  
  def getMismatches(mismatchString):
-    mismatch = ""
+    mismatchList = []
      positions = re.findall("\d+", mismatchString)
      nucleotides = re.findall("([ACGTN])\d+", mismatchString)
      for index in range(0, len(positions)):
-        mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
+        mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
+
+    mismatch = string.join(mismatchList, "")
  
      return mismatch
  
@@ -250,7 +283,7 @@ def getFastaSequenceDictionary(fastaFileName):
      return fastaSeqDict
  
  
-def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
+def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
      spliceSense = {"GTAG": "+",
                     "GCAG": "+",
                     "ATAC": "+",
@@ -264,8 +297,9 @@ def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
      intronlen = startRight - stopLeft
      leftJunctionSig =fastaSequence[intronstart:intronstart+2]
      rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
-    spliceJunction = leftJunctionSig + rightJunctionSig
+    spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
      spliceJunction = spliceJunction.upper()
+    print spliceJunction
      if spliceSense.has_key(spliceJunction):
          sense = spliceSense[spliceJunction]
      else:
diff --git a/MakeRdsFromBam.py b/MakeRdsFromBam.py

index e9df847ab167825f972b670690c953d443eff806..969d4ccf556524200904e6f78f53ddc1cd1018b2 100644 (file)
--- a/MakeRdsFromBam.py
+++ b/MakeRdsFromBam.py
@@ -12,11 +12,16 @@ try:
  except:
      pass
  
-import sys, string, optparse, re
+import sys
+import string
+import optparse
+import re
  import pysam
-from commoncode import readDataset, writeLog
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
+import ReadDataset
  
-verstring = "%prog: version 1.0"
+INSERT_SIZE = 100000
+verstring = "makeRdsFromBam: version 1.0"
  
  
  def main(argv=None):
@@ -28,24 +33,7 @@ def main(argv=None):
      usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
              \ninput reads must be sorted to properly record multireads"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--append", action="store_false", dest="init",
-                      help="append to existing rds file [default: create new]")
-    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
-                      help="set data type to RNA [default: DNA]")
-    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
-                      help="input file is in sam format")
-    parser.add_option("--index", action="store_true", dest="doIndex",
-                      help="index the output rds file")
-    parser.add_option("--cache", type="int", dest="cachePages",
-                      help="number of cache pages to use [default: 100000")
-    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
-                      help="multi counts over this value are discarded [default: 10]")
-    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
-                      help="use the raw read names")
-    parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
-                        maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      try:
@@ -70,6 +58,39 @@ def main(argv=None):
                     options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
  
  
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
+                      help="input file is in sam format")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000")
+    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
+                      help="multi counts over this value are discarded [default: 10]")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
+                      help="use the raw read names")
+
+    configParser = getConfigParser()
+    section = "makeRdsFromBam"
+    init = getConfigBoolOption(configParser, section, "init", True)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
+    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
+    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
+
+    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
+                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)
+
+    return parser
+
+
  def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                     cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
  
@@ -91,7 +112,7 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
  
      writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
  
-    rds = readDataset(outDbName, init, dataType, verbose=True)
+    rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
      if not init and doIndex:
          try:
              if rds.hasIndex():
@@ -119,16 +140,15 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
      if len(propertyList) > 0:
          rds.insertMetadata(propertyList)
  
-    countReads = {"unmapped": 0,
-                  "total": 0,
-                  "unique": 0,
-                  "multi": 0,
-                  "multiDiscard": 0,
-                  "splice": 0
+    totalReadCounts = {"unmapped": 0,
+                       "total": 0,
+                       "unique": 0,
+                       "multi": 0,
+                       "multiDiscard": 0,
+                       "splice": 0
      }
  
      readsize = 0
-    insertSize = 100000
  
      uniqueInsertList = []
      multiInsertList = []
@@ -143,11 +163,11 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
  
      for read in samFileIterator:
          if read.is_unmapped:
-            countReads["unmapped"] += 1
+            totalReadCounts["unmapped"] += 1
              continue
  
          if readsize == 0:
-            take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
+            take = (0, 1) # CIGAR operation (M/match, I/insertion)
              readsize = sum([length for op,length in read.cigar if op in take])
              if init:
                  rds.insertMetadata([("readsize", readsize)])
@@ -161,7 +181,7 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
          pairReadSuffix = getPairedReadNumberSuffix(read)
          readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
          if trimReadID:
-            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
+            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
          else:
              rdsEntryName = read.qname
  
@@ -186,27 +206,27 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
              else:
                  uniqueReadDict[readName] = (read, rdsEntryName)
  
-        if countReads["total"] % insertSize == 0:
+        if totalReadCounts["total"] % INSERT_SIZE == 0:
              for entry in uniqueReadDict.keys():
                  (readData, rdsEntryName) = uniqueReadDict[entry]
                  chrom = samfile.getrname(readData.rname)
                  uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
-                countReads["unique"] += 1
+                totalReadCounts["unique"] += 1
  
              for entry in spliceReadDict.keys():
                  (readData, rdsEntryName) = spliceReadDict[entry]
                  chrom = samfile.getrname(readData.rname)
                  spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
-                countReads["splice"] += 1
+                totalReadCounts["splice"] += 1
  
              for entry in multiReadDict.keys():
                  (readData, count, rdsEntryName) = multiReadDict[entry]
                  chrom = samfile.getrname(readData.rname)
                  if count > maxMultiReadCount:
-                    countReads["multiDiscard"] += 1
+                    totalReadCounts["multiDiscard"] += 1
                  else:
                      multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
-                    countReads["multi"] += 1
+                    totalReadCounts["multi"] += 1
  
              rds.insertUniqs(uniqueInsertList)
              rds.insertMulti(multiInsertList)
@@ -223,14 +243,14 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
              sys.stdout.flush()
              processedEntryDict = {}
  
-        countReads["total"] += 1
+        totalReadCounts["total"] += 1
  
      if len(uniqueReadDict.keys()) > 0:
          for entry in uniqueReadDict.keys():
              (readData, rdsEntryName) = uniqueReadDict[entry]
              chrom = samfile.getrname(readData.rname)
              uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
-            countReads["unique"] += 1
+            totalReadCounts["unique"] += 1
  
          rds.insertUniqs(uniqueInsertList)
  
@@ -239,32 +259,32 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
              (readData, count, rdsEntryName) = multiReadDict[entry]
              chrom = samfile.getrname(readData.rname)
              if count > maxMultiReadCount:
-                countReads["multiDiscard"] += 1
+                totalReadCounts["multiDiscard"] += 1
              else:
                  multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
-                countReads["multi"] += 1
+                totalReadCounts["multi"] += 1
  
-        countReads["multi"] += len(multiInsertList)
+        totalReadCounts["multi"] += len(multiInsertList)
  
      if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
          for entry in spliceReadDict.keys():
              (readData, rdsEntryName) = spliceReadDict[entry]
              chrom = samfile.getrname(readData.rname)
              spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
-            countReads["splice"] += 1
+            totalReadCounts["splice"] += 1
  
          rds.insertSplices(spliceInsertList)
  
-    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
-    countString += "\t%d unique reads" % countReads["unique"]
-    countString += "\t%d multi reads" % countReads["multi"]
-    countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
+    countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
+    countStringList.append("%d unique reads" % totalReadCounts["unique"])
+    countStringList.append("%d multi reads" % totalReadCounts["multi"])
+    countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
      if dataType == "RNA":
-        countString += "\t%d spliced reads" % countReads["splice"]
+        countStringList.append("%d spliced reads" % totalReadCounts["splice"])
  
-    print countString.replace("\t", "\n")
-
-    writeLog("%s.log" % outDbName, verstring, countString)
+    print string.join(countStringList, "\n")
+    outputCountText = string.join(countStringList, "\t")
+    writeLog("%s.log" % outDbName, verstring, outputCountText)
  
      if doIndex:
          print "building index...."
@@ -277,7 +297,7 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
  
  def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
      start = int(alignedRead.pos)
-    stop = int(start+readSize)
+    stop = int(start + readSize)
      sense = getReadSense(alignedRead.is_reverse)
      try:
          mismatchTag = alignedRead.opt("MD")
@@ -352,11 +372,11 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
                  genomicNucleotide = "N"
  
              if sense == "-":
-                mismatch = getComplementNucleotide(mismatch)
-                genomicNucleotide  = getComplementNucleotide(genomicNucleotide)
+                mismatch = getReverseComplement(mismatch)
+                genomicNucleotide  = getReverseComplement(genomicNucleotide)
  
-            elandCompatiblePosition = int(position + 1)
-            output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
+            erange1BasedElandCompatiblePosition = int(position + 1)
+            output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
              position += 1
          except IndexError:
              if logErrors:
@@ -368,17 +388,6 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
      return string.join(output, ",")
  
  
-def getComplementNucleotide(nucleotide):
-    complement = {"A": "T",
-                  "T": "A",
-                  "C": "G",
-                  "G": "C",
-                  "N": "N"
-    }
-
-    return complement[nucleotide]
-
-
  def getSpliceBounds(start, readsize, cigarTupleList):
      stopR = int(start + readsize)
      offset = 0
diff --git a/Peak.py b/Peak.py

new file mode 100644 (file)

index 0000000..ae41a0f
--- /dev/null
+++ b/Peak.py
@@ -0,0 +1,33 @@
+class Peak(object):
+    """
+        Class describing a peak.
+    """
+
+    def __init__(self, topPos, numHits, smoothArray, numPlus, numLeftPlus=0, shift=0):
+        self._topPos = topPos
+        self._numHits = numHits
+        self.smoothArray = smoothArray
+        self.numPlus = numPlus
+        self.numLeftPlus = numLeftPlus
+        self.shift = shift
+
+
+    @property
+    def topPos(self):
+        return self._topPos
+
+
+    @topPos.setter
+    def topPos(self, topPos):
+        self._topPos = topPos
+
+
+    @property
+    def numHits(self):
+        return self._numHits
+
+
+    @numHits.setter
+    def numHits(self, numHits):
+        self._numHits = numHits
+        
+\ No newline at end of file
diff --git a/ReadDataset.py b/ReadDataset.py

index ef80d657c721c837cbbc6c64d2a007d0fc6aa6e5..5ff60e2e6954c862888c5807495ef49142e14fa3 100644 (file)
--- a/ReadDataset.py
+++ b/ReadDataset.py
@@ -1,25 +1,12 @@
-"""
-Created on Jul 1, 2010
-
-@author: sau
-"""
-
  import sqlite3 as sqlite
  import string
  import tempfile
  import shutil
  import os
-from os import environ
  from array import array
-from commoncode import getReverseComplement
-
-if environ.get("CISTEMATIC_TEMP"):
-    cisTemp = environ.get("CISTEMATIC_TEMP")
-else:
-    cisTemp = "/tmp"
+from commoncode import getReverseComplement, getConfigParser, getConfigOption
  
-tempfile.tempdir = cisTemp
-currentRDSVersion = "1.1"
+currentRDSVersion = "2.0"
  
  
  class ReadDatasetError(Exception):
@@ -140,6 +127,9 @@ class ReadDataset():
      def cacheDB(self, filename):
          """ copy geneinfoDB to a local cache.
          """
+        configParser = getConfigParser()
+        cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
+        tempfile.tempdir = cisTemp
          self.cachedDBFile =  "%s.db" % tempfile.mktemp()
          shutil.copyfile(filename, self.cachedDBFile)
  
@@ -385,6 +375,8 @@ class ReadDataset():
          and which can be restricted by chromosome or custom-flag.
          Returns unique reads by default, but can return multireads
          with doMulti set to True.
+        
+        Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
          """
          whereClause = []
          resultsDict = {}
diff --git a/Region.py b/Region.py

new file mode 100644 (file)

index 0000000..0d0a15b
--- /dev/null
+++ b/Region.py
@@ -0,0 +1,70 @@
+import string
+
+class Region(object):
+    """
+        Region description
+    """
+
+
+    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
+        self.label = label
+        self.index = index
+        self.chrom = chrom
+        self.start = start
+        self.stop = stop
+        self.numReads = numReads
+        self.foldRatio = foldRatio
+        self.multiP = multiP
+        self.peakDescription = peakDescription
+        self.shift = shift
+        self.length = abs(self.stop - self.start)
+        self.peakPos = peakPos
+        self.peakHeight = peakHeight
+
+
+    def printRegion(self, delimiter="\t"):
+        fields = ["%s%d" % (self.label, self.index),
+                  "%s" % self.chrom,
+                  "%d" % self.start,
+                  "%d" % self.stop,
+                  "%.1f" % self.numReads,
+                  "%.1f" % self.foldRatio,
+                  "%.1f" % self.multiP,
+                  "%s" % self.peakDescription
+        ]
+
+        return string.join(fields, delimiter)
+
+
+    def printRegionWithShift(self, delimiter="\t"):
+        fields = [self.printRegion(delimiter)]
+        fields.append("%d" % self.shift)
+
+        return string.join(fields, delimiter)
+
+
+class DirectionalRegion(Region):
+    """
+        Region with percentage of plus reads.
+    """
+
+    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., plusP=0., leftP=0., peakDescription="", shift=0):
+        Region.__init__(self, start, stop, label, index, chrom, numReads, foldRatio, multiP, peakDescription, shift)
+        self.plusP = plusP
+        self.leftP = leftP
+
+
+    def printRegion(self, delimiter="\t"):
+        fields = ["%s%d" % (self.label, self.index),
+                  "%s" % self.chrom,
+                  "%d" % self.start,
+                  "%d" % self.stop,
+                  "%.1f" % self.numReads,
+                  "%.1f" % self.foldRatio,
+                  "%.1f" % self.multiP,
+                  "%.1f" % self.plusP,
+                  "%.1f" % self.leftP,
+                  "%s" % self.peakDescription
+        ]
+
+        return string.join(fields, delimiter)
+\ No newline at end of file
diff --git a/altSpliceCounts.py b/altSpliceCounts.py

index 1517ef888c917bd92d4b22fabde6018c470c9581..12077c1ec7d9d7e3ba499c67c99734a8a9a54157 100755 (executable)
--- a/altSpliceCounts.py
+++ b/altSpliceCounts.py
@@ -4,10 +4,12 @@ try:
  except:
      print 'psyco not running'
  
-print 'version 3.6'
+print "altSpliceCounts: version 3.7"
  
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption
  
  def main(argv=None):
      if not argv:
@@ -15,10 +17,7 @@ def main(argv=None):
  
      usage = "usage: python %s rdsfile outfilename [--cache pages]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cache", type="int", dest="numCachePages",
-                      help="number of cache pages to use [default: 100000]")
-    parser.set_defaults(numCachePages=None)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -38,12 +37,26 @@ def main(argv=None):
      altSpliceCounts(hitfile, outfilename, doCache, cachePages)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", type="int", dest="numCachePages",
+                      help="number of cache pages to use [default: 100000]")
+
+    configParser = getConfigParser()
+    section = "altSpliceCounts"
+    numCachePages = getConfigOption(configParser, section, "numCachePages", None)
+
+    parser.set_defaults(numCachePages=numCachePages)
+
+    return parser
+
+
  def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
      startDict = {}
      stopDict = {}
      resultDict = {}
  
-    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
      if cachePages > hitRDS.getDefaultCacheSize():
          hitRDS.setDBcache(cachePages)
  
@@ -58,7 +71,9 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
  
      index = 0
      for chrom in hitDict:
-        for (tagStart, lstop, rstart, tagStop) in hitDict[chrom]:
+        for read in hitDict[chrom]:
+            tagStart = read["startL"]
+            tagStop = read["stopR"]
              index += 1
              length = tagStop - tagStart
              if length < readlen + 5:
diff --git a/analyzego.py b/analyzego.py

index d4f9f6f95500c1a1b351d6f717629eb7018cc9ea..6542b1bf24c256e5907c799c785b0ab060fb9eaf 100755 (executable)
--- a/analyzego.py
+++ b/analyzego.py
@@ -4,11 +4,12 @@ try:
  except:
      print "psyco not running"
  
-import sys, optparse
+import sys
+import optparse
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigBoolOption
  from cistematic.cisstat.analyzego import calculateGOStats
-from cistematic.core.geneinfo import geneinfoDB
  
-print "version 2.1"
+print "analyzego: version 2.2"
  
  def main(argv=None):
      if not argv:
@@ -16,12 +17,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome infilename prefix [--geneName] [--field fieldID]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--geneName", action="store_true", dest="translateGene",
-                      help="translate gene")
-    parser.add_option("--field", type="int", dest="fieldID",
-                      help="column containing gene ID/Name")
-    parser.set_defaults(translateGene=False, fieldID=None)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -42,6 +38,23 @@ def main(argv=None):
      analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
  
  
+def makeParser(usage=""):
+    """ returns the analyzego option parser with defaults from the [analyzego] config section. """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--geneName", action="store_true", dest="translateGene",
+                      help="translate gene")
+    parser.add_option("--field", type="int", dest="fieldID",
+                      help="column containing gene ID/Name")
+
+    configParser = getConfigParser()
+    section = "analyzego"
+    # store_true has no optparse type check, so a raw "False" string would test true
+    translateGene = getConfigBoolOption(configParser, section, "translateGene", False)
+    fieldID = getConfigOption(configParser, section, "fieldID", None)
+    parser.set_defaults(translateGene=translateGene, fieldID=fieldID)
+    return parser
+
+
  def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
      infile = open(infilename)
      analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1)
@@ -50,12 +63,7 @@ def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1
  
  def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
      if translateGene:
-        idb = geneinfoDB(cache=True)
-        geneinfoDict = idb.getallGeneInfo(genome)
-        symbolToGidDict = {}
-        for gid in geneinfoDict:
-            symbol = geneinfoDict[gid][0][0].strip()
-            symbolToGidDict[symbol] = gid
+        symbolToGidDict = getSymbolDict(genome)
  
      locusList = []
      for line in geneInfoList:
@@ -82,5 +90,16 @@ def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
      if len(locusList) > 0:
          calculateGOStats(locusList, prefix)
  
+
+def getSymbolDict(genome):
+    """ returns a dictionary mapping each stripped gene symbol to its gid. """
+    infoByGid = getGeneInfoDict(genome, cache=True)
+    gidBySymbol = {}
+    for gid in infoByGid:
+        gidBySymbol[infoByGid[gid][0][0].strip()] = gid
+
+    return gidBySymbol
+
+
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file
diff --git a/bedtoregion.py b/bedtoregion.py

index d6c44de72100938477de417666667aa05bfa4f1f..3bcd554e5406a09f31ecc988e8c60733616d0dd5 100755 (executable)
--- a/bedtoregion.py
+++ b/bedtoregion.py
@@ -1,4 +1,5 @@
-import sys, string
+import sys
+import string
  
  def main(argv=None):
      if not argv:
diff --git a/binstocdf.py b/binstocdf.py

index 938186632002ca6a86fb7f0fea27cdac22c03126..63aa9558b8848183d4528bb16149cd0ced38aadd 100755 (executable)
--- a/binstocdf.py
+++ b/binstocdf.py
@@ -1,6 +1,6 @@
  import sys
  
-print 'version 1.0'
+print "binstocdf: version 1.1"
  
  def main(argv=None):
      if not argv:
diff --git a/buildMatrix.py b/buildMatrix.py

index 361f56e20d2e23d19606a4958ef955479d3fc135..c7b6dd0c801025552c365003cf9067b2550c6b0c 100755 (executable)
--- a/buildMatrix.py
+++ b/buildMatrix.py
@@ -4,10 +4,12 @@
  #
  #  Created by Ali Mortazavi on 3/6/09.
  #
-import sys, string, optparse
-from commoncode import writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption
  
-versionString = "%prog: version 1.3"
+versionString = "buildMatrix: version 1.5"
  print versionString
  
  
@@ -17,11 +19,7 @@ def main(argv=None):
  
      usage = "usage: python %prog matrix.step.N-1 data.part matrix.step.N [--rescale] [--truncate maxRPKM] [--log altlogfile]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--rescale", action="store_true", dest="rescale")
-    parser.add_option("--truncate", type="int", dest="maxRPKM")
-    parser.add_option("--log", dest="logfilename")
-    parser.set_defaults(rescale=False, maxRPKM=None, logfilename="buildMatrix.log")
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -43,6 +41,23 @@ def main(argv=None):
                  options.rescale, options.logfilename)
  
  
+def makeParser(usage=""):
+    """ returns the buildMatrix option parser; each default is looked up in the
+        [buildMatrix] config section before falling back to the built-in value.
+    """
+    config = getConfigParser()
+    defaults = {"rescale": getConfigBoolOption(config, "buildMatrix", "rescale", False),
+                "maxRPKM": getConfigOption(config, "buildMatrix", "maxRPKM", None),
+                "logfilename": getConfigOption(config, "buildMatrix", "logfilename", "buildMatrix.log")}
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--rescale", action="store_true", dest="rescale")
+    parser.add_option("--truncate", type="int", dest="maxRPKM")
+    parser.add_option("--log", dest="logfilename")
+    parser.set_defaults(**defaults)
+    return parser
+
+
  def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
                  maxRPKM=100000000, rescale=False, logfilename="buildMatrix.log"):
  
diff --git a/buildrmaskdb.py b/buildrmaskdb.py

index d1d6b00dfef3c604c54fbfa2bf1980d8d51486b0..f02a9ef06e4fca0f9d786dd940735cf565bc74bb 100755 (executable)
--- a/buildrmaskdb.py
+++ b/buildrmaskdb.py
@@ -12,7 +12,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "version 2.0"
+    print "buildrmaskdb: version 2.1"
      if len(argv) < 3:
          print "usage: python %s rmaskdir rmaskdbfile" % argv[0]
          exit(1)
diff --git a/buildsnpdb.py b/buildsnpdb.py

index 2510443cb54170eff293c276679c0eb7a5d5ec97..0ee76cf30f514d9956f45b88d8768672d9a78418 100755 (executable)
--- a/buildsnpdb.py
+++ b/buildsnpdb.py
@@ -29,7 +29,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "version 2.0"
+    print "buildsnpdb: version 2.1"
      if len(argv) < 3:
          print "usage: python %s snpfile snpdbname" % argv[0]
          sys.exit(1)
diff --git a/checkrmask.py b/checkrmask.py

index 9f58983195902265c5981d720487a4283d1ca870..99d74a50cdc3faec378b17070862ad90da99a954 100755 (executable)
--- a/checkrmask.py
+++ b/checkrmask.py
@@ -4,12 +4,14 @@ try:
  except:
      pass
  
-import sqlite3 as sqlite
-import sys, string, optparse
+import sys
+import string
+import optparse
  import os.path
-from commoncode import writeLog
+import sqlite3 as sqlite
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption
  
-versionString = "%prog: version 3.5"
+versionString = "checkrmask: version 3.6"
  print versionString
  
  
@@ -19,11 +21,7 @@ def main(argv=None):
  
      usage = "usage: python %prog dbfile infile outfile goodfile [--startField field] [--cache numPages] [--log logfile]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--startField", type="int", dest="startField")
-    parser.add_option("--log", dest="logfilename")
-    parser.set_defaults(cachePages=500000, startField=0, logfilename=None)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -38,6 +36,23 @@ def main(argv=None):
      checkrmask(dbfile, filename, outfile, goodfile, options.startField, options.cachePages, options.logfilename)
  
  
+def makeParser(usage=""):
+    """ returns the checkrmask option parser; each default comes from the
+        [checkrmask] config section, else 500000 pages, field 0 and no log file.
+    """
+    config = getConfigParser()
+    section = "checkrmask"
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--startField", type="int", dest="startField")
+    parser.add_option("--log", dest="logfilename")
+    parser.set_defaults(cachePages=getConfigIntOption(config, section, "cachePages", 500000),
+                        startField=getConfigIntOption(config, section, "startField", 0),
+                        logfilename=getConfigOption(config, section, "logfilename", None))
+    return parser
+
+
  def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
  
      outfile = open(outFileName, "w")
diff --git a/chkSNPrmask.py b/chkSNPrmask.py

index 498ef491ca6e1ffda95da172e5c8a7096f75b6d5..d52c9405afa32654e0b02e4dcce0c8d5c3b1d503 100755 (executable)
--- a/chkSNPrmask.py
+++ b/chkSNPrmask.py
@@ -4,18 +4,19 @@ try:
  except:
      pass
  
-import sqlite3 as sqlite
  import sys
-import tempfile, shutil, os, optparse
-from os import environ
+import tempfile
+import shutil
+import os
+import optparse
+import sqlite3 as sqlite
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
  
-if environ.get("CISTEMATIC_TEMP"):
-    cisTemp = environ.get("CISTEMATIC_TEMP")
-else:
-    cisTemp = "/tmp"
+configParser = getConfigParser()
+cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
  tempfile.tempdir = cisTemp
  
-print "version 3.3: %prog"
+print "chkSNPrmask: version 3.4"
  
  
  def main(argv=None):
@@ -24,10 +25,7 @@ def main(argv=None):
  
      usage = "usage: python %s dbfile snpsfile nr_snps_outfile [--cache numPages] [--repeats]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--repeats", action="store_true", dest="repeats")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.set_defaults(repeats=False, cachePages=None)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -41,6 +39,21 @@ def main(argv=None):
      chkSNPrmask(dbfile, filename, outfile, options.repeats, options.cachePages)
  
  
+def makeParser(usage=""):
+    """ returns the chkSNPrmask option parser with defaults read from the config files. """
+    # NOTE(review): section is "checkSNPrmask" while the script prints "chkSNPrmask" - confirm intended
+    section = "checkSNPrmask"
+    config = getConfigParser()
+    defaults = {"repeats": getConfigBoolOption(config, section, "repeats", False),
+                "cachePages": getConfigOption(config, section, "cachePages", None)}
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--repeats", action="store_true", dest="repeats")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.set_defaults(**defaults)
+    return parser
+
+
  def chkSNPrmask(dbfile, filename, outfile, repeats=False, cachePages=None):
      print dbfile
  
diff --git a/chksnp.py b/chksnp.py

index daf6b0adc53b871803eb5bd000f3b4f0363fc5ef..9567e5b699b1991b7208a1741442d5c80edd362a 100755 (executable)
--- a/chksnp.py
+++ b/chksnp.py
@@ -11,8 +11,9 @@ import shutil
  import os
  import string
  import sqlite3 as sqlite
+from commoncode import getConfigParser, getConfigOption
  
-print "version 3.6: %s" % sys.argv[0]
+print "chksnp: version 3.7"
  
  
  def main(argv=None):
@@ -95,11 +96,9 @@ def annotateSNPFromDB(snpLocationList, snpDict, dbFileName, cachePages=None):
  
  
  def annotateSNPFromDBList(snpLocationList, snpDict, dbList, cachePages=None):
-    if os.environ.get("CISTEMATIC_TEMP"):
-        cisTemp = os.environ.get("CISTEMATIC_TEMP")
-    else:
-        cisTemp = "/tmp"
  
+    configParser = getConfigParser()
+    cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
      tempfile.tempdir = cisTemp
  
      for dbFileName in dbList:
diff --git a/colsum.py b/colsum.py

index 703bd5ce47ae4b136321c42a6debe8f6b7908d01..f6d1ff968c5260e35e876fb79a82ac201b088585 100755 (executable)
--- a/colsum.py
+++ b/colsum.py
@@ -4,7 +4,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "version 1.2"
+    print "colsum: version 1.3"
      if len(argv) < 3:
          print "usage: python %s field filename" % argv[0]
          print "\n\tfields are counted starting at zero.\n"
diff --git a/combineRPKMs.py b/combineRPKMs.py

index 8fd8f9f56b80d6af257cccdfdbad8f2e8fef1c9b..ead4e1b81d0b37335fbf00908f1aea615cec33dd 100755 (executable)
--- a/combineRPKMs.py
+++ b/combineRPKMs.py
@@ -3,14 +3,17 @@
  #  ENRAGE
  #
  
-print 'version 1.0'
+print "combineRPKMs: version 1.1"
  try:
      import psyco
      psyco.full()
  except:
      pass
  
-import sys, optparse
+import sys
+import optparse
+import string
+from commoncode import getConfigParser, getConfigBoolOption
  
  
  def main(argv=None):
@@ -18,9 +21,7 @@ def main(argv=None):
          argv = sys.argv
  
      usage = "usage: python %prog firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]"
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
-    parser.set_defaults(doFraction=False)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -35,22 +36,32 @@ def main(argv=None):
      combineRPKMs(firstfile, expandedfile, finalfile, outfile, options.doFraction)
  
  
+def makeParser(usage=""):
+    """ returns the combineRPKMs option parser; --withmultifraction defaults to
+        the doFraction setting of the [combineRPKMs] config section (else False).
+    """
+    doFraction = getConfigBoolOption(getConfigParser(), "combineRPKMs", "doFraction", False)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--withmultifraction", action="store_true", dest="doFraction")
+    parser.set_defaults(doFraction=doFraction)
+
+    return parser
+
+
  def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
-    firstfile = open(firstfileName)
-    expandedfile = open(expandedfileName)
-    finalfile = open(finalfileName)
-    outfile = open(outfileName, "w")
  
      firstDict = {}
-    gidDict = {}
-    expandedDict = {}
-
+    firstfile = open(firstfileName)
      for line in firstfile:
          fields = line.strip().split()
          firstDict[fields[1]] = fields[-1]
  
      firstfile.close()
  
+    expandedDict = {}
+    gidDict = {}
+    expandedfile = open(expandedfileName)
      for line in expandedfile:
          fields = line.strip().split()
          expandedDict[fields[1]] = fields[-1]
@@ -63,21 +74,24 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
      else:
          header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\n"
  
+    outfile = open(outfileName, "w")
      outfile.write(header)
  
+    finalfile = open(finalfileName)
      for line in finalfile:
          fields = line.strip().split()
          gene = fields[0]
          rnakb = fields[1]
          finalRPKM = fields[2]
          firstRPKM = firstDict.get(gene, "")
-        outline = "%s\t%s\t%s\t%s\t%s\t%s" % (gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM)
+        outputFields = [gidDict[gene], rnakb, gene, firstRPKM, expandedDict[gene], finalRPKM]
  
          if doFraction:
              fraction = fields[3]
-            outline += "\t%s" % fraction
-    
-        outfile.write(outline + '\n')
+            outputFields.append(fraction)
+
+        outline = "%s\n" % string.join(outputFields, "\t")
+        outfile.write(outline)
  
      finalfile.close()
      outfile.close()
diff --git a/combinerds.py b/combinerds.py

index 7eac48affb101be5a1a7e570b3c152c86c58247e..4c826b04ed6c668a3d5b2a66ee4b842789a483a0 100755 (executable)
--- a/combinerds.py
+++ b/combinerds.py
@@ -10,9 +10,9 @@ except:
      pass
  
  import sys
-from commoncode import readDataset
+import ReadDataset
  
-print '%s: version 1.1' % sys.argv[0]
+print "combinerds: version 1.2"
  
  
  def main(argv=None):
@@ -43,16 +43,16 @@ def main(argv=None):
      print "destination RDS: %s" % datafile
  
      if '--initrna' in argv:
-        rds = readDataset(datafile, initialize=True, datasetType='RNA')
+        rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA')
      elif '--init' in argv:
-        rds = readDataset(datafile, initialize=True)
+        rds = ReadDataset.ReadDataset(datafile, initialize=True)
  
      withFlag = ''
      if '--flag' in argv:
          withFlag = argv[sys.argv.index('-flag') + 1]
          print "restrict to flag = %s" % withFlag
  
-    rds = readDataset(datafile, verbose=True, cache=doCache)
+    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache)
  
      if cachePages > rds.getDefaultCacheSize():
          rds.setDBcache(cachePages)
diff --git a/commoncode.py b/commoncode.py

index 9d864738fef6828ad7bc8e5a4c45079eab84edbc..821936a30bc2524f0aa1664de1a66ecb8ae47fd6 100755 (executable)
--- a/commoncode.py
+++ b/commoncode.py
@@ -3,25 +3,19 @@
  #  ENRAGE
  #
  
-import tempfile
-import shutil
+import ConfigParser
  import os
-from os import environ
  import string
-import sqlite3 as sqlite
  from time import strftime
  from array import array
  from collections import defaultdict
+import Peak
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.genomes import Genome
+import Region
  
-commoncodeVersion = 5.5
-currentRDSversion = 1.1
-
-if environ.get("CISTEMATIC_TEMP"):
-    cisTemp = environ.get("CISTEMATIC_TEMP")
-else:
-    cisTemp = "/tmp"
-
-tempfile.tempdir = cisTemp
+commoncodeVersion = 5.6
+currentRDSversion = 2.0
  
  
  def getReverseComplement(base):
@@ -57,11 +51,91 @@ def writeLog(logFile, messenger, message):
      logfile.close()
  
  
+def getGeneInfoDict(genome, cache=False):
+    """ returns all gene info for genome from the cistematic geneinfoDB;
+        dmelanogaster entries are requested with infoKey="locus".
+    """
+    idb = geneinfoDB(cache=cache)
+    if genome != "dmelanogaster":
+        return idb.getallGeneInfo(genome)
+    return idb.getallGeneInfo(genome, infoKey="locus")
+
+
+def getGeneAnnotDict(genome, inRAM=False):  # annotation dict for genome; "" below skips extendFeatures
+    return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
+
+
+def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
+    """ returns allAnnotInfo() for the genome, after optionally extending its
+        features with extendGenome (skipped when extendGenome is "").
+    """
+    hg = Genome(genome, inRAM=inRAM)
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+    return hg.allAnnotInfo()
+
+
+def getConfigParser(fileList=[]):
+    """ returns a SafeConfigParser loaded from erange.config, ~/.erange.config
+        and then any files in fileList; unreadable files are skipped by read().
+    """
+    configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
+    configFiles.extend(fileList)
+    config = ConfigParser.SafeConfigParser()
+    config.read(configFiles)
+    return config
+
+
+def getConfigOption(parser, section, option, default=None):
+    """ returns the raw (string) setting for option in section,
+        or default when the section or the option is not defined. """
+    try:
+        return parser.get(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
+        return default
+
+
+def getConfigIntOption(parser, section, option, default=None):
+    """ returns option from section as an int; default is returned when the section
+        or option is missing or, as in getConfigBoolOption, the value is malformed. """
+    try:
+        return parser.getint(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
+        return default
+
+
+def getConfigFloatOption(parser, section, option, default=None):
+    """ returns option from section as a float; default is returned when the section
+        or option is missing or, as in getConfigBoolOption, the value is malformed. """
+    try:
+        return parser.getfloat(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
+        return default
+
+
+def getConfigBoolOption(parser, section, option, default=None):
+    """ returns option from section as a boolean; default is returned when the
+        section or option is missing or the value is not a recognized boolean. """
+    try:
+        return parser.getboolean(section, option)
+    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
+        return default
+
+
+def getAllConfigSectionOptions(parser, section):
+    """ returns the (name, value) pairs defined for section,
+        or an empty list when the section does not exist. """
+    try:
+        return parser.items(section)
+    except ConfigParser.NoSectionError:
+        return []
+
+
  def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
-                     fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
+                     fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
                       doMerge=True, keepPeak=False, returnTop=0):
  
-    """ returns a list of merged overlapping regions; 
+    """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
      can optionally filter regions that have a scoreField fewer than minHits.
      Can also optionally return the label of each region, as well as the
      peak, if supplied (peakPos and peakHeight should be the last 2 fields).
@@ -81,7 +155,7 @@ def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, kee
  def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
                       fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
                       doMerge=True, keepPeak=False, returnTop=0):
-    """ returns a list of merged overlapping regions; 
+    """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
      can optionally filter regions that have a scoreField fewer than minHits.
      Can also optionally return the label of each region, as well as the
      peak, if supplied (peakPos and peakHeight should be the last 2 fields).
@@ -159,7 +233,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
          if not fullChrom:
              chrom = chrom[3:]
  
-        length = abs(stop - start)
          if keepPeak:
              peakPos = int(fields[-2 - hasPvalue - hasShift])
              peakHeight = float(fields[-1 - hasPvalue - hasShift])
@@ -171,15 +244,9 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
  
          if doMerge and len(regions[chrom]) > 0:
              for index in range(len(regions[chrom])):
-                if keepLabel and keepPeak:
-                    (rlabel, rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
-                elif keepLabel:
-                    (rlabel, rstart, rstop, rlen) = regions[chrom][index]
-                elif keepPeak:
-                    (rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index]
-                else:
-                    (rstart, rstop, rlen) = regions[chrom][index]
-
+                region = regions[chrom][index]
+                rstart = region.start
+                rstop = region.stop
                  if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
                      if start < rstart:
                          rstart = start
@@ -187,35 +254,38 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
                      if rstop < stop:
                          rstop = stop
  
-                    rlen = abs(rstop - rstart)
                      if keepPeak:
+                        rpeakPos = region.peakPos
+                        rpeakHeight = region.peakHeight
                          if peakHeight > rpeakHeight:
                              rpeakHeight = peakHeight
                              rpeakPos = peakPos
  
-                    if keepLabel and keepPeak:
-                        regions[chrom][index] = (label, rstart, rstop, rlen, rpeakPos, rpeakHeight)
-                    elif keepLabel:
-                        regions[chrom][index] = (label, rstart, rstop, rlen)
-                    elif keepPeak:
-                        regions[chrom][index] = (rstart, rstop, rlen, rpeakPos, rpeakHeight)
-                    else:
-                        regions[chrom][index] = (rstart, rstop, rlen)
+                    regions[chrom][index].start = rstart
+                    regions[chrom][index].stop = rstop
+                    regions[chrom][index].length = abs(rstop - rstart)
+                    if keepLabel:
+                        regions[chrom][index].label = label
+
+                    if keepPeak:
+                        regions[chrom][index].peakPos = rpeakPos
+                        regions[chrom][index].peakHeight = rpeakHeight
+
  
                      mergeCount += 1
                      merged = True
                      break
  
          if not merged:
-            if keepLabel and keepPeak:
-                regions[chrom].append((label, start, stop, length, peakPos, peakHeight))
-            elif keepLabel:
-                regions[chrom].append((label, start, stop, length))
-            elif keepPeak:
-                regions[chrom].append((start, stop, length, peakPos, peakHeight))
-            else:
-                regions[chrom].append((start, stop, length))
+            region = Region.Region(start, stop)
+            if keepLabel:
+                region.label = label
  
+            if keepPeak:
+                region.peakPos = peakPos
+                region.peakHeight = peakHeight
+
+            regions[chrom].append(region)
              count += 1
  
          if verbose and (count % 100000 == 0):
@@ -224,10 +294,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
      regionCount = 0
      for chrom in regions:
          regionCount += len(regions[chrom])
-        if keepLabel:
-            regions[chrom].sort(cmp=lambda x,y:cmp(x[1], y[1]))
-        else:
-            regions[chrom].sort()
+        regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
  
      if verbose:
          print "merged %d times" % mergeCount
@@ -257,7 +324,7 @@ def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
  
  
  def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
-             shift=0, returnShift=False, maxshift=75):
+             shift=0, maxshift=75):
      """ find the peak in a list of reads (hitlist) in a region
      of a given length and absolute start point. returns a
      list of peaks, the number of hits, a triangular-smoothed
@@ -268,82 +335,35 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
      the peak, taken to be the first TopPos position.
      """
  
-    seqArray = array("f", [0.] * length)
-    smoothArray = array("f", [0.] * length)
-    numHits = 0.
-    numPlus = 0.
-    regionArray = []
      if shift == "auto":
          shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
  
-    # once we have the best shift, compute seqArray
-    for read in hitList:
-        currentpos = read[0] - start
-        if read[1] == "+":
-            currentpos += shift
-        else:
-            currentpos -= shift
-
-        if (currentpos <  1 - readlen) or (currentpos >= length):
-            continue
-
-        hitIndex = 0
-        if doWeight:
-            weight = read[2]
-        else:
-            weight = 1.0
-
-        numHits += weight
-        if leftPlus:
-            regionArray.append(read)
-
-        while currentpos < 0:
-            hitIndex += 1
-            currentpos += 1
-
-        while hitIndex < readlen and  currentpos < length:
-            seqArray[currentpos] += weight
-            hitIndex += 1
-            currentpos += 1
-
-        if read[1] == "+":
-            numPlus += weight
+    seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
  
      # implementing a triangular smooth
+    smoothArray = array("f", [0.] * length)
      for pos in range(2,length -2):
          smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
  
-    topNucleotide = 0
-    topPos = []
-    for currentpos in xrange(length):
-        if topNucleotide < smoothArray[currentpos]:
-            topNucleotide = smoothArray[currentpos]
-            topPos = [currentpos]
-        elif topNucleotide  == smoothArray[currentpos]:
-            topPos.append(currentpos)
+    topPos = getPeakPositionList(smoothArray, length)
+    peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)  # "import Peak" binds the module; class presumably Peak.Peak, like Region.Region
  
      if leftPlus:
          numLeftPlus = 0
          maxPos = topPos[0]
          for read in regionArray:
              if doWeight:
-                weight = read[2]
+                weight = read["weight"]
              else:
                  weight = 1.0
  
-            currentPos = read[0] - start
-            if currentPos <= maxPos and read[1] == "+":
+            currentPos = read["start"] - start
+            if currentPos <= maxPos and read["sense"] == "+":
                  numLeftPlus += weight
  
-        if returnShift:
-            return (topPos, numHits, smoothArray, numPlus, numLeftPlus, shift)
-        else:
-            return (topPos, numHits, smoothArray, numPlus, numLeftPlus)
-    else:
-        if returnShift:
-            return (topPos, numHits, smoothArray, numPlus, shift)
-        else:
-            return (topPos, numHits, smoothArray, numPlus)
+        peak.numLeftPlus = numLeftPlus
+
+    return peak
  
  
  def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
@@ -352,8 +372,8 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
      for testShift in xrange(maxShift + 1):
          shiftArray = array("f", [0.] * length)
          for read in hitList:
-            currentpos = read[0] - start
-            if read[1] == "+":
+            currentpos = read["start"] - start
+            if read["sense"] == "+":
                  currentpos += testShift
              else:
                  currentpos -= testShift
@@ -362,11 +382,11 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
                  continue
  
              if doWeight:
-                weight = read[2]
+                weight = read["weight"]
              else:
                  weight = 1.0
  
-            if read[1] == "+":
+            if read["sense"] == "+":
                  shiftArray[currentpos] += weight
              else:
                  shiftArray[currentpos] -= weight
@@ -383,6 +403,59 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
      return bestShift
  
  
+def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
+    """ builds the coverage array for the region [start, start + length): + reads are
+        moved right by shift, all other reads left. returns (seqArray, regionArray,
+        numHits, numPlus); regionArray collects the counted reads only if leftPlus.
+    """
+    numHits = 0.
+    numPlus = 0.
+    regionArray = []
+    seqArray = array("f", [0.] * length)
+    for read in hitList:
+        offset = read["start"] - start
+        isPlus = read["sense"] == "+"
+        if isPlus:
+            offset += shift
+        else:
+            offset -= shift
+
+        # skip reads that cannot touch the region after shifting
+        if (offset < 1 - readlen) or (offset >= length):
+            continue
+
+        weight = 1.0
+        if doWeight:
+            weight = read["weight"]
+
+        numHits += weight
+        if isPlus:
+            numPlus += weight
+        if leftPlus:
+            regionArray.append(read)
+
+        # add the weight at each read position that falls inside the region
+        firstPos = max(offset, 0)
+        lastPos = min(offset + readlen, length)
+        for pos in range(firstPos, lastPos):
+            seqArray[pos] += weight
+
+    return seqArray, regionArray, numHits, numPlus
+
+
+def getPeakPositionList(smoothArray, length):
+    """ returns the positions below length where smoothArray equals its maximum (floored at 0). """
+    best, positions = 0, []
+    for pos in xrange(length):
+        height = smoothArray[pos]
+        if height > best:
+            best, positions = height, [pos]
+        elif height == best:
+            positions.append(pos)
+
+    return positions
+
+
  def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
                             restrictList=[], regionComplement=False, maxStop=250000000):
      """ return a dictionary of cistematic gene features. Requires
@@ -402,7 +475,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
      if len(additionalRegionsDict) > 0:
          sortList = []
          for chrom in additionalRegionsDict:
-            for (label, start, stop, length) in additionalRegionsDict[chrom]:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
                  if label not in sortList:
                      sortList.append(label)
  
@@ -412,7 +486,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
                  else:
                      sense = featuresDict[label][0][-1]
  
-                featuresDict[label].append(("custom", chrom, start, stop, sense))
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
  
          for gid in sortList:
              featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
@@ -525,7 +599,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
      if len(additionalRegionsDict) > 0:
          sortList = []
          for chrom in additionalRegionsDict:
-            for (label, start, stop, length) in additionalRegionsDict[chrom]:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
                  if label not in sortList:
                      sortList.append(label)
  
@@ -535,7 +610,7 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                  else:
                      sense = featuresDict[label][0][-1]
  
-                featuresDict[label].append(("custom", chrom, start, stop, sense))
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
  
          for gid in sortList:
              featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
@@ -705,7 +780,9 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
  
          print "%s\n" % chrom
          startRegion = 0
-        for (tagStart, sense, weight) in hitDict[chrom]:
+        for read in hitDict[chrom]:
+            tagStart = read["start"]
+            weight = read["weight"]
              index += 1
              if index % 100000 == 0:
                  print "read %d " % index,
@@ -773,1296 +850,3 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                      stopPoint = stop
  
      return (regionsBins, regionsLen)
-
-
-# TODO: The readDataset class is going to be replaced by Erange.ReadDataset but this will
-# require going through all the code to make the changes needed.  Major project for another
-# day, but it really needs to be done
-class readDataset:
-    """ Class for storing reads from experiments. Assumes that custom scripts
-    will translate incoming data into a format that can be inserted into the
-    class using the insert* methods. Default class subtype ('DNA') includes
-    tables for unique and multireads, whereas 'RNA' subtype also includes a
-    splices table.
-    """
-
-    def __init__(self, datafile, initialize=False, datasetType='', verbose=False, 
-                 cache=False, reportCount=True):
-        """ creates an rds datafile if initialize is set to true, otherwise
-        will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
-        """
-        self.dbcon = ""
-        self.memcon = ""
-        self.dataType = ""
-        self.rdsVersion = "1.1"
-        self.memBacked = False
-        self.memChrom = ""
-        self.memCursor = ""
-        self.cachedDBFile = ""
-
-        if cache:
-            if verbose:
-                print "caching ...."
-
-            self.cacheDB(datafile)
-            dbfile = self.cachedDBFile
-        else:
-            dbfile = datafile
-
-        self.dbcon = sqlite.connect(dbfile)
-        self.dbcon.row_factory = sqlite.Row
-        self.dbcon.execute("PRAGMA temp_store = MEMORY")
-        if initialize:
-            if datasetType == "":
-                self.dataType = "DNA"
-            else:
-                self.dataType = datasetType
-
-            self.initializeTables(self.dbcon)
-        else:
-            metadata = self.getMetadata("dataType")
-            self.dataType = metadata["dataType"]
-
-        try:
-            metadata = self.getMetadata("rdsVersion")
-            self.rdsVersion = metadata["rdsVersion"]
-        except:
-            try:
-                self.insertMetadata([("rdsVersion", currentRDSversion)])
-            except:
-                print "could not add rdsVersion - read-only ?"
-                self.rdsVersion = "pre-1.0"
-
-        if verbose:
-            if initialize:
-                print "INITIALIZED dataset %s" % datafile
-            else:
-                print "dataset %s" % datafile
-
-            metadata = self.getMetadata()
-            print "metadata:"
-            pnameList = metadata.keys()
-            pnameList.sort()
-            for pname in pnameList:
-                print "\t" + pname + "\t" + metadata[pname]
-
-            if reportCount:
-                ucount = self.getUniqsCount()
-                mcount = self.getMultiCount()
-                if self.dataType == "DNA" and not initialize:
-                    try:
-                        print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
-                    except:
-                        print "\n%s unique reads and %s multireads" % (ucount, mcount)
-                elif self.dataType == 'RNA' and not initialize:
-                    scount = self.getSplicesCount()
-                    try:
-                        print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
-                    except:
-                        print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
-
-            print "default cache size is %d pages" % self.getDefaultCacheSize()
-            if self.hasIndex():
-                print "found index"
-            else:
-                print "not indexed"
-
-
-    def __len__(self):
-        """ return the number of usable reads in the dataset.
-        """
-        try:
-            total = self.getUniqsCount()
-        except:
-            total = 0
-
-        try:
-            total += self.getMultiCount()
-        except:
-            pass
-
-        if self.dataType == "RNA":
-            try:
-                total += self.getSplicesCount()
-            except:
-                pass
-
-        try:
-            total = int(total)
-        except:
-            total = 0
-
-        return total
-
-
-    def __del__(self):
-        """ cleanup copy in local cache, if present.
-        """
-        if self.cachedDBFile != "":
-            self.uncacheDB()
-
-
-    def cacheDB(self, filename):
-        """ copy geneinfoDB to a local cache.
-        """
-        self.cachedDBFile = tempfile.mktemp() + ".db"
-        shutil.copyfile(filename, self.cachedDBFile)
-
-
-    def saveCacheDB(self, filename):
-        """ copy geneinfoDB to a local cache.
-        """
-        shutil.copyfile(self.cachedDBFile, filename)
-
-
-    def uncacheDB(self):
-        """ delete geneinfoDB from local cache.
-        """
-        global cachedDBFile
-        if self.cachedDBFile != "":
-            try:
-                os.remove(self.cachedDBFile)
-            except:
-                print "could not delete %s" % self.cachedDBFile
-
-            self.cachedDB = ""
-
-
-    def attachDB(self, filename, asname):
-        """ attach another database file to the readDataset.
-        """
-        stmt = "attach '%s' as %s" % (filename, asname)
-        self.execute(stmt)
-
-
-    def detachDB(self, asname):
-        """ detach a database file to the readDataset.
-        """
-        stmt = "detach %s" % (asname)
-        self.execute(stmt)
-
-
-    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
-        """ import into current RDS the table (with columns destcolumns,
-            with default all columns) from the database file asname,
-            using the column specification of ascolumns (default all).
-        """
-        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
-        if flagged != "":
-            stmt += " where flag = '%s' " % flagged
-
-        self.execute(stmt, forceCommit=True)
-
-
-    def getTables(self, asname=""):
-        """ get a list of table names in a particular database file.
-        """
-        resultList = []
-
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        if asname != "":
-            asname += "."
-
-        stmt = "select name from %ssqlite_master where type='table'" % asname
-        sql.execute(stmt)
-        results = sql.fetchall()
-
-        for row in results:
-            resultList.append(row["name"])
-
-        return resultList
-
-
-    def hasIndex(self):
-        """ check whether the RDS file has at least one index.
-        """
-        stmt = "select count(*) from sqlite_master where type='index'"
-        count = int(self.execute(stmt, returnResults=True)[0][0])
-        if count > 0:
-            return True
-
-        return False
-
-
-    def initializeTables(self, acon, cache=100000):
-        """ creates table schema in database connection acon, which is
-        typically a database file or an in-memory database.
-        """
-        acon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
-        acon.execute("create table metadata (name varchar, value varchar)")
-        acon.execute("insert into metadata values('dataType','%s')" % self.dataType)
-        acon.execute("create table uniqs (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
-        acon.execute("create table multi (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)")
-        if self.dataType == "RNA":
-            acon.execute("create table splices (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, startL int, stopL int, startR int, stopR int, sense varchar, weight real, flag varchar, mismatch varchar)")
-
-        acon.commit()
-
-
-    def getFileCursor(self):
-        """ returns a cursor to file database for low-level (SQL)
-        access to the data.
-        """
-        return self.dbcon.cursor()
-
-
-    def getMemCursor(self):
-        """ returns a cursor to memory database for low-level (SQL)
-        access to the data.
-        """
-        return self.memcon.cursor()
-
-
-    def getMetadata(self, valueName=""):
-        """ returns a dictionary of metadata.
-        """
-        whereClause = ""
-        resultsDict = {}
-
-        if valueName != "":
-            whereClause = " where name = '%s' " % valueName
-
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        sql.execute("select name, value from metadata" + whereClause)
-        results = sql.fetchall()
-
-        for row in results:
-            pname = row["name"]
-            pvalue = row["value"]
-            if pname not in resultsDict:
-                resultsDict[pname] = pvalue
-            else:
-                trying = True
-                index = 2
-                while trying:
-                    newName = pname + ":" + str(index)
-                    if newName not in resultsDict:
-                        resultsDict[newName] = pvalue
-                        trying = False
-
-                    index += 1
-
-        return resultsDict
-
-
-    def getReadSize(self):
-        """ returns readsize if defined in metadata.
-        """
-        metadata = self.getMetadata()
-        if "readsize" not in metadata:
-            print "no readsize parameter defined - returning 0"
-            return 0
-        else:
-            mysize = metadata["readsize"]
-            if "import" in mysize:
-                mysize = mysize.split()[0]
-
-            return int(mysize)
-
-
-    def getDefaultCacheSize(self):
-        """ returns the default cache size.
-        """
-        return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
-
-
-    def getChromosomes(self, table="uniqs", fullChrom=True):
-        """ returns a list of distinct chromosomes in table.
-        """
-        statement = "select distinct chrom from %s" % table
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        sql.execute(statement)
-        results = []
-        for row in sql:
-            if fullChrom:
-                if row["chrom"] not in results:
-                    results.append(row["chrom"])
-            else:
-                if  len(row["chrom"][3:].strip()) < 1:
-                    continue
-
-                if row["chrom"][3:] not in results:
-                    results.append(row["chrom"][3:])
-
-        results.sort()
-
-        return results
-
-
-    def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
-                         doMulti=False, doSplices=False):
-        """ returns the maximum coordinate for reads on a given chromosome.
-        """
-        maxCoord = 0
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        if doUniqs:
-            try:
-                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
-                maxCoord = int(sql.fetchall()[0][0])
-            except:
-                print "couldn't retrieve coordMax for chromosome %s" % chrom
-
-        if doSplices:
-            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
-            try:
-                spliceMax = int(sql.fetchall()[0][0])
-                if spliceMax > maxCoord:
-                    maxCoord = spliceMax
-            except:
-                pass
-
-        if doMulti:
-            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
-            try:
-                multiMax = int(sql.fetchall()[0][0])
-                if multiMax > maxCoord:
-                    maxCoord = multiMax
-            except:
-                pass
-
-        if verbose:
-            print "%s maxCoord: %d" % (chrom, maxCoord)
-
-        return maxCoord
-
-
-    def getReadsDict(self, verbose=False, bothEnds=False, noSense=False, fullChrom=False, chrom="",
-                     flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
-                     withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
-                     readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
-                     flagLike=False, strand="", entryDict=False, combine5p=False):
-        """ returns a dictionary of reads in a variety of formats
-        and which can be restricted by chromosome or custom-flag.
-        Returns unique reads by default, but can return multireads
-        with doMulti set to True.
-        """
-        whereClause = []
-        resultsDict = {}
-
-        if chrom != "" and chrom != self.memChrom:
-            whereClause.append("chrom = '%s'" % chrom)
-
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
-            else:
-                whereClause.append("flag = '%s'" % flag)
-
-        if start > -1:
-            whereClause.append("start > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stop < %d" % stop)
-
-        if len(readLike) > 0:
-            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
-            whereClause.append(readIDClause)
-
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
-
-        if strand in ["+", "-"]:
-            whereClause.append("sense = '%s'" % strand)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
-
-        groupBy = []
-        if findallOptimize:
-            selectClause = ["select start, sense, sum(weight)"]
-            groupBy = ["GROUP BY start, sense"]
-        else:
-            selectClause = ["select ID, chrom, start, readID"]
-            if bothEnds:
-                selectClause.append("stop")
-
-            if not noSense:
-                selectClause.append("sense")
-
-            if withWeight:
-                selectClause.append("weight")
-
-            if withFlag:
-                selectClause.append("flag")
-
-            if withMismatch:
-                selectClause.append("mismatch")
-
-        if limit > 0 and not combine5p:
-            groupBy.append("LIMIT %d" % limit)
-
-        selectQuery = string.join(selectClause, ",")
-        groupQuery = string.join(groupBy)
-        if doUniqs:
-            stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
-            if doMulti:
-                stmt.append("UNION ALL")
-                stmt.append(selectQuery)
-                stmt.append("from multi")
-                stmt.append(whereQuery)
-                stmt.append(groupQuery)
-        else:
-            stmt = [selectQuery, "from multi", whereQuery]
-
-        if combine5p:
-            if findallOptimize:
-                selectQuery = "select start, sense, weight, chrom"
-
-            if doUniqs:
-                subSelect = [selectQuery, "from uniqs", whereQuery]
-                if doMulti:
-                    subSelect.append("union all")
-                    subSelect.append(selectQuery)
-                    subSelect.append("from multi")
-                    subSelect.append(whereQuery)
-            else:
-                subSelect = [selectQuery, "from multi", whereQuery]
-
-            sqlStmt = string.join(subSelect)
-            if findallOptimize:
-                selectQuery = "select start, sense, sum(weight)"
-
-            stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
-                    selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
-
-        if findallOptimize:
-            if self.memBacked:
-                self.memcon.row_factory = None
-                sql = self.memcon.cursor()
-            else:
-                self.dbcon.row_factory = None
-                sql = self.dbcon.cursor()
-
-            stmt.append("order by start")
-        elif readIDDict:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
-            stmt.append("order by readID, start")
-        else:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
-            stmt.append("order by chrom, start")
-
-        sqlQuery = string.join(stmt)
-        sql.execute(sqlQuery)
-
-        if findallOptimize:
-            resultsDict[chrom] = [[int(row[0]), row[1], float(row[2])] for row in sql]
-            if self.memBacked:
-                self.memcon.row_factory = sqlite.Row
-            else:
-                self.dbcon.row_factory = sqlite.Row
-        else:
-            currentChrom = ""
-            currentReadID = ""
-            pairID = 0
-            for row in sql:
-                readID = row["readID"]
-                if fullChrom:
-                    chrom = row["chrom"]
-                else:
-                    chrom = row["chrom"][3:]
-
-                if not readIDDict and chrom != currentChrom:
-                    resultsDict[chrom] = []
-                    currentChrom = chrom
-                    dictKey = chrom
-                elif readIDDict:
-                    theReadID = readID
-                    if "::" in readID:
-                        (theReadID, multiplicity) = readID.split("::")
-
-                    if "/" in theReadID and withPairID:
-                        (theReadID, pairID) = readID.split("/")
-
-                    if theReadID != currentReadID:
-                        resultsDict[theReadID] = []
-                        currentReadID = theReadID
-                        dictKey = theReadID
-
-                if entryDict:
-                    newrow = {"start": int(row["start"])}
-                    if bothEnds:
-                        newrow["stop"] = int(row["stop"])
-
-                    if not noSense:
-                        newrow["sense"] = row["sense"]
-
-                    if withWeight:
-                        newrow["weight"] = float(row["weight"])
-
-                    if withFlag:
-                        newrow["flag"] = row["flag"]
-
-                    if withMismatch:
-                        newrow["mismatch"] = row["mismatch"]
-
-                    if withID:
-                        newrow["readID"] = readID
-
-                    if withChrom:
-                        newrow["chrom"] = chrom
-
-                    if withPairID:
-                        newrow["pairID"] = pairID
-                else:
-                    newrow = [int(row["start"])]
-                    if bothEnds:
-                        newrow.append(int(row["stop"]))
-
-                    if not noSense:
-                        newrow.append(row["sense"])
-
-                    if withWeight:
-                        newrow.append(float(row["weight"]))
-
-                    if withFlag:
-                        newrow.append(row["flag"])
-
-                    if withMismatch:
-                        newrow.append(row["mismatch"])
-
-                    if withID:
-                        newrow.append(readID)
-
-                    if withChrom:
-                        newrow.append(chrom)
-
-                    if withPairID:
-                        newrow.append(pairID)
-
-                resultsDict[dictKey].append(newrow)
-
-        return resultsDict
-
-
-    def getSplicesDict(self, verbose=False, noSense=False, fullChrom=False, chrom="",
-                       flag="", withWeight=False, withFlag=False, withMismatch=False,
-                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
-                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
-                       stop=-1, strand="", entryDict=False):
-        """ returns a dictionary of spliced reads in a variety of
-        formats and which can be restricted by chromosome or custom-flag.
-        Returns unique spliced reads for now.
-        """
-        whereClause = []
-        resultsDict = {}
-
-        if chrom != "" and chrom != self.memChrom:
-            whereClause = ["chrom = '%s'" % chrom]
-
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
-            else:
-                whereClause.append("flag = '%s'" % flag)
-
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
-
-        if strand != "":
-            whereClause.append("sense = '%s'" % strand)
-
-        if start > -1:
-            whereClause.append("startL > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stopR < %d" % stop)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
-
-        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
-        if not noSense:
-            selectClause.append("sense")
-
-        if withWeight:
-            selectClause.append("weight")
-
-        if withFlag:
-            selectClause.append("flag")
-
-        if withMismatch:
-            selectClause.append("mismatch")
-
-        selectQuery = string.join(selectClause, " ,")
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        if chrom == "" and not readIDDict:
-            stmt = "select distinct chrom from splices %s" % whereQuery
-            sql.execute(stmt)
-            for row in sql:
-                if fullChrom:
-                    chrom = row["chrom"]
-                else:
-                    chrom = row["chrom"][3:]
-
-                resultsDict[chrom] = []
-        elif chrom != "" and not readIDDict:
-            resultsDict[chrom] = []
-
-        stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
-        sql.execute(stmt)
-        currentReadID = ""
-        for row in sql:
-            pairID = 0
-            readID = row["readID"]
-            if fullChrom:
-                chrom = row["chrom"]
-            else:
-                chrom = row["chrom"][3:]
-
-            if readIDDict:
-                if "/" in readID:
-                    (theReadID, pairID) = readID.split("/")
-                else:
-                    theReadID = readID
-
-                if theReadID != currentReadID:
-                    resultsDict[theReadID] = []
-                    currentReadID = theReadID
-                    dictKey = theReadID
-            else:
-                dictKey = chrom
-
-            if entryDict:
-                newrow = {"startL": int(row["startL"])}
-                newrow["stopL"] = int(row["stopL"])
-                newrow["startR"] = int(row["startR"])
-                newrow["stopR"] = int(row["stopR"])
-                if not noSense:
-                    newrow["sense"] = row["sense"]
-
-                if withWeight:
-                    newrow["weight"] = float(row["weight"])
-
-                if withFlag:
-                    newrow["flag"] = row["flag"]
-
-                if withMismatch:
-                    newrow["mismatch"] = row["mismatch"]
-
-                if withID:
-                    newrow["readID"] = readID
-
-                if withChrom:
-                    newrow["chrom"] = chrom
-
-                if withPairID:
-                    newrow["pairID"] = pairID
-
-                if splitRead:
-                    leftDict = newrow
-                    del leftDict["startR"]
-                    del leftDict["stopR"]
-                    rightDict = newrow
-                    del rightDict["start"]
-                    del rightDict["stopL"]
-                    resultsDict[dictKey].append(leftDict)
-                    resultsDict[dictKey].append(rightDict)
-                else:
-                    resultsDict[dictKey].append(newrow)
-            else:
-                newrow = [int(row["startL"])]
-                newrow.append(int(row["stopL"]))
-                newrow.append(int(row["startR"]))
-                newrow.append(int(row["stopR"]))
-                if not noSense:
-                    newrow.append(row["sense"])
-
-                if withWeight:
-                    newrow.append(float(row["weight"]))
-
-                if withFlag:
-                    newrow.append(row["flag"])
-
-                if withMismatch:
-                    newrow.append(row["mismatch"])
-
-                if withID:
-                    newrow.append(readID)
-
-                if withChrom:
-                    newrow.append(chrom)
-
-                if withPairID:
-                    newrow.append(pairID)
-
-                if splitRead:
-                    resultsDict[dictKey].append(newrow[:2] + newrow[4:])
-                    resultsDict[dictKey].append(newrow[2:])
-                else:
-                    resultsDict[dictKey].append(newrow)
-
-        return resultsDict
-
-
-    def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
-                  splices=False, reportCombined=True, sense="both"):
-        """ return read counts for a given region.
-        """
-        ucount = 0
-        mcount = 0
-        scount = 0
-        restrict = ""
-        if sense in ["+", "-"]:
-            restrict = " sense ='%s' " % sense
-
-        if uniqs:
-            try:
-                ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
-            except:
-                ucount = 0
-
-        if multi:
-            try:
-                mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
-            except:
-                mcount = 0
-
-        if splices:
-            try:
-                scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
-            except:
-                scount = 0
-
-        if reportCombined:
-            total = ucount + mcount + scount
-            return total
-        else:
-            return (ucount, mcount, scount)
-
-
-    def getTotalCounts(self, chrom="", rmin="", rmax=""):
-        return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
-
-
-    def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
-        """ returns the number of row in the uniqs table.
-        """
-        whereClause = []
-        count = 0
-
-        if chrom !=""  and chrom != self.memChrom:
-            whereClause = ["chrom='%s'" % chrom]
-
-        if rmin != "":
-            whereClause.append("%s >= %s" % (startField, str(rmin)))
-
-        if rmax != "":
-            whereClause.append("%s <= %s" % (startField, str(rmax)))
-
-        if restrict != "":
-            whereClause.append(restrict)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
-
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        if distinct:
-            sql.execute("select count(distinct chrom+start+sense) from %s %s" % (table, whereQuery))
-        else:
-            sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
-
-        result = sql.fetchone()
-
-        try:
-            count = int(result[0])
-        except:
-            count = 0
-
-        return count
-
-
-    def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
-        """ returns the number of row in the splices table.
-        """
-        return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
-
-
-    def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
-        """ returns the number of distinct readIDs in the uniqs table.
-        """
-        return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
-
-
-    def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
-        """ returns the total weight of readIDs in the multi table.
-        """
-        return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
-
-
-    def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
-        """ get readID's.
-        """
-        stmt = []
-        limitPart = ""
-        if limit > 0:
-            limitPart = "LIMIT %d" % limit
-
-        if uniqs:
-            stmt.append("select readID from uniqs")
-
-        if multi:
-            stmt.append("select readID from multi")
-
-        if splices:
-            stmt.append("select readID from splices")
-
-        if len(stmt) > 0:
-            selectPart = string.join(stmt, " union ")
-        else:
-            selectPart = ""
-
-        sqlQuery = "%s group by readID %s" (selectPart, limitPart)
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        sql.execute(sqlQuery)
-        result = sql.fetchall()
-
-        if paired:
-            return [x.split("/")[0][0] for x in result]
-        else:
-            return [x[0] for x in result]
-
-
-    def getMismatches(self, mischrom = None, verbose=False, useSplices=True):
-        """ returns the uniq and spliced mismatches in a dictionary.
-        """
-        revcomp = {"A": "T",
-                   "T": "A",
-                   "G": "C",
-                   "C": "G",
-                   "N": "N"
-        }
-
-        readlen = self.getReadSize()
-        if mischrom:
-            hitChromList = [mischrom]
-        else:
-            hitChromList = self.getChromosomes()
-            hitChromList.sort()
-
-        snpDict = {}
-        for achrom in hitChromList:
-            if verbose:
-                print "getting mismatches from chromosome %s" % (achrom)
-
-            snpDict[achrom] = []
-            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, findallOptimize=False, hasMismatch=True)
-            if useSplices and self.dataType == "RNA":
-                spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
-                spliceIDList = spliceDict.keys()
-                for k in spliceIDList:
-                    (startpos, lefthalf, rightstart, endspos, sense, mismatches) = spliceDict[k][0]
-                    spMismatchList = mismatches.split(",")
-                    for mismatch in spMismatchList:
-                        if "N" in mismatch:
-                            continue
-
-                        change_len = len(mismatch)
-                        if sense == "+":
-                            change_from = mismatch[0]
-                            change_base = mismatch[change_len-1]
-                            change_pos = int(mismatch[1:change_len-1])
-                        elif sense == "-":
-                            change_from = revcomp[mismatch[0]]
-                            change_base = revcomp[mismatch[change_len-1]]
-                            change_pos = readlen - int(mismatch[1:change_len-1]) + 1
-
-                        firsthalf = int(lefthalf)-int(startpos)+1
-                        secondhalf = 0
-                        if int(change_pos) <= int(firsthalf):
-                            change_at = startpos + change_pos - 1
-                        else:
-                            secondhalf = change_pos - firsthalf
-                            change_at = rightstart + secondhalf
-
-                        snpDict[achrom].append([startpos, change_at, change_base, change_from])
-
-            if achrom not in hitDict:
-                continue
-
-            for (start, sense, mismatches) in hitDict[achrom]:
-                mismatchList = mismatches.split(",")
-                for mismatch in mismatchList:
-                    if "N" in mismatch:
-                        continue
-
-                    change_len = len(mismatch)
-                    if sense == "+":
-                        change_from = mismatch[0]
-                        change_base = mismatch[change_len-1]
-                        change_pos = int(mismatch[1:change_len-1])
-                    elif sense == "-":
-                        change_from = revcomp[mismatch[0]]
-                        change_base = revcomp[mismatch[change_len-1]]
-                        change_pos = readlen - int(mismatch[1:change_len-1]) + 1
-
-                    change_at = start + change_pos - 1
-                    snpDict[achrom].append([start, change_at, change_base, change_from])
-
-        return snpDict
-
-
-    def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
-                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
-                        keepStrand="both", shiftValue=0):
-        """return a profile of the chromosome as an array of per-base read coverage....
-            keepStrand = 'both', 'plusOnly', or 'minusOnly'.
-            Will also shift position of unique and multireads (but not splices) if shift is a natural number
-        """
-        metadata = self.getMetadata()
-        readlen = int(metadata["readsize"])
-        dataType = metadata["dataType"]
-        scale = 1. / normalizationFactor
-        shift = {}
-        shift["+"] = int(shiftValue)
-        shift["-"] = -1 * int(shiftValue)
-
-        if cstop > 0:
-            lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
-        else:
-            lastNT = cstop - cstart + readlen + shift["+"]
-
-        chromModel = array("f", [0.] * lastNT)
-        hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
-        if cstart < 0:
-            cstart = 0
-
-        for (hstart, sense, weight) in hitDict[chromosome]:
-            hstart = hstart - cstart + shift[sense]
-            for currentpos in range(hstart,hstart+readlen):
-                try:
-                    if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
-                        chromModel[currentpos] += scale * weight
-                    elif sense == '-' and keepStrand != "plusOnly":
-                        chromModel[currentpos] -= scale * weight
-                except:
-                    continue
-
-        del hitDict
-        if useSplices and dataType == "RNA":
-            if cstop > 0:
-                spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
-            else:
-                spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
-   
-            if chromosome in spliceDict:
-                for (Lstart, Lstop, Rstart, Rstop, rsense, readName) in spliceDict[chromosome]:
-                    if (Rstop - cstart) < lastNT:
-                        for index in range(abs(Lstop - Lstart)):
-                            currentpos = Lstart - cstart + index
-                            # we only track unique splices
-                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
-                                chromModel[currentpos] += scale
-                            elif rsense == "-" and keepStrand != "plusOnly":
-                                chromModel[currentpos] -= scale
-
-                        for index in range(abs(Rstop - Rstart)):
-                            currentpos = Rstart - cstart + index
-                            # we only track unique splices
-                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
-                                chromModel[currentpos] += scale
-                            elif rsense == "-" and keepStrand != "plusOnly":
-                                chromModel[currentpos] -= scale
-
-            del spliceDict
-
-        return chromModel
-
-
-    def insertMetadata(self, valuesList):
-        """ inserts a list of (pname, pvalue) into the metadata
-        table.
-        """
-        self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
-        self.dbcon.commit()
-
-
-    def updateMetadata(self, pname, newValue, originalValue=""):
-        """ update a metadata field given the original value and the new value.
-        """
-        stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
-        if originalValue != "":
-            stmt += " and value='%s' " % str(originalValue)
-
-        self.dbcon.execute(stmt)
-        self.dbcon.commit()
-
-
-    def insertUniqs(self, valuesList):
-        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
-        into the uniqs table.
-        """
-        self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
-        self.dbcon.commit()
-
-
-    def insertMulti(self, valuesList):
-        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
-        into the multi table.
-        """
-        self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
-        self.dbcon.commit()
-
-
-    def insertSplices(self, valuesList):
-        """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
-        into the splices table.
-        """
-        self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
-        self.dbcon.commit()
-
-
-    def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
-        """ update reads on file database in a list region of regions for a chromosome to have a new flag.
-            regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
-            sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
-        """
-        restrict = ""
-        if sense != "both":
-            restrict = " and sense = ? "
-
-        if uniqs:
-            self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
-
-        if multi:
-            self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
-
-        if self.dataType == "RNA" and splices:
-            self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
-            self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
-
-        self.dbcon.commit()
-
-
-    def setFlags(self, flag, uniqs=True, multi=True, splices=True):
-        """ set the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
-        """
-        if uniqs:
-            self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
-
-        if multi:
-            self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
-
-        if self.dataType == 'RNA' and splices:
-            self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
-
-        self.dbcon.commit()
-
-
-    def resetFlags(self, uniqs=True, multi=True, splices=True):
-        """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
-        """
-        if uniqs:
-            self.dbcon.execute("UPDATE uniqs SET flag = ''")
-
-        if multi:
-            self.dbcon.execute("UPDATE multi SET flag = ''")
-
-        if self.dataType == "RNA" and splices:
-            self.dbcon.execute("UPDATE splices SET flag = ''")
-
-        self.dbcon.commit()
-
-
-    def reweighMultireads(self, readList):
-        self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
-
-
-    def setSynchronousPragma(self, value="ON"):
-        try:
-            self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
-        except:
-            print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
-
-
-    def setDBcache(self, cache, default=False):
-        self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
-        if default:
-            self.dbcon.execute('PRAGMA DEFAULT_CACHE_SIZE = %d' % cache)
-
-
-    def execute(self, statement, returnResults=False, forceCommit=False):
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
-        sql.execute(statement)
-        if returnResults:
-            result = sql.fetchall()
-            return result
-
-        if forceCommit:
-            if self.memBacked:
-                self.memcon.commit()
-            else:
-                self.dbcon.commit()
-
-
-    def buildIndex(self, cache=100000):
-        """ Builds the file indeces for the main tables.
-            Cache is the number of 1.5 kb pages to keep in memory.
-            100000 pages translates into 150MB of RAM, which is our default.
-        """
-        if cache > self.getDefaultCacheSize():
-            self.setDBcache(cache)
-        self.setSynchronousPragma("OFF")
-        self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
-        print "built uPosIndex"
-        self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
-        print "built uChromIndex"
-        self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
-        print "built mPosIndex"
-        self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
-        print "built mChromIndex"
-
-        if self.dataType == "RNA":
-            self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
-            print "built sPosIndex"
-            self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
-            print "built sPosIndex2"
-            self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
-            print "built sChromIndex"
-
-        self.dbcon.commit()
-        self.setSynchronousPragma("ON")
-
-
-    def dropIndex(self):
-        """ drops the file indices for the main tables.
-        """
-        try:
-            self.setSynchronousPragma("OFF")
-            self.dbcon.execute("DROP INDEX uPosIndex")
-            self.dbcon.execute("DROP INDEX uChromIndex")
-            self.dbcon.execute("DROP INDEX mPosIndex")
-            self.dbcon.execute("DROP INDEX mChromIndex")
-
-            if self.dataType == "RNA":
-                self.dbcon.execute("DROP INDEX sPosIndex")
-                try:
-                    self.dbcon.execute("DROP INDEX sPosIndex2")
-                except:
-                    pass
-
-                self.dbcon.execute("DROP INDEX sChromIndex")
-
-            self.dbcon.commit()
-        except:
-            print "problem dropping index"
-
-        self.setSynchronousPragma("ON")
-
-
-    def memSync(self, chrom="", index=False):
-        """ makes a copy of the dataset into memory for faster access.
-        Can be restricted to a "full" chromosome. Can also build the
-        memory indices.
-        """
-        self.memcon = ""
-        self.memcon = sqlite.connect(":memory:")
-        self.initializeTables(self.memcon)
-        cursor = self.dbcon.cursor()
-        whereclause = ""
-        if chrom != "":
-            print "memSync %s" % chrom
-            whereclause = " where chrom = '%s' " % chrom
-            self.memChrom = chrom
-        else:
-            self.memChrom = ""
-
-        self.memcon.execute("PRAGMA temp_store = MEMORY")
-        self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
-        # copy metadata to memory
-        self.memcon.execute("delete from metadata")
-        results = cursor.execute("select name, value from metadata")
-        results2 = []
-        for row in results:
-            results2.append((row["name"], row["value"]))
-
-        self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
-        # copy uniqs to memory
-        results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from uniqs" + whereclause)
-        results2 = []
-        for row in results:
-            results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
-
-        self.memcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
-        # copy multi to memory
-        results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from multi" + whereclause)
-        results2 = []
-        for row in results:
-            results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
-
-        self.memcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2)
-        # copy splices to memory
-        if self.dataType == "RNA":
-            results = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices" + whereclause)
-            results2 = []
-            for row in results:
-                results2.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
-
-            self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, weight, sense, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", results2)
-        if index:
-            if chrom != "":
-                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
-                self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
-                if self.dataType == "RNA":
-                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
-                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
-            else:
-                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
-                self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
-                if self.dataType == "RNA":
-                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
-                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
-
-        self.memBacked = True
-        self.memcon.row_factory = sqlite.Row
-        self.memcon.commit()
diff --git a/crossmatch.py b/crossmatch.py

index 6a3675812e614ba7018dba4d8e6a37d375f0cb3a..ac4aa99b66b27220c21d05038191a584b236fda8 100755 (executable)
--- a/crossmatch.py
+++ b/crossmatch.py
@@ -12,7 +12,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "version 1.1"
+    print "crossmatch: version 1.2"
      if len(argv) < 7:
          print "usage: python %s prefix directory genome1 genefile1 genome2 genefile2 [genome3 genefile3 .....]" % argv[0]
          sys.exit(1)
diff --git a/distalPairs.py b/distalPairs.py

index d24781a0db7c0ee3702e71cf97f6c2d62ea21c41..5bc25321b25b8a30e9b09511cabbee4355010b71 100755 (executable)
--- a/distalPairs.py
+++ b/distalPairs.py
@@ -12,15 +12,18 @@ try:
  except:
      pass
  
-from commoncode import readDataset
-import sys, time, optparse
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "%prog: version 3.3"
+    print "distalPairs: version 3.4"
      print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
      usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
  
@@ -44,6 +47,27 @@ def main(argv=None):
      distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "distalPairs"
+    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    maxDist = getConfigIntOption(configParser, section, "maxDist", 1000000000)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+
+    parser.set_defaults(sameChromOnly=sameChromOnly, doSplices=doSplices, doVerbose=doVerbose, maxDist=maxDist, cachePages=cachePages)
+
+    return parser
+
+
  def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
      if cachePages is not None:
          doCache = True
@@ -51,7 +75,7 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
          doCache = False
          cachePages = -1
  
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
      if not RDS.hasIndex():
          print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
          sys.exit(1)
@@ -95,8 +119,12 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
          readList = uniqDict[readID]
          if len(readList) == 2:
              total += 1
-            (start1, sense1, chrom1, pair1) = readList[0]
-            (start2, sense2, chrom2, pair2) = readList[1]
+            start1 = readList[0]["start"]
+            sense1 = readList[0]["sense"]
+            chrom1 = readList[0]["chrom"]
+            start2 = readList[1]["start"]
+            sense2 = readList[1]["sense"]
+            chrom2 = readList[1]["chrom"]
  
              if chrom1 != chrom2:
                  diffChrom += 1
diff --git a/docs/README.build-rds b/docs/README.build-rds

index ef668d2bd184a73643dae51dea1bf7128bc799af..1c08a27a90f72a7c984660d4bae14c5c4b0f534d 100644 (file)
--- a/docs/README.build-rds
+++ b/docs/README.build-rds
@@ -137,9 +137,9 @@ single file e.g. test.comb.eland2
  The makerdsfromeland2.py script is used to import the reads 
  into RDS:
  
-python makerdsfromeland2.py label infilename outrdsfile [-append] [-RNA ucscGeneModels] 
-[propertyName::propertyValue] [-index] [-paired 1 or 2] [-extended] [-verbose] 
-[-olddelimiter] [-maxlines num] [-cache numPages]
+python makerdsfromeland2.py label infilename outrdsfile [--append] [--RNA ucscGeneModels] 
+[propertyName::propertyValue] [--index] [--paired 1 or 2] [--extended] [--verbose] 
+[--olddelimiter] [--maxlines num] [--cache numPages]
  
  The first 3 arguments are required:
  - label is any label that you wish (a combination flowcell+lane# 
@@ -149,18 +149,18 @@ is a good choice)
  - outdbname is the name of the rds file, e.g. test.rds
  
  If the reads are from paired-end runs, enter each eland_multi 
-(or extended) file separately with the "-paired 1" or "-paired 2" 
+(or extended) file separately with the "--paired 1" or "--paired 2" 
  flag, as appropriate.
  
-If entering more than one lane, use -append for all subsequent 
-lanes. Upon entering the last lane, use -index to build a read 
+If entering more than one lane, use --append for all subsequent 
+lanes. Upon entering the last lane, use --index to build a read 
  index. Refer to MANIPULATING RDS METADATA AND CACHING for 
  information on the optional property::value pairs and caching.
  
  For RNA-seq, you must in addition specify the path to knownGene.txt 
-using the -RNA flag, e.g.
+using the --RNA flag, e.g.
  
-python $ERANGEPATH/makerdsfromeland2.py myRNAlabel myRNA.eland_multi.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+python $ERANGEPATH/makerdsfromeland2.py myRNAlabel myRNA.eland_multi.txt rnatest.rds --RNA ../mm9/knownGene.txt [more options]
  
  
  6. MAPPING READS WITH BOWTIE
@@ -187,13 +187,13 @@ python $ERANGEPATH/makerdsfrombowtie.py testLabel s1.mm9.bowtie.txt bowtietest.r
  The options for the script are:
  
  python makerdsfrombowtie.py label infilename outrdsfile 
-[-RNA ucscGeneModels]  [-append]  [-index] [propertyName::propertyValue] 
-[-rawreadID] [-verbose] [-cache numPages]
+[--RNA ucscGeneModels]  [--append]  [--index] [propertyName::propertyValue] 
+[--rawreadID] [--verbose] [--cache numPages]
  
  Refer to "MAPPING READS WITH ELAND" for a description of label, 
-infilename, outdbname, '-append', '-index', and '-cache'.
+infilename, outdbname, '--append', '--index', and '--cache'.
  
-****REMEMBER TO USE -index WHEN LOADING THE LAST LANE OF YOUR 
+****REMEMBER TO USE --index WHEN LOADING THE LAST LANE OF YOUR 
  DATASET.****
  
  The script assumes that the read ID are from Illumina, i.e. that 
@@ -210,9 +210,9 @@ throw_away:uniqueid if unpaired
  throw_away:uniqueid/1 and throw_away:uniqueid/2 for paired-ends.
  
  For RNA-seq, you must in addition specify the path to knownGene.txt 
-using the -RNA flag, e.g.
+using the --RNA flag, e.g.
  
-python $ERANGEPATH/makerdsfrombowtie.py myRNAlabel myRNA.bowtie.txt rnatest.rds -RNA ../mm9/knownGene.txt [more options]
+python $ERANGEPATH/makerdsfrombowtie.py myRNAlabel myRNA.bowtie.txt rnatest.rds --RNA ../mm9/knownGene.txt [more options]
  
  
  7. MAPPING READS WITH BLAT
@@ -239,20 +239,20 @@ Once the reads have been filtered, the makerdsfromblat.py
  script is used to import the mapped reads (in the example 
  above s3_1.hg18.blatbetter) into RDS:
  
-python makerdsfromblat.py label infilename outrdsfile [-append] [-index] [propertyName::propertyValue] 
-[-rawreadID] [-forceRNA]  [-flag] [-strict minSpliceLen] [-spliceonly] [-verbose] [-cache numPages]
+python makerdsfromblat.py label infilename outrdsfile [--append] [--index] [propertyName::propertyValue] 
+[--rawreadID] [--forceRNA]  [--flag] [--strict minSpliceLen] [--spliceonly] [--verbose] [--cache numPages]
  
  If you are using BLAT for RNA-seq, please be sure to use
--forceRNA in order to import spliced reads and consider 
-using -strict to require a minimum length of bases on 
+--forceRNA in order to import spliced reads and consider 
+using --strict to require a minimum length of bases on 
  each side of the splice. 
  
  You can combine BOWTIE and BLAT by mapping reads with BOWTIE 
  first, and then using BLAT to map the unmapped reads. In 
  that case, you may want to only load the spliced reads 
-using the -spliceonly flag. To track those reads in the RDS 
-file, use -flag ; you can then retrieve those reads using 
-the options "-flag blat -flagLike" with the makebedfromrds.py 
+using the --spliceonly flag. To track those reads in the RDS 
+file, use --flag ; you can then retrieve those reads using 
+the options "--flag blat --flagLike" with the makebedfromrds.py 
  script.
  
  
@@ -266,7 +266,7 @@ have neither the multireads nor the spliced reads.
  The command line options are similar to those for other 
  scripts described in part 5-7:
  
-python makerdsfrombed.py label bedfile outrdsfile [-append] [-index] [propertyName::propertyValue] [-cache numPages]
+python makerdsfrombed.py label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]
  
  
  9.  COMBINING RDS FILES
@@ -277,7 +277,7 @@ of importing all tables or specific ones (e.g. uniqs, splices).
  
  The combinerds.py command options are:
  
-python combinerds.py destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [-init] [-initrna] [-index] [-cache pages]
+python combinerds.py destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]
  
  
  10. MANIPULATING RDS METADATA AND CACHING
@@ -320,6 +320,7 @@ basis for mammalian genomes.
  
  RELEASE HISTORY
  
+version 3.3    November 2010 - updated command line options
  version 3.2    October  2009 - added combinerds.py
  version 3.01   February 2009 - bug fixes
  version 3.0    January  2009 - added logging to buildrdsfrom*
diff --git a/docs/README.chip-seq b/docs/README.chip-seq

index 6529a6fa1b9404c8d1d0b42484f83bc48c0f048c..846a441a434f8e3cff4506108f27126855a2fb47 100644 (file)
--- a/docs/README.chip-seq
+++ b/docs/README.chip-seq
@@ -86,7 +86,7 @@ given radius
  
  (a) is the default in the current release of ERANGE. 
  Simply proceed to RUNNING THE PEAK FINDER for (a) and 
-(a). You can ignore multireads (b) by using the -nomulti 
+(b). You can ignore multireads (b) by using the --nomulti 
  flag with findall.py. For (c), use weighMultireads.py 
  to weigh multireads based on a unique reads in the 
  respective radius of each potential location. Once run, 
@@ -98,7 +98,7 @@ proceed to the section below.
  To run the peak finder without read shifting, use the 
  following command:
  
-python $ERANGEPATH/findall.py label chip.rds chip.regions.txt -control control.rds -listPeak -revbackground
+python $ERANGEPATH/findall.py label chip.rds chip.regions.txt --control control.rds --listPeak --revbackground
  
  which will run the peak finder on chip.rds / control.rds , 
  store the enriched region coordinates in chip.regions.txt, 
@@ -119,40 +119,40 @@ fragment sizes, on the order of 40-60 bp.
  You will *NEED* to change some of the default parameters 
  if working in smaller genomes (e.g. use smaller -spacing), 
  if working with certain types of IPs such as histones and 
-polymerases (test with and without -notrim and 
--nodirectionality), if working with rather weak IPs
-(e.g. -minimum and -ratio), or if working with larger 
+polymerases (test with and without --notrim and 
+--nodirectionality), if working with rather weak IPs
+(e.g. --minimum and --ratio), or if working with larger 
  fragment sizes (see the paragraph below discussing read 
  shifting). 
  
  findall.py returns a per-peak p-value. By default, this 
  is calculated using a Poisson distribution of peak RPMs 
-(or counts, if using -raw) for each chromosome in the IP. 
+(or counts, if using --raw) for each chromosome in the IP. 
  P-value calculations can be turned off using 
-'-pvalue none '. Alternatively, the p-value can be 
+'--pvalue none '. Alternatively, the p-value can be 
  calculated from the background using the option 
-'-pvalue back ', which must be combined with the option 
--revbackground.
+'--pvalue back ', which must be combined with the option 
+--revbackground.
  
  By default, findall.py does not try to adjust the location 
  of the reads based on half the size of the expected fragment 
  length (the "shift"). If you believe that you need to shift 
  your peaks, findall.py can try to pick the best shift based 
  on the best shift for strong sites using the parameter 
-'-shift learn '. You can also either manually specify a 
-shift value using '-shift #bp ' or ou can calculate a 
-"best shift" for each region using '-autoshift'. If you 
+'--shift learn '. You can also either manually specify a 
+shift value using '--shift #bp ' or you can calculate a 
+"best shift" for each region using '--autoshift'. If you 
  need to use the shift options, the recommended usage is:
-(i) first run findall.py with '-shift learn ', which will 
+(i) first run findall.py with '--shift learn ', which will 
  pick a shift if there are at least 30 regions that meet 
  its training criteria.
  (ii) if (i) couldn't pick a shift, run findall.py with 
--autoshift and -reportshift
+--autoshift and --reportshift
  (iii) look at the mode (most common #) for the shift
-(iv) rerun findall.py with -shift #bp where #bp is the mode
+(iv) rerun findall.py with --shift #bp where #bp is the mode
    
  If you are storing the RDS files on an network-mounted 
-directory, make sure to use '-cache XXXXX' to enable 
+directory, make sure to use '--cache XXXXX' to enable 
  local caching, where XXXXX is as large as appropriate, as 
  described in section 10 of README.build-rds . 
  
@@ -223,6 +223,7 @@ for example.
  
  RELEASE HISTORY
  
+version 3.2    November 2010 - updated command line options
  version 3.1    February 2009 - support for read shifting
  version 3.0    February 2009 - support for UCSC narrowPeak format in regiontobed.py
  version 3.0rc1 December 2008 - added parameter to control peak-trimming
diff --git a/docs/README.rna-seq b/docs/README.rna-seq

index 5a866f31a7e2a0ece21a116751263e60a771401c..605c48592359b6fefe129cd1e945fa624e7d6a3c 100644 (file)
--- a/docs/README.rna-seq
+++ b/docs/README.rna-seq
@@ -117,7 +117,7 @@ python $ERANGEPATH/combineRPKMs.py
  and get back a version number and all possible command line options:
  
  version 1.0
-usage: python $ERANGEPATH/combineRPKMs.py firstRPKM expandedRPKM finalRPKM combinedOutfile [-withmultifraction]
+usage: python $ERANGEPATH/combineRPKMs.py firstRPKM expandedRPKM finalRPKM combinedOutfile [--withmultifraction]
  
  where fields in brackets are optional.
  
@@ -141,11 +141,11 @@ In alternative 1, we use reads that did not match an existing gene
  model to identify candidate regions:
  
  # Alternative 1: find new regions outside of gene models with reads piled up 
-python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
  
  # Alternative 1: filter out new regions that overlap repeats more than a certain fraction 
  #                use "none" if you don't have a repeatmask database
-python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good --log rna.log --startField 1 --cache 1
  
  In alternative 2, we pool multiple RNA-seq datasets into a single 
  RDS database, run it through the two scripts of alternative 1 above, 
@@ -212,9 +212,9 @@ convert it into the format that cistematic expects using
  
  $ERANGEPATH/gfftocis.py infile.gff outfile.cis
  
-NOTE THAT YOU WILL MOST LIKELY HAVE TO EDIT THIS FILE TO 
-ACCOMODATE YOUR SPECIFIC GFF FORMAT TO THE CISTEMATIC 
-FORMAT, WHICH IS
+NOTE THAT THIS FILE IS PROVIDED AS AN EXAMPLE ONLY. YOU WILL MOST
+LIKELY HAVE TO EDIT THIS FILE TO ACCOMMODATE YOUR SPECIFIC GFF
+FORMAT TO THE CISTEMATIC FORMAT, WHICH IS
  
  geneID<tab>uniqRef<tab>chrom<tab>start<tab>stop<tab>sense<tab>type<return>
  
@@ -256,6 +256,7 @@ in README.rna-esnp .
  
  RELEASE HISTORY
  
+version 3.3    November 2010 - updated command line options
  version 3.2    December 2009 - support for custom genome annotations with Cistematic 3.0
  version 3.1    April    2009 - modified normalizeFinalExonic.py to remove genome
  version 3.0    January  2009 - added logging to shell pipelines
diff --git a/docs/README.rnapath b/docs/README.rnapath

index c64579b2f206c8ec8fe47c4027261ba5fe284072..8a35232a1f8d84ff2baafe2cfb98e8ff9320d6cc 100644 (file)
--- a/docs/README.rnapath
+++ b/docs/README.rnapath
@@ -37,13 +37,14 @@ the list of paired reads that do not map to the same contig. This involves
  specifying a distance to distalPairs.py that is greater than the length of the 
  largest existing genomic contig. For example:
  
-python ../commoncode/distalPairs.py 20000 rna_on_genomic.rds rna_on_genomic.crosspairs -splices -cache 20000000
+python ../commoncode/distalPairs.py 20000 rna_on_genomic.rds rna_on_genomic.crosspairs --splices --cache 20000000
  
  4. RUNNING RNAPATH.py
  
-You can now run RNAPATH.py. I suggest optionallly using the included script processvelvet.py to rename the contigs, before running blat and generating the crosspair data.
+You can now run RNAPATH.py. I suggest optionally using the included script processvelvet.py to rename the contigs, before running blat and generating the crosspair data.
  
  Example: $ERANGEPATH/rnapath/RNAPATH.py genomic_contigs.fa rna_on_genomic.crosspairs RNAPATH.log genome.RNAPATH.fa
  
-version 3.2    May  2010 - first release
+version 3.3    November 2010 - updated command line options
+version 3.2    May      2010 - first release
  
diff --git a/docs/RNA-seq.analysisSteps.txt b/docs/RNA-seq.analysisSteps.txt

index e9a5213ce8b9e4d139bd7c719b83b1e8bc487b70..b0a1c0f09e270ad2cc47c9af05c84564b5325aaf 100644 (file)
--- a/docs/RNA-seq.analysisSteps.txt
+++ b/docs/RNA-seq.analysisSteps.txt
@@ -8,7 +8,7 @@
  #              export CISTEMATIC_ROOT=/my/path/to/cistematic_genomes
  #
  # preliminary: set ERANGEPATH, e.g. 
-#              export ERANGEPATH=/proj/genome/experiments/commoncode
+#              export ERANGEPATH=/my/path/to/erange
  #
  # preliminary: set CISTEMATIC_TEMP to a local directory with ample space (default is /tmp), e.g. 
  #              export CISTEMATIC_TEMP=/any/local/dir
@@ -30,29 +30,29 @@
  # create rds file with one lane's worth of data (add --index if using only one lane)
  # The example below sets the default cache to 1000000 
  # The name::value pairs are optional documentary metadata, and can be set to any desired name or value
-python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX  
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs7.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 library::10213 cellLine::LHCN genome::hg18v2 cellState::confluent flowcell::200GFAAXX  
  
  # can change a database cache size using rdsmetadata.py to speed up indexing and index-based lookups
  # rule of thumb for RNA-seq: set the cache size to half of the RAM on the computer
-#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds -defaultcache 2000000 -nocount
+#python $ERANGEPATH/rdsmetadata.py LHCN10213.rds --defaultcache 2000000 --nocount
  
  # append more data (only add --index when adding last lane)
-python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds -strict 5 -cache 1000000 -append -index  
+python $ERANGEPATH/makerdsfromblat.py 200GFAAXX 200GFAAXXs6.hg19.psl LHCN10213.rds --strict 5 --cache 1000000 --append --index  
  
  # count the unique reads falling on the gene models ; the nomatch files are 
  # mappable reads that fell outside of the Cistematic gene models and not the 
  # unmappable of Eland (i.e, the "NM" reads)
-python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count -markGID -cache 1
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.uniqs.count --markGID --cache 1
  
  # count splice reads
-python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count -splices -noUniqs -cache 1
+python $ERANGEPATH/geneMrnaCounts.py hsapiens LHCN10213.rds LHCN10213.splices.count --splices --noUniqs --cache 1
  
  # calculate a first-pass RPKM to re-weigh the unique reads,
  # using 'none' for the splice count
-python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.count none LHCN10213.firstpass.rpkm --cache
  
  # recount the unique reads with weights calculated during the first pass
-python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount -uniq -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.firstpass.rpkm LHCN10213.uniqs.recount --uniq --cache 1
  
  # There is a choice of either identifying new regions from the data alone 
  # (Alternative 1), or using a pre-computed list of new regions (presumably 
@@ -60,28 +60,28 @@ python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.fi
  # file (Alternative 2)
  
  # Alternative 1: find new regions outside of gene models with reads piled up 
-python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFAR LHCN10213.rds LHCN10213.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
  
  # Alternative 1: filter out new regions that overlap repeats more than a certain fraction 
  #                use "none" if you don't have a repeatmask database
-python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good -log rna.log -startField 1 -cache 1
+python $ERANGEPATH/checkrmask.py ../hg19repeats/rmask.db LHCN10213.newregions.txt LHCN10213.newregions.repstatus LHCN10213.newregions.good --log rna.log --startField 1 --cache 1
  
  # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
  #python2.5 $ERANGEPATH/regionCounts.py ../RNAFAR/all.newregions.good LHCN10213.rds LHCN10213.newregions.good
  
  # map all candidate regions that are within a 20kb radius of a gene in bp
  # take out --cache if running locally
-python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 -radius 20001 -trackfar -cache
+python $ERANGEPATH/getallgenes.py hsapiens LHCN10213.newregions.good LHCN10213 --radius 20001 --trackfar --cache
  
  # calculate expanded exonic read density
-python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py hsapiens LHCN10213.rds LHCN10213.uniqs.recount LHCN10213.splices.count LHCN10213.expanded.rpkm LHCN10213.candidates.txt LHCN10213.accepted.rpkm --cache
  
  # create bed file of accepted candidate regions
  python2.5 $ERANGEPATH/regiontobed.py RNAFAR LHCN10213.accepted.rpkm RNAFAR.bed 255,0,0
  
  # weigh multi-reads
-python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count -accept LHCN10213.accepted.rpkm -multi -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py hsapiens LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count --accept LHCN10213.accepted.rpkm --multi --cache 1
  
  # calculate final exonic read density
-python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm -multifraction -withGID -cache
+python $ERANGEPATH/normalizeFinalExonic.py LHCN10213.rds LHCN10213.expanded.rpkm LHCN10213.multi.count LHCN10213.final.rpkm --multifraction --withGID --cache
  
diff --git a/docs/buildMatrix.sh b/docs/buildMatrix.sh

index 957507181b0e9e812ad61502d60b27df2385e363..7d67fcc150745b5ab7baede58024c1f33398b96b 100644 (file)
--- a/docs/buildMatrix.sh
+++ b/docs/buildMatrix.sh
@@ -1,20 +1,20 @@
  #!/bin/bash
-echo 'buildMatrix.sh: version 1.1'
+echo 'buildMatrix.sh: version 1.2'
  
  indexPrev=0
  indexCur=0
  
  truncateRPKM=""
  if [ $# -eq 3 ]; then
-    truncateRPKM="-truncate "$3
+    truncateRPKM="--truncate "$3
  fi
  
  if [ $# -eq 4 ]; then
-    truncateRPKM="-rescale -truncate "$3
+    truncateRPKM="--rescale --truncate "$3
  fi
  
  if [ $# -lt 2 ]; then
-       echo 'usage: buildMatrix.sh name datalist.file [truncateRPKM] [-rescale]'
+       echo 'usage: buildMatrix.sh name datalist.file [truncateRPKM] [--rescale]'
         echo
         echo 'where the datalist file is a comma-delimited list of prefix and rds-files'
         echo
diff --git a/docs/partition.sh b/docs/partition.sh

index 1955e992ce2f6c4f44dc5894be56ab08cdbae471..734fd19c1da1cc7a2d366ce3384522234ae8aa78 100644 (file)
--- a/docs/partition.sh
+++ b/docs/partition.sh
@@ -30,5 +30,5 @@ else
                 FILELIST=$FILELIST$line
                 let N=N+1
         done < $3
-       python $ERANGEPATH/partition.py $N.way $FILELIST $PARTNAME$N.part -minFeature $MINSIZE -nomerge -locid -norandom
+       python $ERANGEPATH/partition.py $N.way $FILELIST $PARTNAME$N.part --minFeature $MINSIZE --nomerge --locid --norandom
  fi
diff --git a/docs/regionCounts.sh b/docs/regionCounts.sh

index 13c60ad4bb6fd58fed267ff0ffb9772e728bb562..8875513266f785ee537137ec841c790d6d014e04 100644 (file)
--- a/docs/regionCounts.sh
+++ b/docs/regionCounts.sh
@@ -3,7 +3,7 @@ echo 'regionCounts.sh: version 1.0'
  
  cachepages=""
  if [ $# -eq 3 ]; then
-    cachepages="-cache "$3
+    cachepages="--cache "$3
  fi
  
  if [ $# -lt 2 ]; then
@@ -19,7 +19,7 @@ else
                 prefix=`echo $line | cut -f 1 -d ','`
                 rds=`echo $line | cut -f 2 -d ','`
                 if [ -e $rds ]; then
-                       python $ERANGEPATH/regionCounts.py $1 $rds $prefix.partcount -force -nomerge -rpkm $cachepages
+                       python $ERANGEPATH/regionCounts.py $1 $rds $prefix.partcount --force --nomerge --rpkm $cachepages
                 else
                         echo "could not find $rds - skipping"
                         python $ERANGEPATH/recordLog.py regionCounts.log regionCounts.sh "could not find $rds - skipping"
diff --git a/docs/runRNAPairedAnalysis.sh b/docs/runRNAPairedAnalysis.sh

index baf7f04664aa669a0c92211ad410f4bc79172dad..cf75b6a5c9047ebb94332f26e8af234b68f3e382 100755 (executable)
--- a/docs/runRNAPairedAnalysis.sh
+++ b/docs/runRNAPairedAnalysis.sh
@@ -11,25 +11,25 @@
  
  if [ -z "$ERANGEPATH" ]
  then
-    ERANGEPATH='../commoncode'
+    ERANGEPATH='../erange'
  fi
  
-echo 'runRNAPairedAnalysis.sh: version 3.7'
+echo 'runRNAPairedAnalysis.sh: version 3.8'
  
  models=""
  if [ $# -eq 5 ]; then
-    models=" -models "$5
+    models=" --models "$5
  fi
  
  replacemodels=""
  if [ $# -eq 6 ]; then
-    replacemodels=" -models $5 -replacemodels "
+    replacemodels=" --models $5 --replacemodels "
  fi
  
  if [ -z "$1" ]
  then
      echo
-    echo 'usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [-replacemodels]'
+    echo 'usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]'
      echo
      echo 'where rdsprefix is the name of the rds file without the .rds extension'
      echo 'use "none" for the repeatmaskdb if you do not have one'
@@ -45,37 +45,37 @@ python $ERANGEPATH/recordLog.py rna.log runRNAPairedAnalysis.sh "with parameters
  # mappable reads that fell outside of the Cistematic gene models and not the 
  # unmappable of Eland (i.e, the "NM" reads)
  echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --markGID --cache 1 $models $replacemodels
  
  # calculate a first-pass RPKM to re-weigh the unique reads,
  # using 'none' for the splice count
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache $models $replacemodels
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache $models $replacemodels
  
  # recount the unique reads with weights calculated during the first pass
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1 $models $replacemodels
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --uniq --cache 1 $models $replacemodels
  
  # count splice reads
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -markGID -cache 1 $models $replacemodels
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --splices --noUniqs --markGID --cache 1 $models $replacemodels
  
  # find new regions outside of gene models with reads piled up 
-python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
  
  # filter out new regions that overlap repeats more than a certain fraction
-python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.checked -startField 1 -log rna.log -cache 1
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.checked --startField 1 --log rna.log --cache 1
  
  # calculate the read densities
-python $ERANGEPATH/regionCounts.py $2.newregions.checked $2.rds $2.newregions.good -markRDS -cache -log rna.log
+python $ERANGEPATH/regionCounts.py $2.newregions.checked $2.rds $2.newregions.good --markRDS --cache --log rna.log
  
  # map all candidate regions that have paired ends overlapping with known genes
-python $ERANGEPATH/rnafarPairs.py $1 $2.newregions.good $2.rds $2.candidates.txt -cache $models $replacemodels
+python $ERANGEPATH/rnafarPairs.py $1 $2.newregions.good $2.rds $2.candidates.txt --cache $models $replacemodels
  
  # calculate expanded exonic read density
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache $models $replacemodels
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache $models $replacemodels
  
  # weigh multi-reads
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1 $models $replacemodels
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --multi --cache 1 $models $replacemodels
  
  # calculate final exonic read density
-python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache 
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache 
  
  fi
diff --git a/docs/runSNPAnalysis.sh b/docs/runSNPAnalysis.sh

index 0e4ff92ffe137759519bce2af3e49b4a33a3b1c2..103b576e73ac450c0af315645abfca41d7abbb40 100755 (executable)
--- a/docs/runSNPAnalysis.sh
+++ b/docs/runSNPAnalysis.sh
@@ -9,19 +9,19 @@
  
  if [ -z "$ERANGEPATH" ]
  then
-    ERANGEPATH='../commoncode'
+    ERANGEPATH='../erange'
  fi
  
-echo 'runSNPAnalysis.sh: version 3.1'
+echo 'runSNPAnalysis.sh: version 3.2'
  
  cachepages=""
  if [ $# -eq 9 ]; then
-    cachepages="-cache "$9
+    cachepages="--cache "$9
  fi
  
  nosplices=""
  if [ $# -eq 10 ]; then
-    nosplices=" -nosplices "
+    nosplices=" --nosplices "
  fi
  
  if [ $# -lt 8 ]; then
@@ -36,7 +36,7 @@ echo 'running with settings: ' $arguments
  python $ERANGEPATH/recordLog.py snp.log runSNPAnalysis.sh "with parameters: $arguments"
  
  # get all SNPs by extracting it from the RDS
-python $ERANGEPATH/getSNPs.py $2 $6 $7 $3.snps.txt -enforceChr $cachepages $nosplices
+python $ERANGEPATH/getSNPs.py $2 $6 $7 $3.snps.txt --enforceChr $cachepages $nosplices
  
  # get SNPs in non-repeat regions only
  python $ERANGEPATH/chkSNPrmask.py $4 $3.snps.txt $3.nr_snps.txt $cachepages
diff --git a/docs/runStandardAnalysis.sh b/docs/runStandardAnalysis.sh

index 6d83297bf4f39964793ced9800e46a3eb7bbcbc0..aa5fe60588a30ceba6a65301db81b13ac4f6051f 100755 (executable)
--- a/docs/runStandardAnalysis.sh
+++ b/docs/runStandardAnalysis.sh
@@ -11,25 +11,25 @@
  
  if [ -z "$ERANGEPATH" ]
  then
-    ERANGEPATH='../commoncode'
+    ERANGEPATH='../erange'
  fi
  
-echo 'runStandardAnalysis.sh: version 4.2'
+echo 'runStandardAnalysis.sh: version 4.3'
  
  models=""
  if [ $# -eq 5 ]; then
-    models=" -models "$5
+    models=" --models "$5
  fi
  
  replacemodels=""
  if [ $# -eq 6 ]; then
-    replacemodels=" -models $5 -replacemodels "
+    replacemodels=" --models $5 --replacemodels "
  fi
  
  if [ -z "$1" ]
  then
      echo
-    echo 'usage:runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [-replacemodels]'
+    echo 'usage:runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]'
      echo
      echo 'where rdsprefix is the name of the rds file without the .rds extension'
      echo 'use "none" for the repeatmaskdb if you do not have one'
@@ -44,48 +44,48 @@ python $ERANGEPATH/recordLog.py rna.log runStandardAnalysis.sh "with parameters:
  # count the unique reads falling on the gene models ; the nomatch files are 
  # mappable reads that fell outside of the Cistematic gene models and not the 
  # unmappable of Eland (i.e, the "NM" reads)
-echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels"
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -markGID -cache 1 $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --markGID --cache 1 $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --markGID --cache 1 $models $replacemodels
  
  # calculate a first-pass RPKM to re-weigh the unique reads,
  # using 'none' for the splice count
-echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache  $models $replacemodels"
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache  $models $replacemodels
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache  $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache  $models $replacemodels
  
  # recount the unique reads with weights calculated during the first pass
-echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1  $models $replacemodels"
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -uniq -cache 1  $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --uniq --cache 1  $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --uniq --cache 1  $models $replacemodels
  
  # count splice reads
-echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1  $models $replacemodels"
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -splices -noUniqs -cache 1  $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --splices --noUniqs --cache 1  $models $replacemodels"
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --splices --noUniqs --cache 1  $models $replacemodels
  
  # Alternative 1: find new regions outside of gene models with reads piled up 
-echo "python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1"
-python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -log rna.log -cache 1
+echo "python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1"
+python $ERANGEPATH/findall.py RNAFAR $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --log rna.log --cache 1
  
  # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
-echo "python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -cache 1"
-python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -log rna.log -startField 1 -cache 1
+echo "python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good --log rna.log --startField 1 --cache 1"
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good --log rna.log --startField 1 --cache 1
  
  # map all candidate regions that are within a given radius of a gene in bp
-echo "python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache  $models $replacemodels"
-python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -cache  $models $replacemodels
+echo "python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt --radius $4 --trackfar --cache  $models $replacemodels"
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt --radius $4 --trackfar --cache  $models $replacemodels
  
  # make sure candidates.txt file exists
  echo "touch $2.candidates.txt"
  touch $2.candidates.txt
  
  # calculate expanded exonic read density
-echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache  $models $replacemodels"
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache  $models $replacemodels
+echo "python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache  $models $replacemodels"
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache  $models $replacemodels
  
  # weigh multi-reads
-echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1  $models $replacemodels"
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -multi -cache 1  $models $replacemodels
+echo "python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --multi --cache 1  $models $replacemodels"
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --multi --cache 1  $models $replacemodels
  
  # calculate final exonic read density
-echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache"
-python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache"
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
  
  fi
 \ No newline at end of file
diff --git a/docs/runStrandedAnalysis.sh b/docs/runStrandedAnalysis.sh

index 2626ed05ff47dc879869fbd0006c09b761ec692a..44459019f601732e67f652eed96becc38d0d920d 100755 (executable)
--- a/docs/runStrandedAnalysis.sh
+++ b/docs/runStrandedAnalysis.sh
@@ -11,10 +11,10 @@
  
  if [ -z "$ERANGEPATH" ]
  then
-    ERANGEPATH='../commoncode'
+    ERANGEPATH='../erange'
  fi
  
-echo 'runStrandedAnalysis.sh: version 4.1'
+echo 'runStrandedAnalysis.sh: version 4.2'
  
  if [ -z "$1" ]
  then
@@ -34,39 +34,39 @@ python $ERANGEPATH/recordLog.py rna.log runStrandedAnalysis.sh "with parameters:
  # count the unique reads falling on the gene models ; the nomatch files are 
  # mappable reads that fell outside of the Cistematic gene models and not the 
  # unmappable of Eland (i.e, the "NM" reads)
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count -stranded -markGID -cache 1
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.uniqs.count --stranded --markGID --cache 1
  
  # calculate a first-pass RPKM to re-weigh the unique reads,
  # using 'none' for the splice count
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.count none $2.firstpass.rpkm --cache
  
  # recount the unique reads with weights calculated during the first pass
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount -stranded -uniq -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.firstpass.rpkm $2.uniqs.recount --stranded --uniq --cache 1
  
  # count splice reads
-python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count -stranded -splices -noUniqs -cache 1
+python $ERANGEPATH/geneMrnaCounts.py $1 $2.rds $2.splices.count --stranded --splices --noUniqs --cache 1
  
  # find new regions outside of gene models with reads piled up 
-python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter plus -log rna.log -cache 1
-python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt -RNA -minimum 1 -nomulti -flag NM -strandfilter minus -log rna.log -cache 1 -append
+python $ERANGEPATH/findall.py RNAFARP $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --strandfilter plus --log rna.log --cache 1
+python $ERANGEPATH/findall.py RNAFARM $2.rds $2.newregions.txt --RNA --minimum 1 --nomulti --flag NM --strandfilter minus --log rna.log --cache 1 --append
  
  # filter out new regions that overlap repeats more than a certain fraction
-python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good -startField 1 -log rna.log -cache 1
+python $ERANGEPATH/checkrmask.py $3 $2.newregions.txt $2.newregions.repstatus $2.newregions.good --startField 1 --log rna.log --cache 1
  
  # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
  #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
  #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good 
  
  # map all candidate regions that are within a given radius of a gene in bp
-python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt -radius $4 -trackfar -stranded -cache
+python $ERANGEPATH/getallgenes.py $1 $2.newregions.good $2.candidates.txt --radius $4 --trackfar --stranded --cache
  
  # calculate expanded exonic read density
-python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm -cache
+python $ERANGEPATH/normalizeExpandedExonic.py $1 $2.rds $2.uniqs.recount $2.splices.count $2.expanded.rpkm $2.candidates.txt $2.accepted.rpkm --cache
  
  # weigh multi-reads
-python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count -accept $2.accepted.rpkm -stranded -multi -cache 1
+python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi.count --accept $2.accepted.rpkm --stranded --multi --cache 1
  
  # calculate final exonic read density
-python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm -multifraction -withGID -cache
+python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
  
  fi
diff --git a/farPairs.py b/farPairs.py

index 73dd3ca851e2ca64cc71e3148a8eced359e44def..00cc91844b23084d712573261e996524739c9870 100644 (file)
--- a/farPairs.py
+++ b/farPairs.py
@@ -11,11 +11,13 @@ try:
  except:
      pass
  
-import sys, time
+import sys
+import time
  import optparse
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  
-print "%prog: version 1.3"
+print "farPairs: version 1.4"
  
  
  def main(argv=None):
@@ -24,16 +26,7 @@ def main(argv=None):
  
      usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--verbose", action="store_true", dest="doVerbose")
-    parser.add_option("--minDist", type="int", dest="minDist")
-    parser.add_option("--maxDist", type="int", dest="maxDist")
-    parser.add_option("--minCount", type="int", dest="minCount")
-    parser.add_option("--label", dest="label")
-    parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None,
-                        minDist=1000, maxDist=500000, minCount=2, label=None)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -50,6 +43,32 @@ def main(argv=None):
               options.label)
  
  
+def getParser(usage):  # build the farPairs option parser; defaults are seeded through the commoncode config helpers
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--minDist", type="int", dest="minDist")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--minCount", type="int", dest="minCount")
+    parser.add_option("--label", dest="label")
+
+    configParser = getConfigParser()  # must be called: the getConfig*Option helpers below are handed its result
+    section = "farPairs"  # fallback literals below mirror the previously hard-coded set_defaults values
+    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    minDist = getConfigIntOption(configParser, section, "minDist", 1000)
+    maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
+    minCount = getConfigIntOption(configParser, section, "minCount", 2)
+    label = getConfigOption(configParser, section, "label", None)
+
+    parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+                        minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
+
+    return parser
+
+
  def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
               cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
  
@@ -62,7 +81,7 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
      if label is None:
          label = rdsfile
  
-    RDS = readDataset(rdsfile, verbose=True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache)
      rdsChromList = RDS.getChromosomes()
  
      if doVerbose:
@@ -80,7 +99,7 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
              continue
  
          print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
          if doVerbose:
              print len(uniqDict), time.ctime()
  
@@ -88,8 +107,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
              readList = uniqDict[readID]
              if len(readList) == 2:
                  total += 1
-                (start1, flag1, pair1) = readList[0]
-                (start2, flag2, pair2) = readList[1]
+                start1 = readList[0]["start"]
+                flag1 = readList[0]["flag"]
+                start2 = readList[1]["start"]
+                flag2 = readList[1]["flag"]
  
                  if flag1 != flag2:
                      dist = abs(start1 - start2)
diff --git a/featureIntersects.py b/featureIntersects.py

index e0b77267a32b6cd12751f96617cb017c30d9b671..cd357cc072c5cc7aa4460371439ef59c428873f5 100755 (executable)
--- a/featureIntersects.py
+++ b/featureIntersects.py
@@ -9,10 +9,13 @@ try:
  except:
      pass
  
-import sys, optparse
+import sys
+import optparse
  from cistematic.core import featuresIntersecting
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption
  
-print "%prog: version 1.0"
+
+print "featureIntersects: version 1.1"
  
  
  def main(argv=None):
@@ -21,10 +24,7 @@ def main(argv=None):
  
      usage = "usage: python %s tabfile [--cistype type] [--radius radius]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cistype", action="store_false", dest="cistype")
-    parser.add_option("--radius", type="int", dest="radius")
-    parser.set_defaults(cistype="TFBSCONSSITES", radius=100)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 1:
@@ -36,6 +36,21 @@ def main(argv=None):
      featureIntersects(tabfile, options.cistype, options.radius)
  
  
+def makeParser(usage=""):  # build the featureIntersects option parser; defaults are seeded through the commoncode config helpers
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--cistype", dest="cistype")  # takes a value ("--cistype type"); store_false would overwrite the type name with False
+    parser.add_option("--radius", type="int", dest="radius")
+
+    configParser = getConfigParser()
+    section = "featureIntersects"  # fallback literals below mirror the previously hard-coded set_defaults values
+    cistype = getConfigOption(configParser, section, "cistype", "TFBSCONSSITES")
+    radius = getConfigIntOption(configParser, section, "radius", 100)
+
+    parser.set_defaults(cistype=cistype, radius=radius)
+
+    return parser
+
+
  def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
      tabfile = open(tabFileName)
      previous = ""
diff --git a/findMotifs.py b/findMotifs.py

index e79401bf3544d8a6bb33b1c83aebb6a6fec1f227..a072d43432efe3d53a9455f8e92119401049f1bd 100755 (executable)
--- a/findMotifs.py
+++ b/findMotifs.py
@@ -8,14 +8,17 @@ try:
  except:
      pass
  
-import sys, os, optparse
+import sys
+import os
+import optparse
  from cistematic.experiments.fasta import Fasta
  from cistematic.programs.meme import Meme
  from cistematic.programs.cisGreedy import CisGreedy
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
  #TODO: cisSampler is not supported yet!
  #from cistematic.programs.cisSampler import CisSampler
  
-print "%prog: version 3.4"
+print "findMotifs: version 3.5"
  
  def main(argv=None):
      if not argv:
@@ -23,18 +26,7 @@ def main(argv=None):
  
      usage = "usage: python %prog explabel regions.fsa [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--meme", action="store_true", dest="doMeme")
-    parser.add_option("--cisGreedy", action="store_true", dest="doCisGreedy")
-    parser.add_option("--logo", action="store_true", dest="saveLogo")
-    parser.add_option("--threshold", type="float", dest="threshold")
-    parser.add_option("--prefix", dest="motifPrefix")
-    parser.add_option("--numMotifs", dest="numMotifs")
-    parser.add_option("--maxWidth", type="int", dest="maxWidth")
-    parser.add_option("--maskLower", action="store_true", dest="maskLower")
-    parser.set_defaults(doMeme=False, doCisGreedy=False, saveLogo=False,
-                        threshold=75., numMotifs="10", maxWidth=28, maskLower=False)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -55,6 +47,34 @@ def main(argv=None):
                 doCisSampler)
  
  
+def getParser(usage):  # build the findMotifs option parser; defaults are seeded through the commoncode config helpers
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--meme", action="store_true", dest="doMeme")
+    parser.add_option("--cisGreedy", action="store_true", dest="doCisGreedy")
+    parser.add_option("--logo", action="store_true", dest="saveLogo")
+    parser.add_option("--threshold", type="float", dest="threshold")
+    parser.add_option("--prefix", dest="motifPrefix")
+    parser.add_option("--numMotifs", dest="numMotifs")  # no type given, so the value stays a string (fallback "10")
+    parser.add_option("--maxWidth", type="int", dest="maxWidth")
+    parser.add_option("--maskLower", action="store_true", dest="maskLower")
+
+    configParser = getConfigParser()
+    section = "findMotifs"  # fallback literals below match the set_defaults values this function replaces
+    doMeme = getConfigBoolOption(configParser, section, "doMeme", False)
+    doCisGreedy = getConfigBoolOption(configParser, section, "doCisGreedy", False)
+    saveLogo = getConfigBoolOption(configParser, section, "saveLogo", False)
+    threshold = getConfigFloatOption(configParser, section, "threshold", 75.)
+    numMotifs = getConfigOption(configParser, section, "numMotifs", "10")
+    maxWidth = getConfigIntOption(configParser, section, "maxWidth", 28)
+    maskLower = getConfigBoolOption(configParser, section, "maskLower", False)
+
+
+    parser.set_defaults(doMeme=doMeme, doCisGreedy=doCisGreedy, saveLogo=saveLogo,
+                        threshold=threshold, numMotifs=numMotifs, maxWidth=maxWidth, maskLower=maskLower)
+
+    return parser
+
+
  def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75.,
                 numMotifs="10", maxWidth=28, maskLower=False, doCisSampler=False):
  
diff --git a/findall.py b/findall.py

index 10f007bd7d340fcf3b30606046c84ef42ee6556d..d20608f1997e2e6a0fd84d9169c78025485e4b57 100755 (executable)
--- a/findall.py
+++ b/findall.py
@@ -49,10 +49,12 @@ import sys
  import math
  import string
  import optparse
-from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
+from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
+import ReadDataset
+import Region
  
  
-versionString = "%s: version 3.2" % sys.argv[0]
+versionString = "findall: version 3.2"
  print versionString
  
  def usage():
@@ -63,6 +65,27 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
+    parser = makeParser()
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        usage()
+        sys.exit(2)
+
+    factor = args[0]
+    hitfile = args[1]
+    outfilename = args[2]
+
+    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
+            options.stringency, options.noshift, options.autoshift, options.reportshift,
+            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
+            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
+            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
+            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+            options.strandfilter, options.combine5p)
+
+
+def makeParser():
      usage = __doc__
  
      parser = optparse.OptionParser(usage=usage)
@@ -94,31 +117,47 @@ def main(argv=None):
      parser.add_option("--append", action="store_true", dest="doAppend")
      parser.add_option("--RNA", action="store_true", dest="rnaSettings")
      parser.add_option("--combine5p", action="store_true", dest="combine5p")
-    parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
-                        stringency=4.0, noshift=False, autoshift=False, reportshift=False,
-                        minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
-                        normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
-                        trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
-                        cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
-                        strandfilter=None, combine5p=False)
  
-    (options, args) = parser.parse_args(argv[1:])
-
-    if len(args) < 3:
-        usage()
-        sys.exit(2)
-
-    factor = args[0]
-    hitfile = args[1]
-    outfilename = args[2]
-
-    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
-            options.stringency, options.noshift, options.autoshift, options.reportshift,
-            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
-            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
-            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
-            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
-            options.strandfilter, options.combine5p)
+    configParser = getConfigParser()
+    section = "findall"
+    minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
+    minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
+    maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
+    listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
+    shift = getConfigOption(configParser, section, "shift", None)
+    stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
+    noshift = getConfigBoolOption(configParser, section, "noshift", False)
+    autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
+    reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
+    minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
+    maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
+    leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
+    minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
+    trimValue = getConfigOption(configParser, section, "trimValue", None)
+    doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
+    doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
+    rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    ptype = getConfigOption(configParser, section, "ptype", None)
+    mockfile = getConfigOption(configParser, section, "mockfile", None)
+    doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
+    noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
+    strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+    combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
+
+    parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
+                        stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
+                        minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
+                        normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
+                        trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
+                        cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+                        strandfilter=strandfilter, combine5p=combine5p)
+
+    return parser
  
  
  def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
@@ -129,20 +168,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
              cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
              strandfilter=None, combine5p=False):
  
-    shiftValue = 0
-    if autoshift:
-        shiftValue = "auto"
-
-    if shift is not None:
-        try:
-            shiftValue = int(shift)
-        except ValueError:
-            if shift == "learn":
-                shiftValue = "learn"
-                print "Will try to learn shift"
-
-    if noshift:
-        shiftValue = 0
+    shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
  
      if trimValue is not None:
          trimValue = float(trimValue) / 100.
@@ -198,7 +224,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
  
      if rnaSettings:
          print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-        shiftValue = 0
          doTrim = False
          doDirectionality = False
  
@@ -219,48 +244,44 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
      writeLog(logfilename, versionString, string.join(sys.argv[1:]))
      if doControl:
          print "\ncontrol:" 
-        mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
+        mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
  
          if cachePages > mockRDS.getDefaultCacheSize():
              mockRDS.setDBcache(cachePages)
  
      print "\nsample:" 
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
      readlen = hitRDS.getReadSize()
      if rnaSettings:
          maxSpacing = readlen
  
-    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
-    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
-    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
-
      if cachePages > hitRDS.getDefaultCacheSize():
          hitRDS.setDBcache(cachePages)
  
-    hitRDSsize = len(hitRDS) / 1000000.
-    if doControl:
-        mockRDSsize = len(mockRDS) / 1000000.
-
-    if normalize:
-        if doControl:
-            mockSampleSize = mockRDSsize
-
-        hitSampleSize = hitRDSsize
-
      if doAppend:
-        outfile = open(outfilename, "a")
+        fileMode = "a"
      else:
-        outfile = open(outfilename, "w")
+        fileMode = "w"
+
+    outfile = open(outfilename, fileMode)
  
      outfile.write("#ERANGE %s\n" % versionString)
      if doControl:
-        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
+        mockRDSsize = len(mockRDS) / 1000000.
+        controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
      else:
-        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
+        controlSampleString = " none"
+
+    hitRDSsize = len(hitRDS) / 1000000.
+    outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
  
      if withFlag != "":
          outfile.write("#restrict to Flag = %s\n" % withFlag)
  
+    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
+    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
+    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+
      outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
      outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
      outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
@@ -300,6 +321,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
      if doControl:
          mockChromList = mockRDS.getChromosomes()
  
+    if normalize:
+        if doControl:
+            mockSampleSize = mockRDSsize
+
+        hitSampleSize = hitRDSsize
+
      hitChromList.sort()
  
      for chromosome in hitChromList:
@@ -307,16 +334,21 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
              continue
  
          print "chromosome %s" % (chromosome)
-        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
+        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
+                                      doMulti=useMulti, findallOptimize=True, strand=stranded,
+                                      combine5p=combine5p)
          maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
          if shiftValue == "learn":
-            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-                                    stringency, readlen, minHits, logfilename, outfile, outfilename)
+            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
+                                    mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
+                                    logfilename, outfile, outfilename)
  
-        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
-                                                              normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                                                              shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                                                              noMulti, doControl, factor, trimValue, outputRegionList=True)
+        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
+                                                                  chromosome, useMulti, normalize, maxSpacing,
+                                                                  doDirectionality, doTrim, minHits, minRatio,
+                                                                  readlen, shiftValue, minPeak, minPlusRatio,
+                                                                  maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                  doControl, factor, trimValue, outputRegionList=True)
  
          statistics["index"] += regionStats["index"]
          statistics["total"] += regionStats["total"]
@@ -332,10 +364,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
          #now do background swapping the two samples around
          print "calculating background..."
          backgroundTrimValue = 1/20.
-        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
-                                                              normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                                                              shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                                                              noMulti, doControl, factor, backgroundTrimValue)
+        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
+                                                                       chromosome, useMulti, normalize, maxSpacing,
+                                                                       doDirectionality, doTrim, minHits, minRatio,
+                                                                       readlen, shiftValue, minPeak, minPlusRatio,
+                                                                       maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                       doControl, factor, backgroundTrimValue)
  
          statistics["mIndex"] += backgroundRegionStats["index"]
          statistics["mTotal"] += backgroundRegionStats["total"]
@@ -358,6 +392,25 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
      writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
  
  
+def determineShiftValue(autoshift, shift, noshift, rnaSettings):  # resolve the shift options into 0, an int, "auto" or "learn"
+    shiftValue = 0
+    if autoshift:
+        shiftValue = "auto"
+
+    if shift is not None:  # an integer or "learn" shift overrides the autoshift choice made above
+        try:
+            shiftValue = int(shift)
+        except ValueError:
+            if shift == "learn":  # any other non-integer string leaves shiftValue as set above
+                shiftValue = "learn"
+                print "Will try to learn shift"
+
+    if noshift or rnaSettings:  # either flag forces a zero shift regardless of the options above
+        shiftValue = 0
+
+    return shiftValue
+
+
  def doNotProcessChromosome(chromosome, doControl, mockChromList):
      skipChromosome = False
      if chromosome == "chrM":
@@ -384,19 +437,22 @@ def calculatePValue(dataList):
  
  
  def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-               stringency, readlen, minHits, logfilename, outfile, outfilename):
+               stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
  
-    print "learning shift.... will need at least 30 training sites"
+    print "learning shift.... will need at least %d training sites" % minSites
      previousHit = -1 * maxSpacing
      hitList = [-1]
-    weightList = [0]
+    totalWeight = 0
      readList = []
      shiftDict = {}
      count = 0
      numStarts = 0
-    for (pos, sense, weight) in hitDict[chrom]:
+    for read in hitDict[chrom]:
+        pos = read["start"]
+        sense = read["sense"]
+        weight = read["weight"]
          if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = sum(weightList)
+            sumAll = totalWeight
              if normalize:
                  sumAll /= hitSampleSize
  
@@ -417,7 +473,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                      count += 1
  
              hitList = []
-            weightList = []
+            totalWeight = 0
              readList = []
              numStarts = 0
  
@@ -425,18 +481,21 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
              numStarts += 1
  
          hitList.append(pos)
-        weightList.append(weight)
-        readList.append((pos, sense, weight))
+        totalWeight += weight
+        readList.append({"start": pos, "sense": sense, "weight": weight})
          previousHit = pos
  
      bestShift = 0
      bestCount = 0
-    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
+    learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
+                                                                                                       stringency * minRatio, stringency * readlen),
+                        "#number of training examples: %d" % count]
+    outline = string.join(learningSettings, "\n")
      print outline
      writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-    if count < 30:
+    if count < minSites:
          outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
-        print outline
+        print >> outfile, outline
          writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
          shiftValue = 0
      else:
@@ -481,20 +540,24 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                  noMulti, doControl, factor, trimValue, outputRegionList=False):
  
      index = 0
-    total = 0
+    totalRegionWeight = 0
      failedCounter = 0
      previousHit = - 1 * maxSpacing
      currentHitList = [-1]
-    currentWeightList = [0]
+    currentTotalWeight = 0
+    currentUniqReadCount = 0
      currentReadList = []
      regionWeights = []
      outregions = []
      numStarts = 0
      hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
      maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
-    for (pos, sense, weight) in hitDict[chrom]:
+    for read in hitDict[chrom]:
+        pos = read["start"]
+        sense = read["sense"]
+        weight = read["weight"]
          if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = sum(currentWeightList)
+            sumAll = currentTotalWeight
              if normalize:
                  sumAll /= rdsSampleSize
  
@@ -507,13 +570,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                  foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
                  if foldRatio >= minRatio:
                      # first pass, with absolute numbers
-                    if doDirectionality:
-                        (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
-                    else:
-                        (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
-
-                    bestPos = topPos[0]
-                    peakScore = smoothArray[bestPos]
+                    peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
+
+                    bestPos = peak.topPos[0]
+                    numHits = peak.numHits
+                    peakScore = peak.smoothArray[bestPos]
+                    numPlus = peak.numPlus
+                    shift = peak.shift
+                    numLeft = peak.numLeft
                      if normalize:
                          peakScore /= rdsSampleSize
  
@@ -523,28 +587,25 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                          stop = regionStop - regionStart - 1
                          startFound = False
                          while not startFound:
-                            if smoothArray[start] >= minSignalThresh or start == bestPos:
+                            if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
                                  startFound = True
                              else:
                                  start += 1
  
                          stopFound = False
                          while not stopFound:
-                            if smoothArray[stop] >= minSignalThresh or stop == bestPos:
+                            if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
                                  stopFound = True
                              else:
                                  stop -= 1
  
                          regionStop = regionStart + stop
                          regionStart += start
-                        try:
-                            if doDirectionality:
-                                (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-                            else:
-                                (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
-                        except:
-                            continue
+                        trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
  
+                        sumAll = trimPeak.numHits
+                        numPlus = trimPeak.numPlus
+                        numLeft = trimPeak.numLeft
                          if normalize:
                              sumAll /= rdsSampleSize
  
@@ -553,8 +614,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                              sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                          # just in case it changed, use latest data
                          try:
-                            bestPos = topPos[0]
-                            peakScore = smoothArray[bestPos]
+                            bestPos = trimPeak.topPos[0]
+                            peakScore = trimPeak.smoothArray[bestPos]
                          except:
                              continue
  
@@ -563,7 +624,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                              peakScore /= rdsSampleSize
  
                      elif outputRegionList:
-                        sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
+                        sumMulti = currentTotalWeight - currentUniqReadCount
  
                      if outputRegionList:
                          # normalize to RPM
@@ -583,9 +644,9 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                          plusRatio = float(numPlus)/numHits
                          if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
                              if outputRegionList:
-                                peak = ""
+                                peakDescription = ""
                                  if listPeak:
-                                    peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
+                                    peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
  
                              if doDirectionality:
                                  if leftPlusRatio < numLeft / numPlus:
@@ -594,20 +655,27 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                                          plusP = plusRatio * 100.
                                          leftP = 100. * numLeft / numPlus
                                          # we have a region that passes all criteria
-                                        outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
+                                        region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
+                                                                          factor, index, chrom, sumAll,
+                                                                          foldRatio, multiP, plusP, leftP,
+                                                                          peakDescription, shift)
+                                        outregions.append(region)
  
-                                    total += sumAll
+                                    totalRegionWeight += sumAll
                                  else:
                                      failedCounter += 1
                              else:
                                  # we have a region, but didn't check for directionality
                                  index += 1
-                                total += sumAll
+                                totalRegionWeight += sumAll
                                  if outputRegionList:
-                                    outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
+                                    region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
+                                                           sumAll, foldRatio, multiP, peakDescription, shift)
+                                    outregions.append(region)
  
              currentHitList = []
-            currentWeightList = []
+            currentTotalWeight = 0
+            currentUniqReadCount = 0
              currentReadList = []
              numStarts = 0
  
@@ -615,12 +683,15 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
              numStarts += 1
  
          currentHitList.append(pos)
-        currentWeightList.append(weight)
-        currentReadList.append((pos, sense, weight))
+        currentTotalWeight += weight
+        if weight == 1.0:
+            currentUniqReadCount += 1
+
+        currentReadList.append({"start": pos, "sense": sense, "weight": weight})
          previousHit = pos
  
      statistics = {"index": index,
-                  "total": total,
+                  "total": totalRegionWeight,
                    "failed": failedCounter
      }
  
@@ -634,43 +705,35 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor
      bestShift = 0
      shiftDict = {}
      for region in outregions:
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+        if reportshift:
+            outputList = [region.printRegionWithShift()]
+            if shiftValue == "auto":
+                try:
+                    shiftDict[region.shift] += 1
+                except KeyError:
+                    shiftDict[region.shift] = 1
+        else:
+            outputList = [region.printRegion()]
+
          # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
          if doPvalue:
-            sumAll = int(region[5])
+            sumAll = int(region.numReads)
              for i in xrange(sumAll):
                  pValue *= poissonmean
                  pValue /= i+1
  
-        if shiftValue == "auto" and reportshift:
-            try:
-                shiftDict[region[-1]] += 1
-            except KeyError:
-                shiftDict[region[-1]] = 1
-
-        try:
-            if reportshift:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
-            else:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
-        except:
-            if reportshift:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
-            else:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
-
-        if doPvalue:
-            outputList.append("%1.2g" % pValue)
+            outputList.append("%1.2f" % pValue)
  
          outline = string.join(outputList, "\t")
          print outline
          print >> outfile, outline
  
-    if shiftValue == "auto" and reportshift:
-        bestCount = 0
-        for shift in sorted(shiftDict):
-            if shiftDict[shift] > bestCount:
-                bestShift = shift
-                bestCount = shiftDict[shift]
+    bestCount = 0
+    for shift in sorted(shiftDict):
+        if shiftDict[shift] > bestCount:
+            bestShift = shift
+            bestCount = shiftDict[shift]
  
      return bestShift
  
diff --git a/fraction.py b/fraction.py

index f955fce582c98279b1e4db358f62511faa298f25..ba7edbda7c04c39d97df90d67e3b6f6c3aad77bf 100755 (executable)
--- a/fraction.py
+++ b/fraction.py
@@ -9,10 +9,11 @@ try:
  except:
      pass
  
-from random import random
  import sys
+from random import random
+
  
-print "%s: version 1.0" % sys.argv[0]
+print "fraction: version 1.1"
  
  def main(argv=None):
      if not argv:
diff --git a/geneDownstreamBins.py b/geneDownstreamBins.py

index 058ad8236a4f7df6d08a6ca293d0a27ab090c9bd..4ce97e4fc1380517447153e59fba80e5531e3029 100755 (executable)
--- a/geneDownstreamBins.py
+++ b/geneDownstreamBins.py
@@ -10,12 +10,13 @@ except:
      pass
  
  # originally from version 1.3 of geneDnaDownstreamCounts.py
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
  
-print "%prog: version 2.0"
+print "geneDownstreamBins: version 2.1"
  
  def main(argv=None):
      if not argv:
@@ -23,10 +24,7 @@ def main(argv=None):
  
      usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--max", type="int", dest="standardMinDist",
-                      help="maximum region in bp")
-    parser.set_defaults(standardMinDist=3000)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -40,22 +38,33 @@ def main(argv=None):
      geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--max", type="int", dest="standardMinDist",
+                      help="maximum region in bp")
+
+    configParser = getConfigParser()
+    section = "geneDownstreamBins"
+    standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
+
+    parser.set_defaults(standardMinDist=standardMinDist)
+
+    return parser
+
+
  def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
      bins = 10
      standardMinThresh = standardMinDist / bins
  
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
      normalizationFactor = 1.0
      if normalize:
          hitDictSize = len(hitRDS)
          normalizationFactor = hitDictSize / 1000000.
  
      hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
+    geneinfoDict = getGeneInfoDict(genome, cache=True)
      hg = Genome(genome)
-    idb = geneinfoDB(cache=True)
-
-    geneinfoDict = idb.getallGeneInfo(genome)
      featuresDict = hg.getallGeneFeatures()
  
      outfile = open(outfilename, "w")
@@ -115,7 +124,9 @@ def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCac
              continue
  
          binList = [0.] * bins
-        for (tagStart, sense, weight) in hitDict[chrom]:
+        for read in hitDict[chrom]:
+            tagStart = read["start"]
+            weight = read["weight"]
              tagStart -= gstart
              if tagStart >= glen:
                  break
diff --git a/geneLocusBins.py b/geneLocusBins.py

index e6b403f2b33ef9beb6b0a8b2d1d8acb778cc1547..6a1efc48e270d92a8453e8fcb6907db00185ed92 100755 (executable)
--- a/geneLocusBins.py
+++ b/geneLocusBins.py
@@ -10,12 +10,14 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
  
-print '%s: version 2.1' % sys.argv[0]
+print "geneLocusBins: version 2.2"
  
  def main(argv=None):
      if not argv:
@@ -23,25 +25,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--bins", type="int", dest="bins",
-                      help="number of bins to use [default: 10]")
-    parser.add_option("--flank", type="int", dest="flankBP",
-                      help="number of flanking BP on both upstream and downstream [default: 0]")
-    parser.add_option("--upstream", type="int", dest="upstreamBP",
-                      help="number of upstream flanking BP [default: 0]")
-    parser.add_option("--downstream", type="int", dest="downstreamBP",
-                      help="number of downstream flanking BP [default: 0]")
-    parser.add_option("--nocds", action="store_false", dest="doCDS",
-                      help="do not CDS")
-    parser.add_option("--raw", action="store_false", dest="normalizeBins",
-                      help="do not normalize results")
-    parser.add_option("--force", action="store_false", dest="limitNeighbor",
-                      help="limit neighbor region")
-    parser.add_option("--regions", dest="acceptfile")
-    parser.add_option("--cache", action="store_true", dest="doCache",
-                      help="use cache")
-    parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -71,12 +55,53 @@ def main(argv=None):
      geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
  
  
-def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None):
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--bins", type="int", dest="bins",
+                      help="number of bins to use [default: 10]")
+    parser.add_option("--flank", type="int", dest="flankBP",
+                      help="number of flanking BP on both upstream and downstream [default: 0]")
+    parser.add_option("--upstream", type="int", dest="upstreamBP",
+                      help="number of upstream flanking BP [default: 0]")
+    parser.add_option("--downstream", type="int", dest="downstreamBP",
+                      help="number of downstream flanking BP [default: 0]")
+    parser.add_option("--nocds", action="store_false", dest="doCDS",
+                      help="do not CDS")
+    parser.add_option("--raw", action="store_false", dest="normalizeBins",
+                      help="do not normalize results")
+    parser.add_option("--force", action="store_false", dest="limitNeighbor",
+                      help="limit neighbor region")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--cache", action="store_true", dest="doCache",
+                      help="use cache")
+
+    configParser = getConfigParser()
+    section = "geneLocusBins"
+    normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    bins = getConfigIntOption(configParser, section, "bins", 10)
+    flankBP = getConfigOption(configParser, section, "flankBP", None)
+    upstreamBP = getConfigOption(configParser, section, "upstreamBP", None)
+    downstreamBP = getConfigOption(configParser, section, "downstreamBP", None)
+    doCDS = getConfigBoolOption(configParser, section, "doCDS", True)
+    limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True)
+
+    parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP,
+                        upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS,
+                        limitNeighbor=limitNeighbor)
+
+    return parser
+
+def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
+                  normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
+                  acceptfile=None):
+
      if acceptfile is None:
          acceptDict = {}
      else:
          acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
-    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
      readlen = hitRDS.getReadSize()
      normalizationFactor = 1.0
      if normalizeBins:
@@ -86,9 +111,7 @@ def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, do
      hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  
      hg = Genome(genome)
-    idb = geneinfoDB(cache=doCache)
-
-    geneinfoDict = idb.getallGeneInfo(genome)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
      if doFlank:
          locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
      else:
@@ -97,9 +120,9 @@ def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, do
      gidList = hg.allGIDs()
      gidList.sort()
      for chrom in acceptDict:
-        for (label, start, stop, length) in acceptDict[chrom]:
-            if label not in gidList:
-                gidList.append(label)
+        for region in acceptDict[chrom]:
+            if region.label not in gidList:
+                gidList.append(region.label)
  
      (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
  
diff --git a/geneLocusCounts.py b/geneLocusCounts.py

index 0e8792b740c3c83b524a03c0084027ed0bc7766c..218017e10a0d9a04d0b1cd8adde8449ecb60dcb8 100755 (executable)
--- a/geneLocusCounts.py
+++ b/geneLocusCounts.py
@@ -17,12 +17,14 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
  
-print '%s: version 3.0' % sys.argv[0]
+print "geneLocusCounts: version 3.1"
  
  def main(argv=None):
      if not argv:
@@ -30,19 +32,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome readDB outfilename [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
-                      help="do not count unique reads")
-    parser.add_option("--multi", action="store_true", dest="doUniqs",
-                      help="count multi reads")
-    parser.add_option("--splices", action="store_true", dest="doUniqs",
-                      help="count splice reads")
-    parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
-    parser.add_option("--regions", dest="acceptfile")
-    parser.add_option("--noCDS", action="store_false", dest="useCDS")
-    parser.add_option("--locusLength", type="int", dest="bplength",
-                      help="number of bases to report")
-    parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -68,71 +58,107 @@ def main(argv=None):
      except ValueError:
          pass
  
-    geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
+    geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs,
+                    options.doMulti, options.doSplices, options.useCDS, options.spanTSS,
+                    options.bplength, options.acceptfile)
  
  
-def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
-    print 'returning only up to %d bp from gene locus' % bplength
-    print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
+                      help="do not count unique reads")
+    parser.add_option("--multi", action="store_true", dest="doUniqs",
+                      help="count multi reads")
+    parser.add_option("--splices", action="store_true", dest="doUniqs",
+                      help="count splice reads")
+    parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--noCDS", action="store_false", dest="useCDS")
+    parser.add_option("--locusLength", type="int", dest="bplength",
+                      help="number of bases to report")
+
+    configParser = getConfigParser()
+    section = "geneLocusCounts"
+    doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+    doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    useCDS = getConfigBoolOption(configParser, section, "useCDS", True)
+    spanTSS = getConfigBoolOption(configParser, section, "spanTSS", False)
+    bplength = getConfigIntOption(configParser, section, "bplength", 0)
+    acceptfile = getConfigOption(configParser, section, "acceptfile", "")
+
+    parser.set_defaults(doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices,
+                        useCDS=useCDS, spanTSS=spanTSS, bplength=bplength,
+                        acceptfile=acceptfile)
+
+    return parser
+
+
+def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0,
+                    doUniqs=True, doMulti=False, doSplices=False, useCDS=True,
+                    spanTSS=False, bplength=0, acceptfile=""):
+
+    print "returning only up to %d bp from gene locus" % bplength
+    print "upstream = %d downstream = %d useCDS = %s spanTSS = %s" % (upstream, downstream, useCDS, spanTSS)
  
      if acceptfile:
          acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
  
-    hitRDS = readDataset(hitfile, verbose = True)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True)
  
      totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
  
      hg = Genome(genome)
-    idb = geneinfoDB(cache=True)
-
-    gidCount = {}
-    gidList = []
-    gidLen = {}
-    geneinfoDict = idb.getallGeneInfo(genome)
-    locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
-
+    gidDict = {}
+    locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS=spanTSS, lengthCDS=bplength)
      locusChroms = locusByChromDict.keys()
      chromList = hitRDS.getChromosomes(fullChrom=False)
      chromList.sort()
      for chrom in chromList:
-        if chrom == 'M' or chrom not in locusChroms:
+        if doNotProcessChromosome(chrom, locusChroms):
              continue
  
-        print 'chr' + chrom
-        fullchrom = 'chr' + chrom
+        fullchrom = "chr%s" % chrom
+        print fullchrom
          hitRDS.memSync(fullchrom, index=True)
          for (start, stop, gid, length) in locusByChromDict[chrom]:
-            if gid not in gidList:
-                gidList.append(gid)
-                gidCount[gid] = 0
-                gidLen[gid] = length
+            if not gidDict.has_key(gid):
+                gidDict[gid] = {"count": 0, "length": length}
  
-            gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+            gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
  
-    outfile = open(outfilename,'w')
+    outfile = open(outfilename, "w")
  
      totalCount /= 1000000.
  
-    outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
+    outfile.write("#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n")
+    gidList = gidDict.keys()
      gidList.sort()
+    geneinfoDict = getGeneInfoDict(genome, cache=True)
      for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
              try:
                  geneinfo = geneinfoDict[gid]
                  symbol = geneinfo[0][0]
-            except:
+            except (KeyError, IndexError):
                  pass
          else:
              symbol = gid
  
-        if gid in gidCount and gid in gidLen:
-            rpm  = gidCount[gid] / totalCount
-            rpkm = 1000. * rpm / gidLen[gid]
-            outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
+        gidCount = gidDict[gid]["count"]
+        gidLength = gidDict[gid]["length"]
+        rpm  = gidCount / totalCount
+        rpkm = 1000. * rpm / gidLength
+        outfile.write("%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n" % (gid, symbol, gidCount, gidLength, rpm, rpkm))
  
      outfile.close()
  
+
+def doNotProcessChromosome(chrom, locusChroms):
+    return chrom == "M" or chrom not in locusChroms
+
+
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file
diff --git a/geneLocusPeaks.py b/geneLocusPeaks.py

index fdfddf95ccb821187b887876ea0aee2c5b283d48..00c77d812b3029eaacb294b94377b771bc7d4444 100755 (executable)
--- a/geneLocusPeaks.py
+++ b/geneLocusPeaks.py
@@ -9,12 +9,15 @@ try:
  except:
      pass
  
-from commoncode import readDataset, getMergedRegions, findPeak, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getLocusByChromDict
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
-import sys, optparse
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  
-print "%prog: version 2.0"
+
+print "geneLocusPeaks: version 2.1"
  
  def main(argv=None):
      if not argv:
@@ -22,13 +25,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--up", type="int", dest="upstream")
-    parser.add_option("--down", type="int", dest="downstream")
-    parser.add_option("--regions", dest="acceptfile")
-    parser.add_option("--raw", action="store_false", dest="normalize")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.set_defaults(upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -43,6 +40,27 @@ def main(argv=None):
      geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--up", type="int", dest="upstream")
+    parser.add_option("--down", type="int", dest="downstream")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+
+    configParser = getConfigParser()
+    section = "geneLocusPeaks"
+    upstream = getConfigIntOption(configParser, section, "upstream", 0)
+    downstream = getConfigIntOption(configParser, section, "downstream", 0)
+    acceptfile = getConfigOption(configParser, section, "acceptfile", "")
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+
+    parser.set_defaults(upstream=upstream, downstream=downstream, acceptfile=acceptfile, normalize=normalize, doCache=doCache)
+
+    return parser
+
+
  def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
      acceptDict = {}
  
@@ -51,7 +69,7 @@ def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, accep
  
      print "upstream = %d downstream = %d" % (upstream, downstream)
  
-    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
      readlen = hitRDS.getReadSize()
      normalizationFactor = 1.0
      if normalize:
@@ -61,19 +79,17 @@ def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, accep
      hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  
      hg = Genome(genome)
-    idb = geneinfoDB(cache=True)
-
      gidCount = {}
      gidPos = {}
-    geneinfoDict = idb.getallGeneInfo(genome)
+    geneinfoDict = getGeneInfoDict(genome, cache=True)
      locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
  
      gidList = hg.allGIDs()
      gidList.sort()
      for chrom in acceptDict:
-        for (label, start, stop, length) in acceptDict[chrom]:
-            if label not in gidList:
-                gidList.append(label)
+        for region in acceptDict[chrom]:
+            if region.label not in gidList:
+                gidList.append(region.label)
  
      for gid in gidList:
          gidCount[gid] = 0
@@ -85,10 +101,10 @@ def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, accep
          print chrom
          for (start, stop, gid, glen) in locusByChromDict[chrom]:
              gidCount[gid] = 0.
-            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[chrom], start, glen, readlen)
-            if len(topPos) > 0:
-                gidCount[gid] = smoothArray[topPos[0]]
-                gidPos[gid] = (chrom, start + topPos[0])
+            peak = findPeak(hitDict[chrom], start, glen, readlen)
+            if len(peak.topPos) > 0:
+                gidCount[gid] = peak.smoothArray[peak.topPos[0]]
+                gidPos[gid] = (chrom, start + peak.topPos[0])
              else:
                  gidPos[gid] = (chrom, start)
  
diff --git a/geneMrnaCounts.py b/geneMrnaCounts.py

index b905cf0f1317e2fbdb6af64008511870dd81c107..cf5065ab88c40279c7ca5dfb30b70bc99dfd2065 100755 (executable)
--- a/geneMrnaCounts.py
+++ b/geneMrnaCounts.py
@@ -6,11 +6,12 @@ except:
  
  import sys
  import optparse
-from commoncode import readDataset, getFeaturesByChromDict
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
  from cistematic.genomes import Genome
  from cistematic.core.geneinfo import geneinfoDB
  
-print "%s: version 5.1" % sys.argv[0]
+print "geneMrnaCounts: version 5.2"
  
  
  def main(argv=None):
@@ -19,21 +20,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome rdsfile outfilename [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--stranded", action="store_true", dest="trackStrand")
-    parser.add_option("--splices", action="store_true", dest="doSplices")
-    parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
-    parser.add_option("--multi", action="store_true", dest="doMulti")
-    parser.add_option("--models", dest="extendGenome")
-    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
-    parser.add_option("--searchGID", action="store_true", dest="searchGID")
-    parser.add_option("--countfeatures", action="store_true", dest="countFeats")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--markGID", action="store_true", dest="markGID")
-    parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
-                        extendGenome="", replaceModels=False, searchGID=False,
-                        countFeats=False, cachePages=None, markGID=False)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -49,6 +36,38 @@ def main(argv=None):
                     options.searchGID, options.countFeats, options.cachePages, options.markGID)
  
  
+def getParser(usage):
+    """Build the optparse parser for the geneMrnaCounts command line.
+
+    Option defaults are looked up in the "geneMrnaCounts" section of the
+    parser returned by getConfigParser(); the last argument of each
+    getConfig*Option call is presumably the fallback used when the option
+    is not configured (helpers live in commoncode - confirm there).
+
+    usage -- usage string passed to optparse.OptionParser
+
+    Returns the configured optparse.OptionParser.
+    Raises ValueError if a configured cachePages value is not an integer.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_true", dest="trackStrand")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
+    parser.add_option("--multi", action="store_true", dest="doMulti")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.add_option("--searchGID", action="store_true", dest="searchGID")
+    parser.add_option("--countfeatures", action="store_true", dest="countFeats")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--markGID", action="store_true", dest="markGID")
+
+    configParser = getConfigParser()
+    section = "geneMrnaCounts"
+    trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+    doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
+    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+    searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
+    countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    if cachePages is not None:
+        # --cache is declared type="int" and the value is later compared
+        # against an integer cache size, so a configured default must be an
+        # int as well (getConfigOption presumably returns text - confirm).
+        cachePages = int(cachePages)
+
+    markGID = getConfigBoolOption(configParser, section, "markGID", False)
+
+    parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
+                        extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
+                        countFeats=countFeats, cachePages=cachePages, markGID=markGID)
+
+    return parser
+
+
  def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
                     doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
                     searchGID=False, countFeats=False, cachePages=None, markGID=False):
@@ -73,7 +92,7 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
          cachePages = 100000
          doCache = False
  
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
      if cachePages > hitRDS.getDefaultCacheSize():
          hitRDS.setDBcache(cachePages)
  
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py

index 7acf0b92fd1d93da0c29d4bdbba08af6c7d2a937..38e853aec6172be26b448066692193f1537b6926 100755 (executable)
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -4,13 +4,15 @@ try:
  except:
      print 'psyco not running'
  
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  
-print '%s: version 4.1' % sys.argv[0]
+print "geneMrnaCountsWeighted: version 4.3"
  
  
  def main(argv=None):
@@ -19,21 +21,7 @@ def main(argv=None):
  
      usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--stranded", action="store_false", dest="ignoreSense")
-    parser.add_option("--uniq", action="store_true", dest="withUniqs")
-    parser.add_option("--multi", action="store_true", dest="withMulti")
-    parser.add_option("--record", action="store_true", dest="recording",
-                      help="ignored with uniq reads")
-    parser.add_option("--accept", dest="acceptfile")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--verbose", action="store_true", dest="doVerbose")
-    parser.add_option("--models", dest="extendGenome")
-    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
-    parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
-                        acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
-                        replaceModels=False)
-
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -46,13 +34,48 @@ def main(argv=None):
      outfilename = args[3]
  
      geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
-                           options.withUniqs, options.withMulti, options.recording,
+                           options.withUniqs, options.withMulti,
                             options.acceptfile, options.cachePages, options.doVerbose,
                             options.extendGenome, options.replaceModels)
  
  
+def makeParser(usage=""):
+    """Build the optparse parser for the geneMrnaCountsWeighted command line.
+
+    Option defaults are looked up in the "geneMrnaCountsWeighted" section of
+    the parser returned by getConfigParser(); the last argument of each
+    getConfig*Option call is presumably the fallback used when the option
+    is not configured (helpers live in commoncode - confirm there).
+
+    usage -- usage string passed to optparse.OptionParser
+
+    Returns the configured optparse.OptionParser.
+    Raises ValueError if a configured cachePages value is not an integer.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_false", dest="ignoreSense")
+    parser.add_option("--uniq", action="store_true", dest="withUniqs")
+    parser.add_option("--multi", action="store_true", dest="withMulti")
+    parser.add_option("--accept", dest="acceptfile")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+
+    configParser = getConfigParser()
+    section = "geneMrnaCountsWeighted"
+    ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True)
+    withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False)
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
+    acceptfile = getConfigOption(configParser, section, "acceptfile", None)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    if cachePages is not None:
+        # --cache is declared type="int" and the value is later compared
+        # against an integer cache size, so a configured default must be an
+        # int as well (getConfigOption presumably returns text - confirm).
+        cachePages = int(cachePages)
+
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+
+    parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti,
+                        acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome,
+                        replaceModels=replaceModels)
+
+    return parser
+
+
+#TODO: User-reported performance issue. Run times are long under these conditions:
+#    a small number of reads (~40-50M)
+#    all features on a single chromosome
+#
+#    The user states this has been a long-standing problem.
+
  def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
-                           withUniqs=False, withMulti=False, recording=False, acceptfile=None,
+                           withUniqs=False, withMulti=False, acceptfile=None,
                             cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  
      if (not withUniqs and not withMulti) or (withUniqs and withMulti):
@@ -62,70 +85,40 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
      if cachePages is not None:
          cacheGeneDB(genome)
          hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        idb = geneinfoDB(cache=True)
          print "%s cached" % genome
          doCache = True
      else:
          doCache = False
          cachePages = 0
          hg = Genome(genome, inRAM=True)
-        idb = geneinfoDB()
-
-    if acceptfile is not None:
-        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
-    else:
-        acceptDict = {}
-
-    if recording and withUniqs:
-        recording = False
  
      if extendGenome:
          if replaceModels:
              print "will replace gene models with %s" % extendGenome
          else:
              print "will extend gene models with %s" % extendGenome
-    else:
-        replaceModels = False
  
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace = replaceModels)
-    
-    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
      if cachePages > hitRDS.getDefaultCacheSize():
          hitRDS.setDBcache(cachePages)
  
-    readlen = hitRDS.getReadSize()
-
-    geneinfoDict = idb.getallGeneInfo(genome)
-    geneannotDict = hg.allAnnotInfo()
-    gidCount = {}
-    gidReadDict = {}
-
-    featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
-    gidList = hg.allGIDs()
+    allGIDs = set(hg.allGIDs())
+    if acceptfile is not None:
+        regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
+        for chrom in regionDict:
+            for region in regionDict[chrom]:
+                allGIDs.add(region.label)
+    else:
+        regionDict = {}
  
-    gidList.sort()
-    for chrom in acceptDict:
-        for (label, start, stop, length) in acceptDict[chrom]:
-            if label not in gidList:
-                gidList.append(label)
+    featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
  
-    for gid in gidList:
-        gidCount[gid] = 0
-        gidReadDict[gid] = []
-
-    uniqueCountDict = {}
+    gidReadDict = {}
      read2GidDict = {}
-
-    uniquecounts = open(countfile)
-    for line in uniquecounts:
-        fields = line.strip().split()
-        # add a pseudo-count here to ease calculations below
-        uniqueCountDict[fields[0]] = float(fields[-1]) + 1
-
-    uniquecounts.close()
-
-    outfile = open(outfilename, "w")
+    for gid in allGIDs:
+        gidReadDict[gid] = []
  
      index = 0
      if withMulti and not withUniqs:
@@ -133,124 +126,98 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
      else:
          chromList = hitRDS.getChromosomes(fullChrom=False)
  
-    for achrom in chromList:
-        if achrom not in featuresByChromDict:
+    readlen = hitRDS.getReadSize()
+    for chromosome in chromList:
+        if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
              continue
  
-        print "\n" + achrom + " ",
-        startFeature = 0
-        fullchrom = "chr" + achrom
+        print "\n%s " % chromosome,
+        fullchrom = "chr%s" % chromosome
          hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
-        featList = featuresByChromDict[achrom]
-        if ignoreSense:
-            for (tagStart, tagReadID) in hitDict[fullchrom]:
-                index += 1
-                if index % 100000 == 0:
-                    print "read %d" % index,
-
-                stopPoint = tagStart + readlen
-                if startFeature < 0:
-                    startFeature = 0
-
-                for (start, stop, gid, sense, ftype) in featList[startFeature:]:
-                    if tagStart > stop:
-                        startFeature += 1
-                        continue
-
-                    if start > stopPoint:
-                        startFeature -= 100
-                        break
-
-                    if start <= tagStart <= stop:
-                        try:
-                            gidReadDict[gid].append(tagReadID)
-                            if tagReadID in read2GidDict:
-                                if gid not in read2GidDict[tagReadID]:
-                                    read2GidDict[tagReadID].append(gid)
-                            else:
-                                read2GidDict[tagReadID] = [gid]
-
-                            gidCount[gid] += 1
-                        except:
-                            print "gid %s not in gidReadDict" % gid
-
-                        stopPoint = stop
-        else:
-            for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
-                index += 1
-                if index % 100000 == 0:
-                    print "read %d" % index,
-
-                stopPoint = tagStart + readlen
-                if startFeature < 0:
-                    startFeature = 0
-
-                for (start, stop, gid, sense, ftype) in featList[startFeature:]:
-                    if tagStart > stop:
-                        startFeature += 1
-                        continue
-
-                    if start > stopPoint:
-                        startFeature -= 100
-                        break
-
-                    if sense == "R":
-                        sense = "-"
-                    else:
-                        sense = "+"
-
-                    if start <= tagStart <= stop and sense == tSense:
-                        try:
-                            gidReadDict[gid].append(tagReadID)
-                            if tagReadID in read2GidDict:
-                                if gid not in read2GidDict[tagReadID]:
-                                    read2GidDict[tagReadID].append(gid)
-                            else:
-                                read2GidDict[tagReadID] = [gid]
-
-                            gidCount[gid] += 1
-                        except:
-                            print "gid %s not in gidReadDict" % gid
-
-                        stopPoint = stop
-
-    for gid in gidList:
-        if "FAR" not in gid:
-            symbol = "LOC" + gid
-            geneinfo = ""
+        featureList = featuresByChromDict[chromosome]
+
+        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        index = totalProcessedReads
+        for (tagReadID, gid) in readGidList:
              try:
-                geneinfo = geneinfoDict[gid]
-                if genome == "celegans":
-                    symbol = geneinfo[0][1]
+                gidReadDict[gid].append(tagReadID)
+                if tagReadID in read2GidDict:
+                    read2GidDict[tagReadID].add(gid)
                  else:
-                    symbol = geneinfo[0][0]
-            except:
-                try:
-                    symbol = geneannotDict[(genome, gid)][0]
-                except:
-                    symbol = "LOC" + gid
-        else:
-            symbol = gid
+                    read2GidDict[tagReadID] = set([gid])
+            except KeyError:
+                print "gid %s not in gidReadDict" % gid
  
-        tagCount = 0.
-        for readID in gidReadDict[gid]:
-            try:
-                tagValue = uniqueCountDict[gid]
-            except:
-                tagValue = 1
+    writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
+    if doCache:
+        uncacheGeneDB(genome)
  
-            tagDenom = 0.
-            for aGid in read2GidDict[readID]:
-                try:
-                    tagDenom += uniqueCountDict[aGid]
-                except:
-                    tagDenom += 1
  
-        try:
-            tagCount += tagValue / tagDenom
-        except ZeroDivisionError:
-            tagCount = 0
-    
+def doNotProcessChromosome(chromosome, chromosomeList):
+    """Return True when chromosome is not a member of chromosomeList."""
+    if chromosome in chromosomeList:
+        return False
+
+    return True
+
+
+def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+    """Pair each read on one chromosome with the features it starts in.
+
+    hitDict   -- mapping whose fullchrom entry is an iterable of reads; each
+                 read is indexed by "start" and "readID" and may carry "sense"
+    fullchrom -- key selecting the reads to process
+    featList  -- sequence of (start, stop, gid, sense, ftype) tuples;
+                 presumably sorted by position, since the scan keeps a
+                 moving start offset - TODO confirm against the caller
+    readlen   -- read length, used for the initial search window
+    index     -- running count of reads processed before this call
+
+    Returns (readGidList, index): a list of (readID, gid) pairs and the
+    running count advanced by one per read seen.
+    """
+
+    startFeature = 0
+    readGidList = []
+    ignoreSense = True
+    for read in hitDict[fullchrom]:
+        tagStart = read["start"]
+        tagReadID = read["readID"]
+        # once any read carries a sense, strand matching stays enabled for
+        # the remaining reads (ignoreSense is never reset to True)
+        if read.has_key("sense"):
+            tagSense = read["sense"]
+            ignoreSense = False
+
+        index += 1
+        # progress output every 100000 reads
+        if index % 100000 == 0:
+            print "read %d" % index,
+
+        stopPoint = tagStart + readlen
+        # the back-off below can push the offset negative; clamp it here
+        if startFeature < 0:
+            startFeature = 0
+
+        for (start, stop, gid, sense, ftype) in featList[startFeature:]:
+            # feature ends before this read starts: advance the offset
+            if tagStart > stop:
+                startFeature += 1
+                continue
+
+            # feature begins beyond the current window: stop scanning and
+            # step the offset back by 100 entries for the next read
+            if start > stopPoint:
+                startFeature -= 100
+                break
+
+            # translate the feature sense ("R" -> "-", anything else -> "+")
+            # so it can be compared with the read sense
+            if not ignoreSense:
+                if sense == "R":
+                    sense = "-"
+                else:
+                    sense = "+"
+
+            # a read matches when its start lies inside the feature and,
+            # if strand matching is on, the senses agree
+            if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
+                readGidList.append((tagReadID, gid))
+                # the window end becomes the matched feature's stop
+                stopPoint = stop
+
+    return readGidList, index
+
+
+def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
+
+    uniqueCountDict = {}
+    uniquecounts = open(countFilename)
+    for line in uniquecounts:
+        fields = line.strip().split()
+        # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
+        uniqueCountDict[fields[0]] = float(fields[-1]) + 1
+
+    uniquecounts.close()
+
+    genomeName = genome.genome
+    geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
+    geneannotDict = genome.allAnnotInfo()
+    outfile = open(outFilename, "w")
+    for gid in allGIDs:
+        symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
+        tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
          if doVerbose:
              print "%s %s %f" % (gid, symbol, tagCount)
  
@@ -258,8 +225,49 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
  
      outfile.close()
  
-    if doCache:
-        uncacheGeneDB(genome)
+
+def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
+    """Return the display symbol for gid.
+
+    Any gid containing "FAR" is returned unchanged.  Otherwise the symbol
+    comes from the first geneinfoDict record for gid (field 1 when
+    genomeName is "celegans", field 0 for every other genome), then from
+    the first geneannotDict entry keyed by (genomeName, gid), and finally
+    falls back to "LOC" followed by the gid.
+    """
+    if "FAR" in gid:
+        return gid
+
+    symbolField = 0
+    if genomeName == "celegans":
+        symbolField = 1
+
+    try:
+        return geneinfoDict[gid][0][symbolField]
+    except (KeyError, IndexError):
+        pass
+
+    try:
+        return geneannotDict[(genomeName, gid)][0]
+    except (KeyError, IndexError):
+        return "LOC%s" % gid
+
+
+def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
+    """Return the weighted read count for gid.
+
+    Each read listed in gidReadDict[gid] contributes the weight of gid
+    divided by the summed weights of every gid listed for that read in
+    read2GidDict.  A gid missing from uniqueCountDict has weight 1.
+
+    uniqueCountDict -- gid -> weight
+    gid             -- gene id to score
+    gidReadDict     -- gid -> iterable of read ids assigned to that gid
+    read2GidDict    -- read id -> iterable of gids the read was assigned to
+
+    Raises KeyError if gid is missing from gidReadDict or one of its reads
+    is missing from read2GidDict.
+    """
+    # the weight of gid does not depend on the read, so look it up once
+    # instead of once per read
+    tagValue = uniqueCountDict.get(gid, 1)
+
+    tagCount = 0.
+    for readID in gidReadDict[gid]:
+        tagDenom = 0.
+        for relatedGID in read2GidDict[readID]:
+            tagDenom += uniqueCountDict.get(relatedGID, 1)
+
+        try:
+            tagCount += tagValue / tagDenom
+        except ZeroDivisionError:
+            # a zero denominator (e.g. no gids recorded for the read)
+            # contributes nothing to the count
+            pass
+
+    return tagCount
  
  
  if __name__ == "__main__":
diff --git a/geneNeighbors.py b/geneNeighbors.py

index 8ec363c77fdbacaa39534d8debe1a65abb1962bb..221ffe215e2ca1c9b21560334329e7b0fbe1ba66 100755 (executable)
--- a/geneNeighbors.py
+++ b/geneNeighbors.py
@@ -9,12 +9,13 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import getMergedRegions, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigIntOption, getConfigBoolOption, getConfigOption
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
  
-print "%prog: version 2.4"
+print "geneNeighbors: version 2.5"
  
  
  def main(argv=None):
@@ -23,16 +24,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome outfilename [--regions acceptfile] [--downstream bp] [--upstream bp] [--mindist bp] [--minlocus bp] [--maxlocus bp] [--samesense]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--regions", dest="acceptFile")
-    parser.add_option("--downstream", type="int", dest="downMax")
-    parser.add_option("--upstream", type="int", dest="upMax")
-    parser.add_option("--mindist", type="int", dest="minDist")
-    parser.add_option("--minlocus", type="int", dest="minLocus")
-    parser.add_option("--maxlocus", type="int", dest="maxLocus")
-    parser.add_option("--samesense", action="store_true", dest="checkSense")
-    parser.set_defaults(acceptfile="", checkSense=False, downMax=10000000,
-                        upMax=10000000, minDist=0, minLocus=-1, maxLocus=10000000)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -49,6 +41,32 @@ def main(argv=None):
      print "\n%d genes matched" % index
  
  
+def getParser(usage):
+    """Build the optparse parser for the geneNeighbors command line.
+
+    Option defaults are looked up in the "geneNeighbors" section of the
+    parser returned by getConfigParser(); the last argument of each
+    getConfig*Option call is presumably the fallback used when the option
+    is not configured (helpers live in commoncode - confirm there).
+
+    usage -- usage string passed to optparse.OptionParser
+
+    Returns the configured optparse.OptionParser.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--regions", dest="acceptFile")
+    parser.add_option("--downstream", type="int", dest="downMax")
+    parser.add_option("--upstream", type="int", dest="upMax")
+    parser.add_option("--mindist", type="int", dest="minDist")
+    parser.add_option("--minlocus", type="int", dest="minLocus")
+    parser.add_option("--maxlocus", type="int", dest="maxLocus")
+    parser.add_option("--samesense", action="store_true", dest="checkSense")
+
+    configParser = getConfigParser()
+    section = "geneNeighbors"
+    acceptfile = getConfigOption(configParser, section, "acceptfile", "")
+    checkSense = getConfigBoolOption(configParser, section, "checkSense", False)
+    downMax = getConfigIntOption(configParser, section, "downMax", 10000000)
+    upMax = getConfigIntOption(configParser, section, "upMax", 10000000)
+    minDist = getConfigIntOption(configParser, section, "minDist", 0)
+    minLocus = getConfigIntOption(configParser, section, "minLocus", -1)
+    maxLocus = getConfigIntOption(configParser, section, "maxLocus", 10000000)
+
+    # --regions stores into "acceptFile", so the default has to be registered
+    # under that dest for a configured accept file to take effect.  The
+    # lower-case "acceptfile" key is kept for any caller that reads it.
+    parser.set_defaults(acceptFile=acceptfile, acceptfile=acceptfile, checkSense=checkSense, downMax=downMax,
+                        upMax=upMax, minDist=minDist, minLocus=minLocus, maxLocus=maxLocus)
+
+    return parser
+
+
  def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False,
                    downMax=10000000, upMax=10000000, minDist=0, minLocus=-1,
                    maxLocus=10000000):
@@ -58,17 +76,15 @@ def geneNeighbors(genome, outfilename, acceptfile="", checkSense=False,
          acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
  
      hg = Genome(genome)
-    idb = geneinfoDB(cache=True)
-
-    geneinfoDict = idb.getallGeneInfo(genome)
+    geneinfoDict = getGeneInfoDict(genome, cache=True)
      locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
  
      gidList = hg.allGIDs()
      gidList.sort()
      for chrom in acceptDict:
-        for (label, start, stop, length) in acceptDict[chrom]:
-            if label not in gidList:
-                gidList.append(label)
+        for region in acceptDict[chrom]:
+            if region.label not in gidList:
+                gidList.append(region.label)
  
      index = 0
      outfile = open(outfilename,"w")
diff --git a/geneStallingBins.py b/geneStallingBins.py

index f08abe6292ee9ba8cc7ea26039073f8ae2afb91d..4b869f2ff4b64f4d301eac701cf1e8f612ee61fd 100755 (executable)
--- a/geneStallingBins.py
+++ b/geneStallingBins.py
@@ -11,12 +11,14 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, computeRegionBins, getLocusByChromDict, getConfigParser, getConfigBoolOption, getConfigIntOption, getConfigOption
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
  
-print "%prog: version 1.3"
+print "geneStallingBins: version 1.4"
  
  
  def main(argv=None):
@@ -25,16 +27,7 @@ def main(argv=None):
  
      usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--upstream", type="int", dest="upstreamBp")
-    parser.add_option("--downstream", type="int", dest="downstreamBp")
-    parser.add_option("--regions", dest="acceptfile")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--normalize", action="store_true", dest="normalize")
-    parser.add_option("--tagCount", action="store_true", dest="doTagCount")
-    parser.add_option("--bins", type="int", dest="bins")
-    parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="",
-                        doCache=False, normalize=False, doTagCount=False, bins=4)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -51,6 +44,32 @@ def main(argv=None):
                       options.normalize, options.doTagCount, options.bins)
  
  
+def getParser(usage):
+    """Build the optparse parser for the geneStallingBins command line.
+
+    Option defaults are looked up in the "geneStallingBins" section of the
+    parser returned by getConfigParser().
+
+    usage -- usage string passed to optparse.OptionParser
+
+    Returns the configured optparse.OptionParser.
+    """
+    optionTable = (
+        ("--upstream", {"type": "int", "dest": "upstreamBp"}),
+        ("--downstream", {"type": "int", "dest": "downstreamBp"}),
+        ("--regions", {"dest": "acceptfile"}),
+        ("--cache", {"action": "store_true", "dest": "doCache"}),
+        ("--normalize", {"action": "store_true", "dest": "normalize"}),
+        ("--tagCount", {"action": "store_true", "dest": "doTagCount"}),
+        ("--bins", {"type": "int", "dest": "bins"}),
+    )
+    parser = optparse.OptionParser(usage=usage)
+    for (flag, settings) in optionTable:
+        parser.add_option(flag, **settings)
+
+    config = getConfigParser()
+    sectionName = "geneStallingBins"
+    parser.set_defaults(
+        upstreamBp=getConfigIntOption(config, sectionName, "upstreamBp", 300),
+        downstreamBp=getConfigIntOption(config, sectionName, "downstreamBp", 0),
+        acceptfile=getConfigOption(config, sectionName, "acceptfile", ""),
+        doCache=getConfigBoolOption(config, sectionName, "doCache", False),
+        normalize=getConfigBoolOption(config, sectionName, "normalize", False),
+        doTagCount=getConfigBoolOption(config, sectionName, "doTagCount", False),
+        bins=getConfigIntOption(config, sectionName, "bins", 4))
+
+    return parser
+
+
  def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
                       downstreamBp=0, acceptfile="", doCache=False, normalize=False,
                       doTagCount=False, bins=4):
@@ -62,14 +81,14 @@ def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
      doCDS = True
      limitNeighbor = False
  
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
      readlen = hitRDS.getReadSize()
      hitNormalizationFactor = 1.0
      if normalize:
          hitDictSize = len(hitRDS)
          hitNormalizationFactor = hitDictSize / 1000000.
  
-    controlRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    controlRDS = ReadDataset.ReadDataset(controlfile, verbose=True, cache=doCache)
      controlNormalizationFactor = 1.0
      if normalize:
          controlDictSize = len(hitRDS)
@@ -79,17 +98,15 @@ def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
      controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
  
      hg = Genome(genome)
-    idb = geneinfoDB(cache=doCache)
-
-    geneinfoDict = idb.getallGeneInfo(genome)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
      locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
  
      gidList = hg.allGIDs()
      gidList.sort()
      for chrom in acceptDict:
-        for (label, start, stop, length) in acceptDict[chrom]:
-            if label not in gidList:
-                gidList.append(label)
+        for region in acceptDict[chrom]:
+            if region.label not in gidList:
+                gidList.append(region.label)
  
      (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
      (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
diff --git a/geneStartBins.py b/geneStartBins.py

index cbb3c4a37d9d74cbbb356038c80912a6643200f1..d929d41a89a2da712b1ee953770e0b01bb2f96bb 100755 (executable)
--- a/geneStartBins.py
+++ b/geneStartBins.py
@@ -9,13 +9,13 @@ try:
  except:
      pass
  
-# originally from version 1.3 of geneDownstreamBins.py
+import sys
  from commoncode import *
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
-import sys
  
-print '%s: version 2.0' % sys.argv[0]
+
+print "geneStartBins: version 2.1"
  if len(sys.argv) < 4:
      print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
      print '\n\twhere regionSize is the optional maximum region in bp\n'
@@ -43,7 +43,7 @@ if '-cache' in sys.argv:
  bins = 10
  standardMinThresh = standardMinDist / bins
  
-hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
  readlen = hitRDS.getReadSize()
  normalizationFactor = 1.0
  if normalize:
@@ -51,13 +51,10 @@ if normalize:
      normalizationFactor = totalCount / 1000000.
  
  hg = Genome(genome)
-idb = geneinfoDB(cache=True)
-
  gidDict = {}
-geneinfoDict = idb.getallGeneInfo(genome)
+geneinfoDict = getGeneInfoDict(genome, cache=True)
  featuresDict = hg.getallGeneFeatures()
  
-#infile = open(infilename)
  outfile = open(outfilename,'w')
  
  gidList = hg.allGIDs()
@@ -72,46 +69,58 @@ for gid in gidList:
          symbol = geneinfo[0][0]
      except:
          print geneinfo
+
      newfeatureList = []
      if len(featureList) == 0:
          continue
+
      for (ftype, chrom, start, stop, fsense) in featureList:
          if (start, stop) not in newfeatureList:
              newfeatureList.append((start, stop))
+
      if chrom not in hitDict:
          continue
+
      newfeatureList.sort()
      if len(newfeatureList) < 1:
-        #print '%s %s %d' % (gid, symbol, -1)
-        #outfile.write('%s\t%s\t%d\n' % (gid, symbol, -1))
          continue
+
      glen = standardMinDist / 2
      if fsense == 'F':
          nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
          if nextGene < glen * 2:
-                glen = nextGene / 2
+            glen = nextGene / 2
+
          if glen < 1:
-                glen = 1
+            glen = 1
+
          gstart = newfeatureList[0][0] - glen
          if gstart < 0:
-                gstart = 0
+            gstart = 0
+
          gstop = newfeatureList[0][0]  + glen
      else:
          nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
          if nextGene < glen * 2:
              glen = nextGene / 2
+
          if glen < 1:
              glen = 1
+
          gstart = newfeatureList[-1][1] - glen
          gstop = newfeatureList[-1][1] + glen
      tagCount = 0
      if glen < standardMinDist / 2:
          continue
+
      binList = [0] * bins
-    for (tagStart, sense, weight) in hitDict[chrom]:
-        tagStart -= gstart 
+    for read in hitDict[chrom]:
+        tagStart = read["start"] - gstart
+        sense = read["sense"]
+        weight = read["weight"]
          if tagStart >= 2 * glen:
              break
+
          if tagStart > 0:
              tagCount += weight
              if fsense == 'R':
@@ -122,13 +131,16 @@ for gid in gidList:
                  rdist = 2 * glen - tagStart
                  binID = rdist / standardMinThresh 
                  binList[binID] += weight
+
      if tagCount < 2:
          continue
+
      print '%s %s %d %d %s' % (gid, symbol, tagCount, glen, str(binList))
      outfile.write('%s\t%s\t%d\t%d' % (gid, symbol, tagCount, glen))
      for binAmount in binList:
          outfile.write('\t%d' % binAmount)
+
      outfile.write('\n')
-#infile.close()
+
  outfile.close()
  
diff --git a/geneUpstreamBins.py b/geneUpstreamBins.py

index e8554161c6fe18417dd227a2bed985a0c826038c..3bdd1dd773c94372692ff066995445c680bd8055 100755 (executable)
--- a/geneUpstreamBins.py
+++ b/geneUpstreamBins.py
@@ -9,12 +9,13 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
  
-print "%prog: version 2.0"
+print "geneUpstreamBins: version 2.1"
  
  def main(argv=None):
      if not argv:
@@ -22,12 +23,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--raw", action="store_false", dest="normalize",
-                       help="maximum region in bp")
-    parser.add_option("--max", type="int", dest="standardMinDist")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -41,11 +37,29 @@ def main(argv=None):
      geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
  
  
+def makeParser(usage=""):
+    """ Build the geneUpstreamBins command line parser.
+
+    Option defaults are looked up in the "geneUpstreamBins" section of the
+    parser returned by getConfigParser(); the last argument of each
+    getConfig*Option call is presumably the fallback used when the config
+    has no such entry (helper semantics live in commoncode - confirm there).
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--raw", action="store_false", dest="normalize",
+                       help="do not normalize counts by the dataset read count")
+    # the "maximum region in bp" help text describes --max, not --raw
+    parser.add_option("--max", type="int", dest="standardMinDist",
+                       help="maximum region in bp")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+
+    configParser = getConfigParser()
+    section = "geneUpstreamBins"
+    # note: the config key is "regionSize" while the option dest is standardMinDist
+    standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+
+    parser.set_defaults(standardMinDist=standardMinDist, normalize=normalize, doCache=doCache)
+
+    return parser
+
+
  def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
      bins = 10
      standardMinThresh = standardMinDist / bins
  
-    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
      normalizationFactor = 1.0
      if normalize:
          totalCount = len(hitRDS)
@@ -54,9 +68,7 @@ def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normali
      hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  
      hg = Genome(genome)
-    idb = geneinfoDB(cache=True)
-
-    geneinfoDict = idb.getallGeneInfo(genome)
+    geneinfoDict = getGeneInfoDict(genome, cache=True)
      featuresDict = hg.getallGeneFeatures()
  
      outfile = open(outfilename,"w")
@@ -117,7 +129,9 @@ def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normali
              continue
  
          binList = [0] * bins
-        for (tagStart, sense, weight) in hitDict[chrom]:
+        for read in hitDict[chrom]:
+            tagStart = read["start"]
+            weight = read["weight"]
              tagStart -= gstart
              if tagStart >= glen:
                  break
diff --git a/getGOgenes.py b/getGOgenes.py

index 0a320ee9839f74912a4eb807b0771c17302d6421..4c86443e4b66e5b6dc9e908cd9c6714502ca0914 100755 (executable)
--- a/getGOgenes.py
+++ b/getGOgenes.py
@@ -1,8 +1,9 @@
-import sys, optparse
+import sys
+import optparse
  from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigBoolOption
  
-print "%prog: version 3.1"
+print "getGOgenes: version 3.2"
  
  def main(argv=None):
      if not argv:
@@ -10,11 +11,7 @@ def main(argv=None):
  
      usage = "usage: python %s genome GOID1 [GOID2 ....] [--outfile outfilename] [--append] [--restrict genefile]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--outfile", dest="outfilename")
-    parser.add_option("--append", action="store_true", dest="append")
-    parser.add_option("--restrict", dest="restrictfilename")
-    parser.set_defaults(outfilename=None, restrictfilename=None, append=False)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -31,6 +28,23 @@ def main(argv=None):
      getGOgenes(genome, GOIDlist, options.outfilename, options.restrictfilename, options.append)
  
  
+def makeParser(usage=""):
+    """ Create the getGOgenes option parser.
+
+    Defaults for outfilename, restrictfilename and append are taken from the
+    "getGOgenes" section of the parser returned by getConfigParser().
+    """
+    configParser = getConfigParser()
+    section = "getGOgenes"
+
+    # (dest / config key, lookup helper, value handed to the helper as default)
+    configDefaults = (
+        ("outfilename", getConfigOption, None),
+        ("restrictfilename", getConfigOption, None),
+        ("append", getConfigBoolOption, False),
+    )
+    defaults = {}
+    for (name, lookup, fallback) in configDefaults:
+        defaults[name] = lookup(configParser, section, name, fallback)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--outfile", dest="outfilename")
+    parser.add_option("--append", action="store_true", dest="append")
+    parser.add_option("--restrict", dest="restrictfilename")
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
  def getGOgenes(genome, GOIDlist, outfilename=None, restrictfilename=None, append=False):
      writeOut = False
      if outfilename is not None:
@@ -41,7 +55,6 @@ def getGOgenes(genome, GOIDlist, outfilename=None, restrictfilename=None, append
          restrict = True
      
      hg = Genome(genome)
-    idb = geneinfoDB()
  
      print sys.argv
      print GOIDlist
@@ -58,7 +71,7 @@ def getGOgenes(genome, GOIDlist, outfilename=None, restrictfilename=None, append
  
      geneList = geneDict.keys()
      print len(geneList)
-    geneInfoList = idb.getallGeneInfo(genome)
+    geneInfoList = getGeneInfoDict(genome)
  
      if writeOut:
          if append:
diff --git a/getNovelSNPs.py b/getNovelSNPs.py

index 0936a7f3cc2c1f967de17d695090b4a2154c4134..4091281c8fe0f5410090b23db7276496fe49d6ed 100755 (executable)
--- a/getNovelSNPs.py
+++ b/getNovelSNPs.py
@@ -11,7 +11,7 @@ import string
  from cistematic.genomes import Genome 
  from commoncode import writeLog
  
-print "%prog: version 1.5"
+print "getNovelSNPs: version 1.6"
  
  try:
      import psyco
diff --git a/getSNPGeneInfo.py b/getSNPGeneInfo.py

index 307413b866e804e831e2288a14a6c0984a55b5bf..edcb2ceae0103af5b810c4e98d5c0521544efd34 100755 (executable)
--- a/getSNPGeneInfo.py
+++ b/getSNPGeneInfo.py
@@ -16,9 +16,9 @@ import sys
  import optparse
  import string
  from cistematic.core import genesIntersecting, cacheGeneDB, uncacheGeneDB
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
  
-print "%prog: version 4.5"
+print "getSNPGeneInfo: version 4.6"
  
  def main(argv=None):
      if not argv:
@@ -26,11 +26,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome snpsfile rpkmfile dbsnp_geneinfo_outfile [--cache] [--withoutsense] [--flank bp]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cache", action="store_true", dest="cachePages")
-    parser.add_option("--withoutsense", action="store_false", dest="withSense")
-    parser.add_option("--flank", type="int", dest="flankBP")
-    parser.set_defaults(doCache=False, withSense=True, flankBP=0)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -45,6 +41,23 @@ def main(argv=None):
      writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, options.doCache, options.withSense, options.flankBP)
  
  
+def makeParser(usage=""):
+    """ Build the getSNPGeneInfo command line parser.
+
+    Defaults for doCache, withSense and flankBP are taken from the
+    "getSNPGeneInfo" section of the parser returned by getConfigParser().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    # --cache must store into doCache: that is the name given a default below
+    # and the attribute main() reads; dest="cachePages" left the flag inert
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--withoutsense", action="store_false", dest="withSense")
+    parser.add_option("--flank", type="int", dest="flankBP")
+
+    configParser = getConfigParser()
+    section = "getSNPGeneInfo"
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    withSense = getConfigBoolOption(configParser, section, "withSense", True)
+    flankBP = getConfigIntOption(configParser, section, "flankBP", 0)
+
+    parser.set_defaults(doCache=doCache, withSense=withSense, flankBP=flankBP)
+
+    return parser
+
+
  def writeSNPGeneInfo(genome, infilename, rpkmfilename, outfilename, doCache=False, withSense=True, flankBP=0):
  
      outList = getSNPGeneInfo(genome, infilename, rpkmfilename, doCache, withSense, flankBP)
@@ -85,12 +98,9 @@ def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=Tr
  
      if doCache:
          cacheGeneDB(genome)
-        idb = geneinfoDB(cache=True)
          print "cached %s" % genome
-    else:
-        idb = geneinfoDB()
  
-    geneinfoDict = idb.getallGeneInfo(genome)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
      geneDict = {}
  
      if flankBP > 0:
diff --git a/getSNPs.py b/getSNPs.py

index 0adde42c0b09a921f45c9edf01070077e68bf805..f6071ae444be9ca294ac4c4ff39b50a74e3d0cb4 100755 (executable)
--- a/getSNPs.py
+++ b/getSNPs.py
@@ -19,10 +19,12 @@
      totalRatioMin = total # of reads supporting a base change at position S / total # reads that pass through position S
  """
  
-import sys, optparse
-from commoncode import readDataset, writeLog
+import sys
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption
+import ReadDataset
  
-print "%prog: version 3.5"
+print "getSNPs: version 3.6"
  
  try:
      import psyco
@@ -41,11 +43,7 @@ def main(argv=None):
  
      usage = __doc__
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--nosplices", action="store_false", dest="doSplices")
-    parser.add_option("--enforceChr", action="store_true", dest="forceChr")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -65,6 +63,23 @@ def main(argv=None):
      writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, options.cachePages, options.doSplices, options.forceChr)
  
  
+def makeParser(usage=""):
+    """ Build the getSNPs command line parser.
+
+    Defaults for doSplices, forceChr and cachePages are taken from the
+    "getSNPs" section of the parser returned by getConfigParser().
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nosplices", action="store_false", dest="doSplices")
+    parser.add_option("--enforceChr", action="store_true", dest="forceChr")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "getSNPs"
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", True)
+    forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
+
+    # pass the looked-up values on to optparse; literal defaults here would
+    # silently discard whatever the config section supplied
+    parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
+
+    return parser
+
+
  def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache, cachePages=0, doSplices=True, forceChr=False):
      writeLog("snp.log", sys.argv[0], "rdsfile: %s uniqStartMin: %1.2f totalRatioMin: %1.2f" % (hitfile, uniqStartMin, totalRatioMin))
  
@@ -86,7 +101,7 @@ def writeSNPsToFile(hitfile, uniqStartMin, totalRatioMin, outfilename, doCache,
  
  def getSNPs(hitfile, uniqStartMin, totalRatioMin, doCache, cachePages=0, doSplices=True, forceChr=False):
  
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
      if cachePages > 20000:
          hitRDS.setDBcache(cachePages)
  
@@ -157,7 +172,9 @@ def getMatchDict(rds, chrom, withSplices=True):
      except:
          readDict[chrom] = []
  
-    for (start, stop) in readDict[chrom]:
+    for read in readDict[chrom]:
+        start = read["start"]
+        stop = read["stop"]
          if finalDict.has_key(start):
              finalDict[start].append(stop)
          else:
@@ -169,7 +186,14 @@ def getMatchDict(rds, chrom, withSplices=True):
          except:
              spliceDict[chrom] = []
  
-        for (start, stop) in spliceDict[chrom]:
+        for read in spliceDict[chrom]:
+            try:
+                start = read["startL"]
+                stop = read["stopL"]
+            except KeyError:
+                start = read["startR"]
+                stop = read["stopR"]
+
              if finalDict.has_key(start):
                  finalDict[start].append(stop)
              else:
diff --git a/getallNRSE.py b/getallNRSE.py

index c2e639f5c09b1ad4660256f49fe3d4aab7cc9bab..c02bab11ccfcc53c059d4796b1029d796f55bae8 100755 (executable)
--- a/getallNRSE.py
+++ b/getallNRSE.py
@@ -9,11 +9,12 @@ except:
  from cistematic.core import complement
  from cistematic.core.motif import Motif
  from cistematic.genomes import Genome
-from commoncode import readDataset, getMergedRegions, findPeak
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption, getConfigFloatOption
+import ReadDataset
  from pylab import *
  import matplotlib
  
-print '%s: version 3.4' % sys.argv[0]
+print 'getallNRSE: version 3.5'
  
  def main(argv=None):
      if not argv:
@@ -21,21 +22,7 @@ def main(argv=None):
  
      usage = "usage: python %s genome regionfile siteOutfile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--dataset", dest="chipfilename")
-    parser.add_option("--min", type="float", dest="minHeight")
-    parser.add_option("--minfraction", type="float", dest="minFraction")
-    parser.add_option("--plot", dest="plotname")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--raw", action="store_false", dest="normalize")
-    parser.add_option("--verbose", action="store_true", dest="doVerbose")
-    parser.add_option("--markov1", action="store_true", dest="doMarkov1")
-    parser.add_option("--peakdist", type="int", dest="maxpeakdist")
-    parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
-    parser.add_option("--motifdir", dest="motifDir")
-    parser.set_defaults(chipfilename="", minHeight=-2., minFraction=-2., plotname="",
-                        doCache=False, normalize=True, doVerbose=False, doMarkov1=False,
-                        maxpeakdist=None, fullOnly=False, motifDir="./")
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -53,6 +40,42 @@ def main(argv=None):
                 options.motifDir)
  
  
+def getParser(usage):
+    """ Create the getallNRSE option parser.
+
+    Every option default is looked up in the "getallNRSE" section of the
+    parser returned by getConfigParser(), using the dest name as the key.
+    """
+    configParser = getConfigParser()
+    section = "getallNRSE"
+
+    # (dest / config key, lookup helper, value handed to the helper as default)
+    configDefaults = (
+        ("chipfilename", getConfigOption, ""),
+        ("minHeight", getConfigFloatOption, -2.),
+        ("minFraction", getConfigFloatOption, -2.),
+        ("plotname", getConfigOption, ""),
+        ("doCache", getConfigBoolOption, False),
+        ("normalize", getConfigBoolOption, True),
+        ("doVerbose", getConfigBoolOption, False),
+        ("doMarkov1", getConfigBoolOption, False),
+        ("maxpeakdist", getConfigOption, None),
+        ("fullOnly", getConfigBoolOption, False),
+        ("motifDir", getConfigOption, "./"),
+    )
+    defaults = {}
+    for (name, lookup, fallback) in configDefaults:
+        defaults[name] = lookup(configParser, section, name, fallback)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--dataset", dest="chipfilename")
+    parser.add_option("--min", type="float", dest="minHeight")
+    parser.add_option("--minfraction", type="float", dest="minFraction")
+    parser.add_option("--plot", dest="plotname")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--markov1", action="store_true", dest="doMarkov1")
+    parser.add_option("--peakdist", type="int", dest="maxpeakdist")
+    parser.add_option("--fullOnly", action="store_true", dest="fullOnly")
+    parser.add_option("--motifdir", dest="motifDir")
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
  def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
                 minFraction=-2., plotname="", doCache=False, normalize=True,
                 doVerbose=False, doMarkov1=False, maxpeakdist=None, fullOnly=False,
@@ -69,7 +92,7 @@ def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
      doDataset = False
      normalizeBy = 1
      if chipfilename:
-        hitRDS = readDataset(chipfilename, verbose=doVerbose, cache=doCache)
+        hitRDS = ReadDataset.ReadDataset(chipfilename, verbose=doVerbose, cache=doCache)
          doDataset = True
          if normalize:
              normalizeBy = len(hitRDS) / 1000000.
@@ -107,8 +130,8 @@ def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
          if "rand" in rchrom or "M" in rchrom or "hap" in rchrom:
              continue
  
-        for (start, stop, length) in regions[rchrom]:
-            regionList.append((rchrom, start, length))
+        for region in regions[rchrom]:
+            regionList.append((rchrom, region.start, region.length))
  
      notFoundIndex = 0
      currentChrom = ""
@@ -120,12 +143,14 @@ def getallNRSE(genome, infilename, outfilename, chipfilename="", minHeight=-2.,
                  hitDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True)
                  currentChrom = rchrom
  
-            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[rchrom], start, length, doWeight=True)
+            peak = findPeak(hitDict[rchrom], start, length, doWeight=True)
+            topPos = peak.topPos
+            numHits = peak.numHits
              if len(topPos) == 0:
                  print "topPos error"
  
              peakpos = topPos[0]
-            peakscore = smoothArray[peakpos]
+            peakscore = peak.smoothArray[peakpos]
              if peakscore == 0.:
                  peakscore = -1.
  
diff --git a/getallgenes.py b/getallgenes.py

index addba3653451a96a2db445e239ec31726a84a49b..a717fe86f0881268f8745791dc94d97f6d161d09 100755 (executable)
--- a/getallgenes.py
+++ b/getallgenes.py
@@ -4,12 +4,12 @@ try:
  except:
      print 'psyco not running'
  
-import sys, optparse
-from cistematic.core import genesIntersecting, featuresIntersecting, cacheGeneDB, uncacheGeneDB
-from cistematic.core.geneinfo import geneinfoDB
-from cistematic.genomes import Genome
+import sys
+import optparse
+from cistematic.core import genesIntersecting, featuresIntersecting
+from commoncode import getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption, getGeneInfoDict, getGeneAnnotDict, getExtendedGeneAnnotDict
  
-print "%prog: version 5.5"
+print "getallgenes: version 5.6"
  
  
  def main(argv=None):
@@ -18,20 +18,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome regionfile outfile [--radius bp] [--nomatch nomatchfile] --trackfar --stranded --cache --compact [--step dist] [--startField colID]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--radius", type="int", dest="maxRadius")
-    parser.add_option("--nomatch", dest="nomatchfilename")
-    parser.add_option("--trackfar", action="store_true", dest="trackFar")
-    parser.add_option("--stranded", action="store_true", dest="trackStrand")
-    parser.add_option("--cache", action="store_true", dest="cachePages")
-    parser.add_option("--compact", action="store_true", dest="compact")
-    parser.add_option("--step", type="int", dest="step")
-    parser.add_option("--startField", type="int", dest="colID")
-    parser.add_option("--models", dest="extendGenome")
-    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
-    parser.set_defaults(maxRadius=20002, nomatchfilename="", step=None, trackFar=False,
-                        trackStrand=False, compact=False, colID=1, doCache=False,
-                        extendGenome="", replaceModels=False)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -48,15 +35,43 @@ def main(argv=None):
                  options.doCache, options.extendgenome, options.replaceModels)
  
  
+def getParser(usage):
+    """ Build the getallgenes command line parser.
+
+    Option defaults are taken from the "getallgenes" section of the parser
+    returned by getConfigParser(), using the dest name as the key.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--radius", type="int", dest="maxRadius")
+    parser.add_option("--nomatch", dest="nomatchfilename")
+    parser.add_option("--trackfar", action="store_true", dest="trackFar")
+    parser.add_option("--stranded", action="store_true", dest="trackStrand")
+    # --cache must store into doCache: that is the name given a default below
+    # and the attribute main() reads; dest="cachePages" left the flag inert
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--compact", action="store_true", dest="compact")
+    parser.add_option("--step", type="int", dest="step")
+    parser.add_option("--startField", type="int", dest="colID")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+
+    configParser = getConfigParser()
+    section = "getallgenes"
+    maxRadius = getConfigIntOption(configParser, section, "maxRadius", 20002)
+    nomatchfilename = getConfigOption(configParser, section, "nomatchfilename", "")
+    step = getConfigOption(configParser, section, "step", None)
+    trackFar = getConfigBoolOption(configParser, section, "trackFar", False)
+    trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
+    compact = getConfigBoolOption(configParser, section, "compact", False)
+    colID = getConfigIntOption(configParser, section, "colID", 1)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+
+    parser.set_defaults(maxRadius=maxRadius, nomatchfilename=nomatchfilename, step=step, trackFar=trackFar,
+                        trackStrand=trackStrand, compact=compact, colID=colID, doCache=doCache,
+                        extendGenome=extendGenome, replaceModels=replaceModels)
+
+    return parser
+
+
  def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilename="",
                  step=None, trackFar=False, trackStrand=False, compact=False, colID=1,
                  doCache=False, extendGenome="", replaceModels=False):
  
-    if doCache:
-        idb = geneinfoDB(cache=True)
-    else:
-        idb = geneinfoDB()
-
      if not step:
          step = maxRadius - 2
  
@@ -68,10 +83,7 @@ def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilenam
      infile = open(infilename)
      outfile = open(outfilename,"w")
  
-    if genome == "dmelanogaster":
-        geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
-    else:
-        geneinfoDict = idb.getallGeneInfo(genome)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
  
      posList = []
      altPosDict = {}
@@ -120,11 +132,10 @@ def getallgenes(genome, infilename, outfilename, maxRadius=20002, nomatchfilenam
      if maxRadius < step:
          step = maxRadius - 2
  
-    hg = Genome(genome, inRAM=True)
      if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace = replaceModels)
-
-    geneannotDict = hg.allAnnotInfo()
+        geneannotDict = getExtendedGeneAnnotDict(genome, extendGenome, replace=replaceModels, inRAM=True)
+    else:
+        geneannotDict = getGeneAnnotDict(genome, inRAM=True)
  
      for radius in range(1, maxRadius, step):
          print "radius %d" % radius
diff --git a/getallsites.py b/getallsites.py

index 39335e481f3b30be49f46f4889bf2435cfa9b109..4d6707615d9d7ea95c5caf64a48c532e934c7e5f 100755 (executable)
--- a/getallsites.py
+++ b/getallsites.py
@@ -8,9 +8,10 @@ except:
  from cistematic.core.motif import Motif, hasMotifExtension
  from cistematic.core import complement
  from cistematic.genomes import Genome
-from commoncode import readDataset, getMergedRegions, findPeak
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
  
-print "%prog: version 2.4"
+print "getallsites: version 2.5"
  
  
  def main(argv=None):
@@ -19,21 +20,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome motifFile motThreshold regionfile siteOutfile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--dataset", dest="chipfilename")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--best", action="store_true", dest="bestOnly",
-                      help="only report the best position for each region")
-    parser.add_option("--usepeak", action="store_true", dest="usePeak",
-                      help="use peak position and height from regions file")
-    parser.add_option("--printseq", action="store_true", dest="printSeq")
-    parser.add_option("--nomerge", action="store_true", dest="noMerge")
-    parser.add_option("--markov1", action="store_true", dest="doMarkov1")
-    parser.add_option("--rank", type="int", dest="useRank",
-                      help="return region ranking based on peak height ranking [requires --usepeak]")
-    parser.set_defaults(chipfilename="", doCache=False, bestOnly=False, usePeak=False,
-                        printSeq=False, doMarkov1=False, useRank=False, noMerge=False)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 5:
@@ -51,6 +38,37 @@ def main(argv=None):
                  options.useRank, options.noMerge)
  
  
+def getParser(usage):
+    """ Create the getallsites option parser.
+
+    Every option default is looked up in the "getallsites" section of the
+    parser returned by getConfigParser(), using the dest name as the key.
+    """
+    configParser = getConfigParser()
+    section = "getallsites"
+
+    # (dest / config key, lookup helper, value handed to the helper as default)
+    # NOTE(review): --rank is declared type="int" yet useRank is read with
+    # getConfigBoolOption - confirm a numeric rank in the config is handled.
+    configDefaults = (
+        ("chipfilename", getConfigOption, ""),
+        ("doCache", getConfigBoolOption, False),
+        ("bestOnly", getConfigBoolOption, False),
+        ("usePeak", getConfigBoolOption, False),
+        ("printSeq", getConfigBoolOption, False),
+        ("doMarkov1", getConfigBoolOption, False),
+        ("useRank", getConfigBoolOption, False),
+        ("noMerge", getConfigBoolOption, False),
+    )
+    defaults = {}
+    for (name, lookup, fallback) in configDefaults:
+        defaults[name] = lookup(configParser, section, name, fallback)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--dataset", dest="chipfilename")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--best", action="store_true", dest="bestOnly",
+                      help="only report the best position for each region")
+    parser.add_option("--usepeak", action="store_true", dest="usePeak",
+                      help="use peak position and height from regions file")
+    parser.add_option("--printseq", action="store_true", dest="printSeq")
+    parser.add_option("--nomerge", action="store_true", dest="noMerge")
+    parser.add_option("--markov1", action="store_true", dest="doMarkov1")
+    parser.add_option("--rank", type="int", dest="useRank",
+                      help="return region ranking based on peak height ranking [requires --usepeak]")
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
  def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chipfilename="",
                  doCache=False, bestOnly=False, usePeak=False, printSeq=False, doMarkov1=False,
                  useRank=False, noMerge=False):
@@ -90,7 +108,7 @@ def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chip
          doRDS = True
  
      if doRDS:
-        hitRDS = readDataset(chipfilename, verbose = True, cache=doCache)
+        hitRDS = ReadDataset.ReadDataset(chipfilename, verbose = True, cache=doCache)
  
      outfile = open(outfilename, "w")
  
@@ -101,11 +119,11 @@ def getallsites(genome, motfilename, motThreshold, infilename, outfilename, chip
              continue
  
          if usePeak:
-            for (start, stop, length, peakPos, peakHeight) in regions[chrom]:
-                regionList.append((peakHeight, chrom, start, length, peakPos))
+            for region in regions[chrom]:
+                regionList.append((region.peakHeight, chrom, region.start, region.length, region.peakPos))
          else:
-            for (start, stop, length) in regions[chrom]:
-                regionList.append((chrom, start, length))
+            for region in regions[chrom]:
+                regionList.append((chrom, region.start, region.length))
  
      if usePeak:
          regionList.sort()
diff --git a/getfasta.py b/getfasta.py

index 0b2faf9d4177b4ea20971e4b433fb17074399fc0..ea347e4aa44f1a21a0b7c54d11b407352ce91b6f 100755 (executable)
--- a/getfasta.py
+++ b/getfasta.py
@@ -9,11 +9,13 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, findPeak
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
  from cistematic.genomes import Genome
  
-print "%s: version 3.4" % sys.argv[0]
+print "getfasta: version 3.5"
  
  
  def main(argv=None):
@@ -22,18 +24,7 @@ def main(argv=None):
  
      usage = "usage: python %s genome regionfile outfilename [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--seqradius", type="int", dest="seqsize")
-    parser.add_option("--minreads", type="int", dest="minHitThresh")
-    parser.add_option("--returnTop", type="int", dest="topRegions")
-    parser.add_option("--maxsize", type="int", dest="maxsize")
-    parser.add_option("--usepeak", action="store_true", dest="usePeaks")
-    parser.add_option("--dataset", dest="hitfile")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--compact", action="store_true", dest="doCompact")
-    parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
-                        usePeaks=False, hitfile=None, doCache=False, doCompact=False)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -49,6 +40,33 @@ def main(argv=None):
               options.doCache, options.doCompact)
  
  
+def getParser(usage):  # Build the getfasta option parser; defaults are looked up in the "getfasta" config section.
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--seqradius", type="int", dest="seqsize")
+    parser.add_option("--minreads", type="int", dest="minHitThresh")
+    parser.add_option("--returnTop", type="int", dest="topRegions")
+    parser.add_option("--maxsize", type="int", dest="maxsize")
+    parser.add_option("--usepeak", action="store_true", dest="usePeaks")
+    parser.add_option("--dataset", dest="hitfile")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--compact", action="store_true", dest="doCompact")
+
+    configParser = getConfigParser()  # helper imported from commoncode; which file it reads is not visible here
+    section = "getfasta"
+    seqsize = getConfigIntOption(configParser, section, "seqsize", 50)  # last argument is presumably the fallback when the key is absent - confirm in commoncode
+    minHitThresh = getConfigIntOption(configParser, section, "minHitThresh", -1)
+    topRegions = getConfigIntOption(configParser, section, "topRegions", 0)
+    maxsize = getConfigIntOption(configParser, section, "maxsize", 300000000)
+    usePeaks = getConfigBoolOption(configParser, section, "usePeaks", False)
+    hitfile = getConfigOption(configParser, section, "hitFile", None)  # config key is spelled "hitFile", unlike the "hitfile" dest
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    doCompact = getConfigBoolOption(configParser, section, "doCompact", False)
+
+    parser.set_defaults(seqsize=seqsize, minHitThresh=minHitThresh, topRegions=topRegions, maxsize=maxsize,  # values given on the command line override these defaults
+                        usePeaks=usePeaks, hitfile=hitfile, doCache=doCache, doCompact=doCompact)
+
+    return parser
+
  def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
               maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
      doDataset = False
@@ -69,7 +87,7 @@ def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRe
      if usePeaks:
          ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
      elif doDataset:
-        hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+        hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
          ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
      else:
          ncregions = getDefaultRegion(mergedRegions, maxsize)
@@ -100,23 +118,24 @@ def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
  
  def getDefaultRegion(regionDict, maxsize):
      ncregions = {}
-    for chrom in regionDict:
-        ncregions[chrom] = []
+    for chromosome in regionDict:
+        ncregions[chromosome] = []
  
-    for achrom in regionDict:
-        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
-        for region in regionDict[achrom]:
-            (rstart, rstop, rlen) = region
+    for chromosome in regionDict:
+        print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+        for region in regionDict[chromosome]:
+            start = region.start
+            length = region.length
  
-            if rlen > maxsize:
-                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+            if length > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
                  continue
  
-            resultDict = {"start": rstart,
-                          "length": rlen,
+            resultDict = {"start": start,
+                          "length": length,
                            "topPos": [-1]
              }
-            ncregions[achrom].append(resultDict)
+            ncregions[chromosome].append(resultDict)
  
      return ncregions
  
@@ -124,25 +143,26 @@ def getDefaultRegion(regionDict, maxsize):
  def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
  
      ncregions = {}
-    for chrom in regionDict:
-        ncregions[chrom] = []
+    for chromosome in regionDict:
+        ncregions[chromosome] = []
  
-    for achrom in regionDict:
-        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
-        for region in regionDict[achrom]:
-            (rstart, rstop, rlen, peakPos, peakHeight) = region
+    for chromosome in regionDict:
+        print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+        for region in regionDict[chromosome]:
+            start = region.start
+            length = region.length
  
-            if rlen > maxsize:
-                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+            if length > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
                  continue
  
-            topPos = peakPos - rstart
-            if peakHeight > minHitThresh:
-                resultDict = {"start": rstart,
-                              "length": rlen,
+            topPos = region.peakPos - start
+            if region.peakHeight > minHitThresh:
+                resultDict = {"start": start,
+                              "length": length,
                                "topPos": [topPos]
                  }
-                ncregions[achrom].append(resultDict)
+                ncregions[chromosome].append(resultDict)
  
      return ncregions
  
@@ -152,29 +172,31 @@ def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
      readlen = hitRDS.getReadSize()
  
      ncregions = {}
-    for chrom in regionDict:
-        ncregions[chrom] = []
-
-    for achrom in regionDict:
-        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
-        for region in regionDict[achrom]:
-            (rstart, rstop, rlen) = region
-
-            if rlen > maxsize:
-                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+    for chromosome in regionDict:
+        ncregions[chromosome] = []
+
+    for chromosome in regionDict:
+        print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+        for region in regionDict[chromosome]:
+            start = region.start
+            stop = region.stop
+            length = region.length
+
+            if length > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize)
                  continue
  
-            thechrom = "chr%s" % achrom
+            thechrom = "chr%s" % chromosome
              print "."
-            hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
+            hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop)
              print "hitDict length: %d", len(hitDict[thechrom])
-            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
-            if numHits > minHitThresh:
-                resultDict = {"start": rstart,
-                              "length": rlen,
-                              "topPos": topPos
+            peak = findPeak(hitDict[thechrom], start, length, readlen)
+            if peak.numHits > minHitThresh:
+                resultDict = {"start": start,
+                              "length": length,
+                              "topPos": peak.topPos
                  }
-                ncregions[achrom].append(resultDict)
+                ncregions[chromosome].append(resultDict)
  
      return ncregions
  
diff --git a/getgosig.py b/getgosig.py

index b04dca6c77f385258dcda2ff4a8657ad59b3eeb0..59d4174a3a32509be91057996746a79f7a8eba41 100755 (executable)
--- a/getgosig.py
+++ b/getgosig.py
@@ -11,8 +11,10 @@ import sys
  import optparse
  import matplotlib
  from pylab import *
+from commoncode import getConfigParser, getConfigIntOption
  
-print "%prog: version 2.1"
+
+print "getgosig: version 2.2"
  
  def main(argv=None):
      if not argv:
@@ -20,11 +22,7 @@ def main(argv=None):
  
      usage = "usage: python %prog genome outimage gofileroot1 title1 cohortsize1 [gofileroot2 title2 cohortsize2 ...] [--fontsize pts] [--length in] [--width in]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--fontsize", type="int", dest="fontSize")
-    parser.add_option("--length", type="int", dest="length")
-    parser.add_option("--width", type="int", dest="width")
-    parser.set_defaults(fontSize=5, length=10, width=7)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 5:
@@ -46,6 +44,23 @@ def main(argv=None):
      getgosig(genome, imagename, fileroots, titles, options.fontSize, options.length, options.width)
  
  
+def makeParser(usage=""):  # Build the getgosig option parser; defaults are looked up in the "getgosig" config section.
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--fontsize", type="int", dest="fontSize")  # the usage text in main() gives this in pts
+    parser.add_option("--length", type="int", dest="length")  # the usage text in main() gives this in inches
+    parser.add_option("--width", type="int", dest="width")  # the usage text in main() gives this in inches
+
+    configParser = getConfigParser()  # helper imported from commoncode; which file it reads is not visible here
+    section = "getgosig"
+    fontSize = getConfigIntOption(configParser, section, "fontSize", 5)  # last argument is presumably the fallback when the key is absent - confirm in commoncode
+    length = getConfigIntOption(configParser, section, "length", 10)
+    width = getConfigIntOption(configParser, section, "width", 7)
+
+    parser.set_defaults(fontSize=fontSize, length=length, width=width)  # values given on the command line override these defaults
+
+    return parser
+
+
  def getgosig(genome, imagename, fileroots=[], titles=[], fontSize=5, length=10, width=7):
      hg = Genome(genome)
      allgodesc = hg.allGOterms()
diff --git a/getmers.py b/getmers.py

index c7c35ff8387d3fa91729c5cf6bd95473355db11f..6d1555edf90d6b8530c8dbf85b28e068f5b3c849 100755 (executable)
--- a/getmers.py
+++ b/getmers.py
@@ -16,7 +16,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print '%s: version 1.1' % argv[0]
+    print "getmers: version 1.2"
  
      if len(sys.argv) < 5:
          print 'usage: python %s genome merlen chrAny:start-stop outfile' % argv[0]
diff --git a/getsplicefa.py b/getsplicefa.py

index db8e2048aae93b16551fd9c62c2ff0e6fd55b433..cbca805a2426ace12e5121668cffecf2c646d3d5 100755 (executable)
--- a/getsplicefa.py
+++ b/getsplicefa.py
@@ -6,15 +6,16 @@ try:
      psyco.full()
  except:
      print "psyco not running"
-from cistematic.core import complement
+
  from cistematic.genomes import Genome
+from commoncode import getConfigParser, getConfigIntOption
  
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    verstring = "%prog: version 1.0"
+    verstring = "getsplicefa: version 1.1"
      print verstring
      delimiter = "|"
  
@@ -22,11 +23,7 @@ def main(argv=None):
              \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
              \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--verbose", action="store_true", dest="doVerbose",
-                      help="show verbose messages [default: False]")
-    parser.add_option("--spacer", type="int", dest="spacer",
-                      help="number of spacer NTs to use [default: 2")
+    parser = makeParser(usage)
      parser.set_defaults(doVerbose=False, spacer=2)
      (options, args) = parser.parse_args(argv[1:])
  
@@ -42,6 +39,23 @@ def main(argv=None):
      getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)
  
  
+def makeParser(usage=""):  # Build the getsplicefa option parser; defaults are looked up in the "getsplicefa" config section.
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--verbose", action="store_true", dest="doVerbose",
+                      help="show verbose messages [default: False]")
+    parser.add_option("--spacer", type="int", dest="spacer",
+                      help="number of spacer NTs to use [default: 2]")
+
+    configParser = getConfigParser()  # helper imported from commoncode; which file it reads is not visible here
+    section = "getsplicefa"
+    doVerbose = getConfigIntOption(configParser, section, "doVerbose", False)  # NOTE(review): boolean flag read with the int helper; getConfigBoolOption is not imported in this file
+    spacer = getConfigIntOption(configParser, section, "spacer", 2)
+
+    parser.set_defaults(doVerbose=doVerbose, spacer=spacer)  # NOTE(review): main() calls set_defaults(doVerbose=False, spacer=2) afterwards, which replaces these values
+
+    return parser
+
+
  def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
      spacerseq = "N" * spacer
  
@@ -87,7 +101,7 @@ def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False
          exonStopDict[name] = exonStops
          exonLengths = []
          for index in range(spliceCount + 1):
-            exonLengths.append(exonStops[index] - exonStarts[index])
+            exonLengths.append(exonStops[index] - exonStarts[index] + 1)
  
          exonLengthDict[name] = exonLengths
  
diff --git a/gointersects.py b/gointersects.py

index 0f747273889042191b21eb2b266597455f060f6d..52df647c254c859cf4b171a2436fcfee99e98574 100755 (executable)
--- a/gointersects.py
+++ b/gointersects.py
@@ -5,7 +5,7 @@
  
  import sys
  
-print "%s: version 1.0" % sys.argv[0]
+print "gointersects: version 1.1"
  
  def main(argv=None):
      if not argv:
diff --git a/hepg2.rds b/hepg2.rds

deleted file mode 100644 (file)

index 8674f76..0000000

Binary files a/hepg2.rds and /dev/null differ
diff --git a/intersects.py b/intersects.py

index 67e7d3506bccfbc73102c82de849d0eb8553f3fa..b8846cce1547b8d33caeefd9de39adc8308d708d 100755 (executable)
--- a/intersects.py
+++ b/intersects.py
@@ -3,9 +3,11 @@
  #  ENRAGE
  #
  
-import sys, optparse
+import sys
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  
-print 'version 2.0'
+print "intersects: version 2.1"
  
  def main(argv=None):
      if not argv:
@@ -13,16 +15,7 @@ def main(argv=None):
  
      usage = "usage: python %prog infile1 infile2 outfile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("-d", dest="delimiter")
-    parser.add_option("--file3", dest="infile3")
-    parser.add_option("-1", type="int", dest="matchfield1")
-    parser.add_option("-2", type="int", dest="matchfield2")
-    parser.add_option("-3", type="int", dest="matchfield3")
-    parser.add_option("-reject1", dest="reject1file")
-    parser.add_option("-trackGID", action="store_true", dest="trackGID")
-    parser.set_defaults(delimiter="\t", infile3=None, matchField1=0, matchField2=0,
-                        matchField3=0, rejectFileName="", trackGID=False)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -38,6 +31,32 @@ def main(argv=None):
                 options.rejectFileName, options.trackGID)
  
  
+def getParser(usage):  # Build the intersects option parser; defaults are looked up in the "intersects" config section.
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("-d", dest="delimiter")
+    parser.add_option("--file3", dest="infile3")
+    parser.add_option("-1", type="int", dest="matchfield1")  # NOTE(review): dest is "matchfield1" but the default below is set on "matchField1" - confirm which name main() reads
+    parser.add_option("-2", type="int", dest="matchfield2")  # NOTE(review): same dest/default spelling mismatch as -1
+    parser.add_option("-3", type="int", dest="matchfield3")  # NOTE(review): same dest/default spelling mismatch as -1
+    parser.add_option("--reject1", dest="rejectFileName")  # was "-reject1": optparse refuses multi-character single-dash options; dest now matches options.rejectFileName read by main()
+    parser.add_option("--trackGID", action="store_true", dest="trackGID")  # was "-trackGID", which optparse refuses to register
+
+    configParser = getConfigParser()  # helper imported from commoncode; which file it reads is not visible here
+    section = "intersects"  # was "geneMrnaCountsWeighted", presumably a copy/paste leftover from another script
+    delimiter = getConfigOption(configParser, section, "delimiter", "\t")
+    infile3 = getConfigOption(configParser, section, "infile3", None)
+    matchField1 = getConfigIntOption(configParser, section, "matchField1", 0)
+    matchField2 = getConfigIntOption(configParser, section, "matchField2", 0)
+    matchField3 = getConfigIntOption(configParser, section, "matchField3", 0)
+    rejectFileName = getConfigOption(configParser, section, "rejectFileName", "")  # empty by default, matching the intersects() signature
+    trackGID = getConfigBoolOption(configParser, section, "trackGID", False)
+
+    parser.set_defaults(delimiter=delimiter, infile3=infile3, matchField1=matchField1, matchField2=matchField2,  # values given on the command line override these defaults
+                        matchField3=matchField3, rejectFileName=rejectFileName, trackGID=trackGID)
+
+    return parser
+
+
  def intersects(infile1Name, infile2Name, outfileName, delimiter="\t", infile3Name=None,
                 matchField1=0, matchField2=0, matchField3=0, rejectFileName="", trackGID=False):
  
diff --git a/listGeneFeatures.py b/listGeneFeatures.py

index 607100d967fd3d7a3527a4f63a331ad365888811..eb2a7c630244a70c844777f55017c049b87aef23 100755 (executable)
--- a/listGeneFeatures.py
+++ b/listGeneFeatures.py
@@ -7,7 +7,7 @@ import sys
  from cistematic.genomes import Genome
  from commoncode import getMergedRegions, getFeaturesByChromDict
  
-print "%s: version 1.1" % sys.argv[0]
+print "listGeneFeatures: version 1.2"
  
  
  def main(argv=None):
diff --git a/makeGraphs.py b/makeGraphs.py

index 3965e5cd824867168291e193acebdef2cfc0f91b..5e3349ae713de73e792b1c2136a946956042b89b 100644 (file)
--- a/makeGraphs.py
+++ b/makeGraphs.py
@@ -1,4 +1,5 @@
-import sys, os
+import sys
+import os
  
  
  def getEdges(nodeList, shorten=False):
diff --git a/makeSNPtrack.py b/makeSNPtrack.py

index 23d8ac9a59322309896d5455e1612d54c7b283b4..b3f0e67f7b249ed093ad091d8bbbd234de2bff32 100755 (executable)
--- a/makeSNPtrack.py
+++ b/makeSNPtrack.py
@@ -14,7 +14,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "%s: version 1.2" % argv[0]
+    print "makeSNPtrack: version 1.3"
  
      if len(argv) < 4:
          print "usage: python %s snpfile trackname trackoutfile" % argv[0]
diff --git a/makebedfromrds.py b/makebedfromrds.py

index 924bc7e9f163ef4c9efe282cd82a643a3faaa310..a0f141699b947f99bc48a8518bce2f8e41ef36e8 100755 (executable)
--- a/makebedfromrds.py
+++ b/makebedfromrds.py
@@ -11,8 +11,10 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  
  PLUS_COLOR = "0,0,255"
  MINUS_COLOR = "255,0,0"
@@ -27,31 +29,14 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    verstring = "%prog: version 3.1"
+    verstring = "makebedfromrds: version 3.2"
      print verstring
  
      doPairs = False
      
      usage = "usage:  %prog trackLabel rdsFile bamFile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
-    parser.add_option("--nomulti", action="store_false", dest="withMulti")
-    parser.add_option("--splices", action="store_true", dest="doSplices")
-    parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
-    parser.add_option("--flag", dest="withFlag")
-    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
-    parser.add_option("--pairs", type="int", dest="pairDist")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
-    parser.add_option("--chrom", action="append", dest="chromList")
-    parser.add_option("--strand", dest="strand")
-    parser.add_option("-r", "--region", dest="region", type="string",
-                      help="samtools region string")
-    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, doSpliceColor=False,
-                        pairDist=None, withFlag="", useFlagLike=False, enforceChr=False,
-                        senseStrand="", allChrom=True, doCache=False, cachePages=100000,
-                        chromList=[])
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      try:
@@ -84,6 +69,43 @@ def main(argv=None):
                       options.allChrom, options.doCache, options.cachePages, options.chromList)
  
  
+def getParser(usage):  # Build the makebedfromrds option parser; defaults are looked up in the "makebedfromrds" config section.
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", type="int", dest="pairDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    parser.add_option("--strand", dest="strand")  # NOTE(review): stored as options.strand while the default below is set on "senseStrand" - confirm against main()
+
+    configParser = getConfigParser()  # helper imported from commoncode; which file it reads is not visible here
+    section = "makebedfromrds"
+    withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", True)  # True so that --nomulti (store_false) has an effect; matches the outputBedFromRds() default
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doSpliceColor = getConfigBoolOption(configParser, section, "doSpliceColor", False)
+    pairDist = getConfigOption(configParser, section, "pairDist", None)  # NOTE(review): a configured value presumably arrives as text while --pairs is parsed as int - confirm main() copes
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
+    enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+    senseStrand = getConfigOption(configParser, section, "senseStrand", "")
+    allChrom = getConfigBoolOption(configParser, section, "allChrom", True)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)  # int helper, because the value is compared with RDS.getDefaultCacheSize()
+
+    parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices, doSpliceColor=doSpliceColor,  # values given on the command line override these defaults
+                        pairDist=pairDist, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
+                        senseStrand=senseStrand, allChrom=allChrom, doCache=doCache, cachePages=cachePages,
+                        chromList=[])
+
+    return parser
+
+
  def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=True,
                       doSplices=False, doSpliceColor=False, doPairs=False, pairDist=1000000,
                       withFlag="", useFlagLike=False, enforceChr=False, senseStrand="",
@@ -94,7 +116,7 @@ def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=
          sys.exit(1)
  
      print "\nsample:"
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
  
      #check that this is better than the dataset's default cache size
      if cachePages > RDS.getDefaultCacheSize():
@@ -165,28 +187,46 @@ def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=
                      listLen = len(localList) - 1
                      localIndex = 0
                      while localIndex <= listLen:
+                        read = localList[localIndex]
                          try:
-                            (leftpos, leftsense, leftweight, lPairID) = localList[localIndex]
+                            leftpos = read["start"]
+                            leftsense = read["sense"]
+                            leftweight = read["weight"]
+                            lPairID = read["pairID"]
                              leftstop = leftpos + readlength - 1
                              lpart = 1
                              startList = [leftpos]
                              stopList = [leftstop]
-                        except:
-                            (leftpos, LLstop, LRstart, leftstop, leftsense, lPairID) = localList[localIndex]
+                        except KeyError:
+                            leftpos = read["startL"]
+                            LLstop = read["stopL"]
+                            LRstart = read["startR"]
+                            leftstop = read["stopL"]
+                            leftsense = read["sense"]
+                            lPairID = read["pairID"]
                              leftweight = 1.0
                              lpart = 2
                              startList = [leftpos, LRstart]
                              stopList = [LLstop, leftstop]
  
                          if localIndex < listLen:
+                            read = localList[localIndex + 1]
                              try:
-                                (rightpos, rightsense, rightweight, rPairID) = localList[localIndex + 1]
+                                rightpos = read["start"]
+                                rightsense = read["sense"]
+                                rightweight = read["weight"]
+                                rPairID= read["pairID"]
                                  rightstop = rightpos + readlength - 1
                                  rpart = 1
                                  rstartList = [rightpos]
                                  rstopList = [rightstop]
-                            except:
-                                (rightpos, RLstop, RRstart, rightstop, rightsense, rPairID) = localList[localIndex + 1]
+                            except KeyError:
+                                rightpos = read["startL"]
+                                RLstop = read["stopL"]
+                                RRstart = read["startR"]
+                                rightstop = read["stopR"]
+                                rightsense = read["sense"]
+                                rPairID = read["pairID"]
                                  rightweight = 1.0
                                  rpart = 2
                                  rstartList = [rightpos, RRstart]
@@ -229,7 +269,10 @@ def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=
              else:
                  hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, withWeight=True, withID=True, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, flagLike=useFlagLike)
                  try:
-                    for (pos, sense, weight, readID) in hitDict[achrom]:
+                    for read in hitDict[achrom]:
+                        pos = read["start"]
+                        sense = read["sense"]
+                        readID = read["readID"]
                          splitReadWrite(outfile, achrom, 1, [pos], [pos + readlength - 1], sense, readID, PLUS_COLOR, MINUS_COLOR)
                          index += 1
                  except:
@@ -239,7 +282,13 @@ def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=
                      spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
                      if achrom not in spliceDict:
                          continue
-                    for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+                    for read in spliceDict[achrom]:
+                        readstart = read["startL"]
+                        Lstop = read["stopL"]
+                        Rstart = read["startR"]
+                        readstop = read["stopR"]
+                        rsense = read["sense"]
+                        readName = read["readID"]
                          splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
                          index += 1
  
@@ -254,7 +303,13 @@ def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=
              spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
              if achrom not in spliceDict:
                  continue
-            for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
+            for read in spliceDict[achrom]:
+                readstart = read["startL"]
+                Lstop = read["stopL"]
+                Rstart = read["startR"]
+                readstop = read["stopR"]
+                rsense = read["sense"]
+                readName = read["readID"]
                  splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
                  index += 1
  
diff --git a/makerdsfrombed.py b/makerdsfrombed.py

index 4f38d51ffd2e470d3cf91d9e2d5e38c84f2f6355..83ad010f73aab7e7994f28422c7474d3395a3b2b 100755 (executable)
--- a/makerdsfrombed.py
+++ b/makerdsfrombed.py
@@ -10,10 +10,13 @@ try:
  except:
      pass
  
-import sys, string, optparse
-from commoncode import readDataset, writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigIntOption, getConfigBoolOption
+import ReadDataset
  
-verstring = "%prog: version 2.1" % sys.argv[0]
+verstring = "makerdsfrombed: version 2.2"
  print verstring
  
  
@@ -23,12 +26,7 @@ def main(argv=None):
  
      usage = "usage: python %prog label bedfile outrdsfile [--append] [--index] [propertyName::propertyValue] [--cache numPages]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--append", action="store_false", dest="init")
-    parser.add_option("--index", action="store_true", dest="doIndex")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--RNA", action="store_true", dest="rnaDataType")
-    parser.set_defaults(init=True, rnaDataType=False, doIndex=False, cachePages=100000)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -54,6 +52,25 @@ def main(argv=None):
      makerdsfrombed(label, filename, outdbname, options.init, dataType, options.doIndex, options.cachePages, propertyList)
  
  
+def makeParser(usage=""):
+    """Build the makerdsfrombed option parser; defaults are fetched through the config helpers."""
+    optionParser = optparse.OptionParser(usage=usage)
+    optionParser.add_option("--append", action="store_false", dest="init")
+    optionParser.add_option("--index", action="store_true", dest="doIndex")
+    optionParser.add_option("--cache", type="int", dest="cachePages")
+    optionParser.add_option("--RNA", action="store_true", dest="rnaDataType")
+
+    cfg = getConfigParser()
+    cfgSection = "makerdsfrombed"
+    optionParser.set_defaults(
+        init=getConfigBoolOption(cfg, cfgSection, "init", True),
+        rnaDataType=getConfigBoolOption(cfg, cfgSection, "RNA", False),
+        doIndex=getConfigBoolOption(cfg, cfgSection, "doIndex", False),
+        cachePages=getConfigIntOption(cfg, cfgSection, "cachePages", 100000))
+
+    return optionParser
+
+
  def makerdsfrombed(label, filename, outdbname, init=True, dataType="DNA", doIndex=False, cachePages=100000, propertyList=[]):
      readsize = 0
      padsize = 0
@@ -64,7 +81,7 @@ def makerdsfrombed(label, filename, outdbname, init=True, dataType="DNA", doInde
  
      infile = open(filename,"r")
  
-    rds = readDataset(outdbname, init, dataType, verbose=True)
+    rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
      if not init:
          rds.dropIndex()
  
diff --git a/makerdsfromblat.py b/makerdsfromblat.py

index f92d5f512c43f18d13b709241f991b4d985fd38b..37576ca867fb2283f9a5d497c0bbca7d06191f7e 100755 (executable)
--- a/makerdsfromblat.py
+++ b/makerdsfromblat.py
@@ -11,10 +11,13 @@ try:
  except:
      pass
  
-import sys, string, optparse
-from commoncode import readDataset, writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
  
-verstring = "%prog: version 3.9"
+verstring = "makerdsfromblat: version 3.10"
  print verstring
  
  def main(argv=None):
@@ -23,19 +26,7 @@ def main(argv=None):
  
      usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--append", action="store_false", dest="init")
-    parser.add_option("--index", action="store_true", dest="doIndex")
-    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
-    parser.add_option("--forceRNA", action="store_true", dest="forceRNA")
-    parser.add_option("--flag", action="store_true", dest="flagReads")
-    parser.add_option("--strict", type="int", dest="minSpliceLength",
-                      help="min required bp on each side of a splice")
-    parser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
-    parser.add_option("--verbose", action="store_true", dest="verbose")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--RNA", dest="geneDataFileName")
-    parser.set_defaults(init=True, doIndex=False, trimReadID=True, minSpliceLength=0, forceRNA=False, flagReads=False, spliceOnly=False, verbose=False, cachePages=100000, geneDataFileName="")
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -67,6 +58,40 @@ def main(argv=None):
                     options.cachePages, options.geneDataFileName, propertyList)
  
  
+def getParser(usage):
+    """Build the makerdsfromblat command-line parser.
+    Option defaults are looked up through the config helpers."""
+    optionParser = optparse.OptionParser(usage=usage)
+    optionParser.add_option("--append", action="store_false", dest="init")
+    optionParser.add_option("--index", action="store_true", dest="doIndex")
+    optionParser.add_option("--rawreadID", action="store_false", dest="trimReadID")
+    optionParser.add_option("--forceRNA", action="store_true", dest="forceRNA")
+    optionParser.add_option("--flag", action="store_true", dest="flagReads")
+    optionParser.add_option("--strict", type="int", dest="minSpliceLength",
+                            help="min required bp on each side of a splice")
+    optionParser.add_option("--spliceonly", action="store_true", dest="spliceOnly")
+    optionParser.add_option("--verbose", action="store_true", dest="verbose")
+    optionParser.add_option("--cache", type="int", dest="cachePages")
+    optionParser.add_option("--RNA", dest="geneDataFileName")
+
+    # One helper lookup per option; the literal passed last is the value used before config support.
+    cfg = getConfigParser()
+    cfgSection = "makerdsfromblat"
+    optionParser.set_defaults(
+        init=getConfigBoolOption(cfg, cfgSection, "init", True),
+        doIndex=getConfigBoolOption(cfg, cfgSection, "doIndex", False),
+        trimReadID=getConfigBoolOption(cfg, cfgSection, "trimReadID", True),
+        minSpliceLength=getConfigIntOption(cfg, cfgSection, "minSpliceLength", 0),
+        forceRNA=getConfigBoolOption(cfg, cfgSection, "forceRNA", False),
+        flagReads=getConfigBoolOption(cfg, cfgSection, "flagReads", False),
+        spliceOnly=getConfigBoolOption(cfg, cfgSection, "spliceOnly", False),
+        verbose=getConfigBoolOption(cfg, cfgSection, "verbose", False),
+        cachePages=getConfigIntOption(cfg, cfgSection, "cachePages", 100000),
+        geneDataFileName=getConfigOption(cfg, cfgSection, "geneDataFileName", ""))
+
+    return optionParser
+
+
  def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
                      doIndex=False,trimReadID=True, minSpliceLength=0,
                      forceRNA=False, theFlag="", spliceOnly=False,
@@ -115,7 +140,7 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
  
          genedatafile.close()
  
-    rds = readDataset(outdbname, init, dataType, verbose=True)
+    rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
  
      #check that our cacheSize is better than the dataset's default cache size
      defaultCacheSize = rds.getDefaultCacheSize()
diff --git a/makerdsfrombowtie.py b/makerdsfrombowtie.py

index 3534a8827c9538fdc05ba193cd792ef7bf5aab72..14be260cdf0721b0514098a22d3a07423756c41b 100755 (executable)
--- a/makerdsfrombowtie.py
+++ b/makerdsfrombowtie.py
@@ -11,10 +11,13 @@ try:
  except:
      pass
  
-import sys, string, optparse
-from commoncode import readDataset, writeLog
+import sys
+import string
+import optparse
+from commoncode import writeLog, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+import ReadDataset
  
-verstring = "%prog: version 4.1"
+verstring = "makerdsfrombowtie: version 4.2"
  print verstring
  
  def main(argv=None):
@@ -23,21 +26,7 @@ def main(argv=None):
  
      usage = "usage: python %prog label infilename outrdsfile [propertyName::propertyValue] [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--RNA", dest="genedatafilename")
-    parser.add_option("--append", action="store_false", dest="init")
-    parser.add_option("--index", action="store_true", dest="doIndex")
-    parser.add_option("--spacer", type="int", dest="spacer")
-    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
-    parser.add_option("--forcepair", type="int", dest="forceID")
-    parser.add_option("--flip", action="store_true", dest="flip")
-    parser.add_option("--verbose", action="store_true", dest="verbose")
-    parser.add_option("--strip", action="store_true", dest="stripSpace")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.set_defaults(genedatafilename=None, init=True, doIndex=False, spacer=2,
-                        trimReadID=True, forceID=None, flip=False, verbose=False,
-                        stripSpace=False, cachePages=100000)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -60,6 +49,39 @@ def main(argv=None):
                        propertyList)
  
  
+def getParser(usage):
+    """Build the makerdsfrombowtie option parser; defaults are looked up through the config helpers."""
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--RNA", dest="genedatafilename")
+    parser.add_option("--append", action="store_false", dest="init")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--spacer", type="int", dest="spacer")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID")
+    parser.add_option("--forcepair", type="int", dest="forceID")
+    parser.add_option("--flip", action="store_true", dest="flip")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--strip", action="store_true", dest="stripSpace")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    configParser = getConfigParser()
+    section = "makerdsfrombowtie"  # section is spelled like the script name, with no embedded space
+    genedatafilename = getConfigOption(configParser, section, "genedatafilename", None)
+    init = getConfigBoolOption(configParser, section, "init", True)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    spacer = getConfigIntOption(configParser, section, "spacer", 2)
+    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
+    forceID = getConfigOption(configParser, section, "forceID", None)
+    if forceID is not None:
+        forceID = int(forceID)  # --forcepair is type="int"; give a configured default the same type
+    flip = getConfigBoolOption(configParser, section, "flip", False)
+    verbose = getConfigBoolOption(configParser, section, "verbose", False)
+    stripSpace = getConfigBoolOption(configParser, section, "stripSpace", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    parser.set_defaults(genedatafilename=genedatafilename, init=init, doIndex=doIndex, spacer=spacer,
+                        trimReadID=trimReadID, forceID=forceID, flip=flip, verbose=verbose,
+                        stripSpace=stripSpace, cachePages=cachePages)
+    return parser
+
+
  def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=True,
                        doIndex=False, spacer=2, trimReadID=True, forceID=None,
                        flip=False, verbose=False, stripSpace=False, cachePages=100000,
@@ -112,7 +134,7 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
  
          genedatafile.close()
  
-    rds = readDataset(outdbname, init, dataType, verbose=True)
+    rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
  
      #check that our cacheSize is better than the dataset's default cache size
      defaultCacheSize = rds.getDefaultCacheSize()
diff --git a/makerdsfromeland2.py b/makerdsfromeland2.py

index 317ceda90359c6cda17541af71b39e400eee8b80..66209eef973f36072ba964aac73f8579685aed99 100755 (executable)
--- a/makerdsfromeland2.py
+++ b/makerdsfromeland2.py
@@ -8,38 +8,23 @@ try:
  except:
      pass
  
-import sys, string, optparse
-from commoncode import readDataset
+import sys
+import string
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    verstring = "%prog: version 3.4"
+    verstring = "makerdsfromeland2: version 3.5"
      print verstring
  
      usage = "usage:  %prog label infilename outrdsfile [propertyName::propertyValue] [options]\
              \ninput reads must be sorted to properly record multireads"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--append", action="store_false", dest="init",
-                      help="append to existing rds file [default: create new]")
-    parser.add_option("--RNA", dest="geneDataFileName",
-                      help="set data type to RNA [default: DNA]")
-    parser.add_option("--index", action="store_true", dest="doIndex",
-                      help="index the output rds file")
-    parser.add_option("--cache", type="int", dest="cachePages",
-                      help="number of cache pages to use [default: 100000")
-    parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
-                      help="use : as the delimiter")
-    parser.add_option("--paired", dest="pairID",
-                      help="pairID value")
-    parser.add_option("--extended", action="store_true", dest="extended",
-                      help="use eland_extended input")
-    parser.add_option("--verbose", action="store_true", dest="verbose")
-    parser.add_option("--maxlines", type="int", dest="maxLines",
-                      help="[default: 1000000000")
-    parser.set_defaults(init=True, doIndex=False, cachePages=100000, geneDataFileName=None, useOldDelimiter=False, pairID=None, maxLines=1000000000, extended=False, verbose=False)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -68,10 +53,54 @@ def main(argv=None):
      if options.geneDataFileName is not None:
          dataType = 'RNA'
  
-    makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init, options.pairID, dataType, options.geneDataFileName, options.cachePages, options.maxLines, options.extended, options.verbose)
+    makeRDSFromEland2(label, filename, outdbname, options.doIndex, delimiter, paired, options.init,
+                      options.pairID, dataType, options.geneDataFileName, options.cachePages,
+                      options.maxLines, options.extended, options.verbose)
+
+
+def getParser(usage):
+    """Build the makerdsfromeland2 option parser; defaults are looked up through the config helpers."""
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", dest="geneDataFileName",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000]")
+    parser.add_option("--olddelimiter", action="store_true", dest="useOldDelimiter",
+                      help="use : as the delimiter")
+    parser.add_option("--paired", dest="pairID",
+                      help="pairID value")
+    parser.add_option("--extended", action="store_true", dest="extended",
+                      help="use eland_extended input")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--maxlines", type="int", dest="maxLines",
+                      help="[default: 1000000000]")
+
+    configParser = getConfigParser()
+    section = "makerdsfromeland2"
+    init = getConfigBoolOption(configParser, section, "init", True)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    geneDataFileName = getConfigOption(configParser, section, "geneDataFileName", None)
+    useOldDelimiter = getConfigBoolOption(configParser, section, "useOldDelimiter", False)
+    pairID = getConfigOption(configParser, section, "pairID", None)
+    maxLines = getConfigIntOption(configParser, section, "maxLines", 1000000000)
+    extended = getConfigBoolOption(configParser, section, "extended", False)
+    verbose = getConfigBoolOption(configParser, section, "verbose", False)
+
+    parser.set_defaults(init=init, doIndex=doIndex, cachePages=cachePages,
+                        geneDataFileName=geneDataFileName, useOldDelimiter=useOldDelimiter,
+                        pairID=pairID, maxLines=maxLines, extended=extended, verbose=verbose)
+    return parser
+
  
+def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False,
+                      init=True, pairID="1", dataType="DNA", geneDataFileName=None,
+                      cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
  
-def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|", paired=False, init=True, pairID="1", dataType="DNA", geneDataFileName=None, cachePages=100000, maxLines=1000000000, extended=False, verbose=False):
      maxBorder = 0
      index = 0
      insertSize = 100000
@@ -104,7 +133,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
              mapDict[uname] = []
          genedatafile.close()
  
-    rds = readDataset(outdbname, init, dataType, verbose=True)
+    rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
  
      if cachePages > rds.getDefaultCacheSize():
          if init:
diff --git a/makesitetrack.py b/makesitetrack.py

index c6d0b8ec3d87987950c4876d9e70f4120e9dee0a..6537bea791e3cfd58ccf95c8c89061b14797a07e 100755 (executable)
--- a/makesitetrack.py
+++ b/makesitetrack.py
@@ -3,9 +3,12 @@
  #  ENRAGE
  #
  
-import sys, string, optparse
+import sys
+import string
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
  
-print "%prog: version 2.1"
+print "makesitetrack: version 2.2"
  
  
  def main(argv=None):
@@ -14,13 +17,7 @@ def main(argv=None):
  
      usage = "usage: python %prog sitefile outbedfile [--noheader] [--stype fieldID] [--color xx,yy,zz] [--append] [--exploded]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--noheader", action="store_true", dest="noHeader")
-    parser.add_option("--stype", type="int", dest="stypeID")
-    parser.add_option("--color", dest="color")
-    parser.add_option("--append", action="store_true", dest="append")
-    parser.add_option("--exploded", action="store_false", dest="compact")
-    parser.set_defaults(stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -33,6 +30,27 @@ def main(argv=None):
      makesitetrack(infile, outfileName, options.stypeID, options.color, options.append, options.compact, options.noHeader)
  
  
+def makeParser(usage=""):
+    """Build the makesitetrack option parser; defaults are looked up through the config helpers."""
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--noheader", action="store_true", dest="noHeader")
+    parser.add_option("--stype", type="int", dest="stypeID")
+    parser.add_option("--color", dest="color")
+    parser.add_option("--append", action="store_true", dest="append")
+    parser.add_option("--exploded", action="store_false", dest="compact")
+    configParser = getConfigParser()
+    section = "makesitetrack"
+    stypeID = getConfigOption(configParser, section, "stypeID", None)
+    if stypeID is not None:
+        stypeID = int(stypeID)  # --stype is type="int"; give a configured default the same type
+    color = getConfigOption(configParser, section, "color", "0,0,0")
+    append = getConfigBoolOption(configParser, section, "append", False)
+    compact = getConfigBoolOption(configParser, section, "compact", True)
+    noHeader = getConfigBoolOption(configParser, section, "noHeader", False)
+    parser.set_defaults(stypeID=stypeID, color=color, append=append, compact=compact, noHeader=noHeader)
+    return parser
+
+
  def makesitetrack(infileName, outFileName, stypeID=None, color="0,0,0", append=False, compact=True, noHeader=False):
      if stypeID is not None:
          doStype = True
diff --git a/makewiggle.py b/makewiggle.py

index 95b0634bc2ec6bf804e955a1926866a9ceb28c5c..7440a796f73038130abb89ef593df425ac2921f6 100755 (executable)
--- a/makewiggle.py
+++ b/makewiggle.py
@@ -2,10 +2,12 @@
  #  makewiggle.py
  #  ENRAGE
  #
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, getConfigFloatOption
  
-print "%prog: version 6.7"
+print "makewiggle: version 6.8"
  
  try:
      import psyco
@@ -20,6 +22,25 @@ def main(argv=None):
  
      usage = "usage: python %s name rdsfile outfilename [options]"
  
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    name = args[0]
+    hitfilename = args[1]
+    outfilename = args[2]
+
+    makewiggle(name, hitfilename, outfilename, options.doNormalize, options.color, options.altColor,
+               options.limitChrom, options.shift, options.doSplit, options.listfilename, options.listPrefix,
+               options.group, options.startPriority, options.skipRandom, options.withMulti,
+               options.withSplices, options.doSingle, options.cachePages, options.enforceChr, options.strand,
+               options.chunk)
+
+
+def getParser(usage):
      parser = optparse.OptionParser(usage=usage)
      parser.add_option("--raw", action="store_false", dest="doNormalize")
      parser.add_option("--color", dest="color")
@@ -39,27 +60,35 @@ def main(argv=None):
      parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
      parser.add_option("--stranded", dest="strand")
      parser.add_option("--maxchunk", type="int", dest="chunk")
-    parser.set_defaults(doNormalize=True, color=None, altColor="", limitChrom=None,
-                        shift=0, doSplit=False, listfilename=None, listPrefix="",
-                        group="", startPriority=0.01, skipRandom=False, withMulti=True,
-                        withSplices=False, doSingle=False, cachePages=-1, enforceChr=False,
-                        strand=None, chunk=20)
-
-    (options, args) = parser.parse_args(argv[1:])
-
-    if len(args) < 3:
-        print usage
-        sys.exit(1)
-
-    name = args[0]
-    hitfilename = args[1]
-    outfilename = args[2]
  
-    makewiggle(name, hitfilename, outfilename, options.doNormalize, options.color, options.altColor,
-               options.limitChrom, options.shift, options.doSplit, options.listfilename, options.listPrefix,
-               options.group, options.startPriority, options.skipRandom, options.withMulti,
-               options.withSplices, options.doSingle, options.cachePages, options.enforceChr, options.strand,
-               options.chunk)
+    configParser = getConfigParser()
+    section = "makewiggle"
+    doNormalize = getConfigBoolOption(configParser, section, "doNormalize", True)
+    color = getConfigOption(configParser, section, "color", None)
+    altColor = getConfigOption(configParser, section, "altColor", "")
+    limitChrom = getConfigOption(configParser, section, "limitChrom", None)
+    shift = getConfigIntOption(configParser, section, "shift", 0)
+    doSplit = getConfigBoolOption(configParser, section, "doSplit", False)
+    listfilename = getConfigOption(configParser, section, "listfilename", None)
+    listPrefix = getConfigOption(configParser, section, "listPrefix", "")
+    group = getConfigOption(configParser, section, "group", "")
+    startPriority = getConfigFloatOption(configParser, section, "startPriority", 0.01)
+    skipRandom = getConfigBoolOption(configParser, section, "skipRandom", False)
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+    withSplices = getConfigBoolOption(configParser, section, "withSplices", False)
+    doSingle = getConfigBoolOption(configParser, section, "doSingle", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
+    enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+    strand = getConfigOption(configParser, section, "strand", None)
+    chunk = getConfigIntOption(configParser, section, "chunk", 20)
+
+    parser.set_defaults(doNormalize=doNormalize, color=color, altColor=altColor, limitChrom=limitChrom,
+                        shift=shift, doSplit=doSplit, listfilename=listfilename, listPrefix=listPrefix,
+                        group=group, startPriority=startPriority, skipRandom=skipRandom, withMulti=withMulti,
+                        withSplices=withSplices, doSingle=doSingle, cachePages=cachePages, enforceChr=enforceChr,
+                        strand=strand, chunk=chunk)
+
+    return parser
  
  
  def makewiggle(name, hitfilename, outfilename, doNormalize=True, color=None, altColor="",
@@ -110,7 +139,7 @@ def makewiggle(name, hitfilename, outfilename, doNormalize=True, color=None, alt
          print "Will shift reads by +/- %d bp according to their sense" % shift
          name += "shift=%d" % shift
      
-    hitRDS = readDataset(hitfilename, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfilename, verbose=True, cache=doCache)
  
      if cachePages > hitRDS.getDefaultCacheSize():
          hitRDS.setDBcache(cachePages)
diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py

index 4d174bf1cd1c35b6a4e6e84c95e86e50bc4f3b95..c677d563e18d3f2544fc3c70caee4148c743bf5f 100644 (file)
--- a/normalizeExpandedExonic.py
+++ b/normalizeExpandedExonic.py
@@ -4,12 +4,14 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
  from cistematic.genomes import Genome
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
  
-print "%prog: version 5.6"
+print "normalizeExpandedExonic: version 5.7"
  
  
  def main(argv=None):
@@ -18,15 +20,7 @@ def main(argv=None):
  
      usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--gidField", type="int", dest="fieldID")
-    parser.add_option("--maxLength", type="float", dest="maxLength")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--models", dest="extendGenome")
-    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
-    parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
-                        replaceModels=False)
-
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(sys.argv) < 6:
@@ -57,6 +51,28 @@ def main(argv=None):
                              options.replaceModels)
  
  
+def makeParser(usage=""):
+    """Build the normalizeExpandedExonic option parser; defaults are fetched through the config helpers."""
+    optionParser = optparse.OptionParser(usage=usage)
+    optionParser.add_option("--gidField", type="int", dest="fieldID")
+    optionParser.add_option("--maxLength", type="float", dest="maxLength")
+    optionParser.add_option("--cache", action="store_true", dest="doCache")
+    optionParser.add_option("--models", dest="extendGenome")
+    optionParser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+
+    # One helper lookup per option; the literal passed last is the value used before config support.
+    cfg = getConfigParser()
+    cfgSection = "normalizeExpandedExonic"
+    optionParser.set_defaults(
+        fieldID=getConfigIntOption(cfg, cfgSection, "fieldID", 0),
+        maxLength=getConfigFloatOption(cfg, cfgSection, "maxLength", 1000000000.),
+        doCache=getConfigBoolOption(cfg, cfgSection, "doCache", False),
+        extendGenome=getConfigOption(cfg, cfgSection, "extendGenome", ""),
+        replaceModels=getConfigBoolOption(cfg, cfgSection, "replaceModels", False))
+
+    return optionParser
+
+
  def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
                              outfilename, candidateLines=[], acceptedfilename="",
                              fieldID=0, maxLength=1000000000., doCache=False,
@@ -88,7 +104,7 @@ def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfil
      if extendGenome != "":
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
-    RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)    
+    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache, reportCount=False)    
      uniqcount = RDS.getUniqsCount()
      print "%d unique reads" % uniqcount
  
diff --git a/normalizeFinalExonic.py b/normalizeFinalExonic.py

index 6053e8077865c2cfbe3142bf8d3231da70d8688f..d5a6b4b954d4a9df6cb7b749c97c513e9394dafb 100755 (executable)
--- a/normalizeFinalExonic.py
+++ b/normalizeFinalExonic.py
@@ -4,10 +4,12 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, getConfigFloatOption
  
-print "%prog: version 3.5" % sys.argv[0]
+print "normalizeFinalExonic: version 3.6"
  
  def main(argv=None):
      if not argv:
@@ -15,15 +17,7 @@ def main(argv=None):
  
      usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--multifraction", action="store_true", dest="reportfraction")
-    parser.add_option("--multifold", action="store_true", dest="reportFold")
-    parser.add_option("--minrpkm", type="float", dest="minThreshold")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--withGID", action="store_true", dest="writeGID")
-    parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
-                        doCache=False, writeGID=False)
-
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -40,6 +34,28 @@ def main(argv=None):
                           options.doCache, options.writeGID)
  
  
+def makeParser(usage=""):
+    """Build the normalizeFinalExonic option parser; defaults are looked up through the config helpers."""
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--multifraction", action="store_true", dest="reportFraction")  # dest matches the default set below
+    parser.add_option("--multifold", action="store_true", dest="reportFold")
+    parser.add_option("--minrpkm", type="float", dest="minThreshold")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--withGID", action="store_true", dest="writeGID")
+
+    configParser = getConfigParser()
+    section = "normalizeFinalExonic"
+    reportFraction = getConfigBoolOption(configParser, section, "multifraction", False)
+    reportFold = getConfigBoolOption(configParser, section, "reportFold", False)
+    minThreshold = getConfigFloatOption(configParser, section, "minThreshold", 0.)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    writeGID = getConfigBoolOption(configParser, section, "writeGID", False)
+
+    parser.set_defaults(reportFraction=reportFraction, reportFold=reportFold, minThreshold=minThreshold,
+                        doCache=doCache, writeGID=writeGID)
+    return parser
+
+
  def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
                           reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
                           writeGID=False):
@@ -53,7 +69,7 @@ def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename,
      elif reportFold:
          print "reporting fold contribution of multireads"
  
-    RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
+    RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
      uniqcount = RDS.getUniqsCount()
      splicecount = RDS.getSplicesCount()
      multicount = RDS.getMultiCount()
diff --git a/partition.py b/partition.py

index 89148fd8a5cfe1b075650e328d4c6e4180cdcada..cbaa64336f35ce02a2bd75e68c53a9be5bcbb1ff 100755 (executable)
--- a/partition.py
+++ b/partition.py
@@ -2,7 +2,7 @@
  #  partition.py
  #  ENRAGE
  #
-""" usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [--minFeature bp] [--padregion bp] [--mergeregion bp] [--nomerge] [--log altlogfile] [--locid] [--ignorerandom] [--chromField fieldNum]
+""" usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]
             where the regionfiles must be comma-separated with no white space
             -minFeature controls the size of the smallest partition
  """
@@ -13,10 +13,12 @@ try:
  except:
      pass
  
-import sys, string, optparse
-from commoncode import getMergedRegions, writeLog
+import sys
+import string
+import optparse
+from commoncode import getMergedRegions, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  
-versionString = '%s: version 2.0' % sys.argv[0]
+versionString = "partition: version 2.1"
  print versionString
  
  
@@ -26,24 +28,7 @@ def main(argv=None):
  
      usage = "usage: python %s mergeID regionfile1[,regionfile2,...] combpartitionfile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--minFeature", type="int", dest="minFeature",
-                      help="size of smallest partition")
-    parser.add_option("--chromField", type="int", dest="cField",
-                      help="num chromosome fields")
-    parser.add_option("--padregion", type="int", dest="padregion",
-                      help="padding on each side of region")
-    parser.add_option("--mergeregion", type="int", dest="mergeregion",
-                      help="bp threshold to merge regions")
-    parser.add_option("--nomerge", action="store_false", dest="merging",
-                      help="do not merge regions")
-    parser.add_option("--log", dest="logfilename",
-                      help="log file")
-    parser.add_option("--locID", action="store_true", dest="locID",
-                      help="use location as region ID")
-    parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
-                      help="ignore 'random' chromosomes")
-    parser.set_defaults(minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log")
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -66,10 +51,51 @@ def main(argv=None):
      if options.ignoreRandom:
          print "ignoring 'random' chromosomes"
  
-    partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField, options.padregion, options.locID, options.ignoreRandom, options.mergeregion, options.merging, options.logfilename)
+    partition(mergeID, regionfiles, outfilename, options.minFeature, options.cField,
+              options.padregion, options.locID, options.ignoreRandom, options.mergeregion,
+              options.merging, options.logfilename)
  
  
-def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0, locID=False, ignoreRandom=False, mergeregion=0, merging=True, logfilename="partition.log"):
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--minFeature", type="int", dest="minFeature",
+                      help="size of smallest partition")
+    parser.add_option("--chromField", type="int", dest="cField",
+                      help="num chromosome fields")
+    parser.add_option("--padregion", type="int", dest="padregion",
+                      help="padding on each side of region")
+    parser.add_option("--mergeregion", type="int", dest="mergeregion",
+                      help="bp threshold to merge regions")
+    parser.add_option("--nomerge", action="store_false", dest="merging",
+                      help="do not merge regions")
+    parser.add_option("--log", dest="logfilename",
+                      help="log file")
+    parser.add_option("--locID", action="store_true", dest="locID",
+                      help="use location as region ID")
+    parser.add_option("--norandom", action="store_true", dest="ignoreRandom",
+                      help="ignore 'random' chromosomes")
+
+    configParser = getConfigParser()
+    section = "partition"
+    minFeature = getConfigIntOption(configParser, section, "minFeature", 25)
+    cField = getConfigIntOption(configParser, section, "cField", 1)
+    padregion = getConfigIntOption(configParser, section, "padregion", 0)
+    locID = getConfigBoolOption(configParser, section, "locID", False)
+    ignoreRandom = getConfigBoolOption(configParser, section, "ignoreRandom", False)
+    mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
+    merging = getConfigBoolOption(configParser, section, "merging", True)
+    logfilename = getConfigOption(configParser, section, "logfilename", "partition.log")
+
+    parser.set_defaults(minFeature=minFeature, cField=cField, padregion=padregion, locID=locID,
+                        ignoreRandom=ignoreRandom, mergeregion=mergeregion, merging=merging,
+                        logfilename=logfilename)
+
+    return parser
+
+
+def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padregion=0,
+              locID=False, ignoreRandom=False, mergeregion=0, merging=True,
+              logfilename="partition.log"):
  
      writeLog(logfilename, versionString, string.join(sys.argv[1:]))
  
@@ -78,7 +104,10 @@ def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padreg
      numRegions = len(regionFileList)
      chromList = []
      for regionID in range(numRegions):
-        allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion,  minHits=-1, fullChrom = True, verbose = True, chromField = cField, doMerge=merging, pad=padregion)
+        allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist = mergeregion,
+                                                    minHits=-1, fullChrom=True, verbose=True, chromField=cField,
+                                                    doMerge=merging, pad=padregion)
+
          for achrom in allregionsDict[regionID]:
              if achrom not in chromList:
                  chromList.append(achrom)
@@ -88,16 +117,16 @@ def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padreg
      chromList = sorted(chromList)
  
      for chrom in chromList:
-        if ignoreRandom and 'random' in chrom:
+        if ignoreRandom and "random" in chrom:
              continue
  
          outregionDict[chrom] = []
          pointList = []
          for regionID in range(numRegions):
              if chrom in allregionsDict[regionID]:
-                for (rstart, rstop, rlength) in allregionsDict[regionID][chrom]:
-                    pointList.append(rstart)
-                    pointList.append(rstop)
+                for region in allregionsDict[regionID][chrom]:
+                    pointList.append(region.start)
+                    pointList.append(region.stop)
  
          pointList.sort()
          start = 0
@@ -106,20 +135,22 @@ def partition(mergeID, regionfiles, outfilename, minFeature=25, cField=1, padreg
                  outregionDict[chrom].append((start, point - 1, point - 1 - start))
                  start = point
  
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
      if locID:
-        outfile.write('#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n')
+        outfile.write("#chrom:start-stop\tchrom\tstart\tstop\tlength_kb\n")
      else:
-        outfile.write('#labelID\tchrom\tstart\tstop\tlength_kb\n')
+        outfile.write("#labelID\tchrom\tstart\tstop\tlength_kb\n")
  
      index = 0
      for chrom in outregionDict:
          for (start, stop, length) in outregionDict[chrom]:
              index += 1
              if locID:
-                outfile.write("%s:%d-%d\t%s\t%d\t%d\t%.3f\n" % (chrom, start, stop, chrom, start, stop, length/1000.))
+                label = "%s:%d-%d" % (chrom, start, stop)
              else:
-                outfile.write("%s%d\t%s\t%d\t%d\t%.3f\n" % (mergeID, index, chrom, start, stop, length/1000.))
+                label = "%s%d" % (mergeID, index)
+
+            outfile.write("%s\t%s\t%d\t%d\t%.3f\n" % (label, chrom, start, stop, length/1000.))
  
      message = "%s was partitioned into %d regions" % (mergeID, index)
      print message
diff --git a/peakstoregion.py b/peakstoregion.py

index 78000f5a2253d6a711cdafaaeac2adff26e5ddee..b7a9ec571c886fe5140b5a8e43a13dfa598d91c0 100755 (executable)
--- a/peakstoregion.py
+++ b/peakstoregion.py
@@ -11,7 +11,7 @@ except:
  
  import sys
  
-print "%s: version 1.0" % sys.argv[0]
+print "peakstoregion: version 1.1"
  
  def main(argv=None):
      if not argv:
@@ -24,26 +24,30 @@ def main(argv=None):
      peakfile = argv[1]
      outfile = argv[2]
  
-    radius = 500
-    chromField = 2
-    posField = 3
-    labelField = 1
-    dataField = -1
-
-    if len(argv) > 3:
+    try:
          radius = int(argv[3])
+    except (IndexError, ValueError):
+        radius = 500
  
-    if len(argv) > 4:
+    try:
          chromField = int(argv[4])
+    except (IndexError, ValueError):
+        chromField = 2
  
-    if len(argv) > 5:
+    try:
          posField = int(argv[5])
+    except (IndexError, ValueError):
+        posField = 3
  
-    if len(argv) > 6:
+    try:
          labelField = int(argv[6])
+    except (IndexError, ValueError):
+        labelField = 1
  
-    if len(argv) > 7:
+    try:
          dataField = int(argv[7])
+    except (IndexError, ValueError):
+        dataField = -1
  
      peakstoregion(peakfile, outfile, radius, chromField, posField, labelField, dataField)
  
diff --git a/plotbardist.py b/plotbardist.py

index 52ccbe294f0b584b07400839ab3b869ea50017f3..8b1432ecd82a65df1c3d2de4ec2b1da5a384f116 100755 (executable)
--- a/plotbardist.py
+++ b/plotbardist.py
@@ -15,9 +15,10 @@ import optparse
  import matplotlib
  from pylab import *
  from math import *
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption
  
  
-print "%prog: version 3.2"
+print "plotbardist: version 3.3"
  
  
  def main(argv=None):
@@ -26,23 +27,7 @@ def main(argv=None):
  
      usage = "usage: python %prog infile1 [infile2] [infile3] [options] outfile.png"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--bins", type="int", dest="bins")
-    parser.add_option("--field", type="int", dest="binnedField")
-    parser.add_option("--binSize", type="float", dest="binLength")
-    parser.add_option("--doLog", type="int", dest="logBase")
-    parser.add_option("--ymax", type="int", dest="maxY")
-    parser.add_option("--xlabel", dest="xLabel")
-    parser.add_option("--ylabel", dest="yLabel")
-    parser.add_option("--binLabels", dest="binLabels", help="comma separated list")
-    parser.add_option("--title", dest="figTitle")
-    parser.add_option("--legend", dest="barsLegend", help="comma separated list")
-    parser.add_option("--xoffset", type="float", dest="pointOffset")
-    parser.add_option("--figsize", dest="figSizes", help="x,y pair")
-    parser.set_defaults(bins=10, binnedField=-1, binLength=-1, logBase=None, maxY=0,
-                        xLabel="bins", yLabel="count", binLabels=None, figTitle="",
-                        barsLegend=None, pointOffset=0., figSizes=None)
-
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
  
@@ -59,6 +44,43 @@ def main(argv=None):
                  options.figTitle, options.barsLegend, options.pointOffset, options.figSizes)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--bins", type="int", dest="bins")
+    parser.add_option("--field", type="int", dest="binnedField")
+    parser.add_option("--binSize", type="float", dest="binLength")
+    parser.add_option("--doLog", type="int", dest="logBase")
+    parser.add_option("--ymax", type="int", dest="maxY")
+    parser.add_option("--xlabel", dest="xLabel")
+    parser.add_option("--ylabel", dest="yLabel")
+    parser.add_option("--binLabels", dest="binLabels", help="comma separated list")
+    parser.add_option("--title", dest="figTitle")
+    parser.add_option("--legend", dest="barsLegend", help="comma separated list")
+    parser.add_option("--xoffset", type="float", dest="pointOffset")
+    parser.add_option("--figsize", dest="figSizes", help="x,y pair")
+
+    configParser = getConfigParser()
+    section = "plotbardist"
+    bins = getConfigIntOption(configParser, section, "bins", 10)
+    binnedField = getConfigIntOption(configParser, section, "binnedField", -1)
+    binLength = getConfigFloatOption(configParser, section, "binLength", -1)
+    logBase = getConfigOption(configParser, section, "logBase", None)
+    maxY = getConfigIntOption(configParser, section, "maxY", 0)
+    xLabel = getConfigOption(configParser, section, "xLabel", "bins")
+    yLabel = getConfigOption(configParser, section, "yLabel", "count")
+    binLabels = getConfigOption(configParser, section, "binLabels", None)
+    figTitle = getConfigOption(configParser, section, "figTitle", "")
+    barsLegend = getConfigOption(configParser, section, "barsLegend", None)
+    pointOffset = getConfigFloatOption(configParser, section, "pointOffset", 0.)
+    figSizes = getConfigOption(configParser, section, "figSizes", None)
+
+    parser.set_defaults(bins=bins, binnedField=binnedField, binLength=binLength, logBase=logBase, maxY=maxY,
+                        xLabel=xLabel, yLabel=yLabel, binLabels=binLabels, figTitle=figTitle,
+                        barsLegend=barsLegend, pointOffset=pointOffset, figSizes=figSizes)
+
+    return parser
+
+
  def plotbardist(fileList, pngfilename, bins=10, binnedField=-1, binLength=-1, logBase=None,
                  maxY=0, xLabel="bins", yLabel="count", binLabels=None, figTitle="",
                  barsLegend=None, pointOffset=0., figSizes=None):
diff --git a/plotnomogram.py b/plotnomogram.py

index 238a4da2b9dddcd902754f561f408a3e83b1c506..573427463e4fb8eb5b9a8ffff958cdd40be4c45d 100755 (executable)
--- a/plotnomogram.py
+++ b/plotnomogram.py
@@ -15,7 +15,7 @@ try:
  except:
      pass
  
-print "%s: version 1.1" % sys.argv[0]
+print "plotnomogram: version 1.2"
  
  
  def main(argv=None):
diff --git a/plotprofile.py b/plotprofile.py

index 854affab825de5248b1f8db6a9b272fe320e3c5d..db41bac618170593248135678fb387aea423dde2 100755 (executable)
--- a/plotprofile.py
+++ b/plotprofile.py
@@ -14,9 +14,10 @@ import optparse
  from pylab import *
  from math import *
  import matplotlib
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  
  
-print "%prog: version 2.2"
+print "plotprofile: version 2.3"
  
  def main(argv=None):
      if not argv:
@@ -24,13 +25,7 @@ def main(argv=None):
  
      usage = "usage: python %s infile outfile.png [--scale] [--max weightMax] [--ymin bottom] [--ymax top] [--subtractEvens]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--scale", action="store_true", dest="doScale")
-    parser.add_option("--max", type="float", dest="weightMax")
-    parser.add_option("--ymin", type="float", dest="ymin")
-    parser.add_option("--ymax", type="float", dest="ymax")
-    parser.add_option("--subtractEvens", action="store_true", dest="subtractEvens")
-    parser.set_defaults(doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -43,6 +38,27 @@ def main(argv=None):
      plotprofile(infile, pngfilename, options.doScale, options.weightMax, options.ymin, options.ymax, options.subtractEvens)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--scale", action="store_true", dest="doScale")
+    parser.add_option("--max", type="float", dest="weightMax")
+    parser.add_option("--ymin", type="float", dest="ymin")
+    parser.add_option("--ymax", type="float", dest="ymax")
+    parser.add_option("--subtractEvens", action="store_true", dest="subtractEvens")
+
+    configParser = getConfigParser()
+    section = "plotprofile"
+    doScale = getConfigBoolOption(configParser, section, "doScale", False)
+    weightMax = getConfigIntOption(configParser, section, "weightMax", -1)
+    ymin = getConfigOption(configParser, section, "ymin", None)
+    ymax = getConfigOption(configParser, section, "ymax", None)
+    subtractEvens = getConfigBoolOption(configParser, section, "subtractEvens", False)
+
+    parser.set_defaults(doScale=doScale, weightMax=weightMax, ymin=ymin, ymax=ymax, subtractEvens=subtractEvens)
+
+    return parser
+
+
  def plotprofile(inFileName, pngfilename, doScale=False, weightMax=-1, ymin=None, ymax=None, subtractEvens=False):
      infile = open(inFileName)
      limitYscale = False
diff --git a/predictSpliceCount.py b/predictSpliceCount.py

index bab85db0af4c9571d16a82338762db7557d48d90..bfd6e1c48b3d4a449ea03d8f0191f0abd63e9d07 100755 (executable)
--- a/predictSpliceCount.py
+++ b/predictSpliceCount.py
@@ -12,7 +12,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print '%s: version 1.1' % argv[0]
+    print "predictSpliceCount: version 1.2"
  
      if len(argv) < 6:
          print 'usage: python %s genome maxBorder uniquecountfile splicecountfile outfile' % argv[0]
diff --git a/profilebins.py b/profilebins.py

index 46274f52c401a7b0a08e2171b4c0147e857b8857..094b09707622847d4d7ab57945ffceb9dbd65675 100755 (executable)
--- a/profilebins.py
+++ b/profilebins.py
@@ -9,8 +9,11 @@ try:
  except:
      pass
  
-import sys, optparse
-print "%prog: version 2.2"
+import sys
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigFloatOption
+
+print "profilebins: version 2.3"
  
  
  def main(argv=None):
@@ -19,17 +22,7 @@ def main(argv=None):
  
      usage = "usage: python %prog label infile1 [--upstream infile2] [--downstream infile3] [--uplength kb] [--downlength kb] [--gene geneName] [--genes genefile] [--append] outfile"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--upstream", dest="upfilename")
-    parser.add_option("--downstream", dest="downfilename")
-    parser.add_option("--uplength", type="float", dest="uplength")
-    parser.add_option("--downlength", type="int", dest="")
-    parser.add_option("--gene", dest="gene")
-    parser.add_option("--genes", dest="genefile")
-    parser.add_option("--append", action="store_true", dest="doAppend")
-    parser.set_defaults(upfilename=None, downfilename=None, uplength=0.0, downlength=0.0,
-                        gene=None, genefile=None, doAppend=False)
-
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -45,6 +38,32 @@ def main(argv=None):
                  options.doAppend)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--upstream", dest="upfilename")
+    parser.add_option("--downstream", dest="downfilename")
+    parser.add_option("--uplength", type="float", dest="uplength")
+    parser.add_option("--downlength", type="float", dest="downlength")
+    parser.add_option("--gene", dest="gene")
+    parser.add_option("--genes", dest="genefile")
+    parser.add_option("--append", action="store_true", dest="doAppend")
+
+    configParser = getConfigParser()
+    section = "profilebins"
+    upfilename = getConfigOption(configParser, section, "upfilename", None)
+    downfilename = getConfigOption(configParser, section, "downfilename", None)
+    uplength = getConfigFloatOption(configParser, section, "uplength", 0.0)
+    downlength = getConfigFloatOption(configParser, section, "downlength", 0.0)
+    gene = getConfigOption(configParser, section, "gene", None)
+    genefile = getConfigOption(configParser, section, "genefile", None)
+    doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
+
+    parser.set_defaults(upfilename=upfilename, downfilename=downfilename, uplength=uplength, downlength=downlength,
+                        gene=gene, genefile=genefile, doAppend=doAppend)
+
+    return parser
+
+
  def profilebins(label, infilename, outfilename, upfilename=None, downfilename=None,
                  uplength=0.0, downlength=0.0, gene=None, genefile=None, doAppend=False):
  
diff --git a/ratio.py b/ratio.py

index ae14cfec081bda22a1f5b06415e14576921036e8..52e1abd3600a8bf358f132fed5f489fb5963abb9 100755 (executable)
--- a/ratio.py
+++ b/ratio.py
@@ -2,8 +2,9 @@ import sys
  import string
  import optparse
  import math
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption
  
-print "%prog: version 2.3"
+print "ratio: version 2.4"
  
  
  def main(argv=None):
@@ -12,10 +13,7 @@ def main(argv=None):
  
      usage = "usage: python %prog denominatorField infile [--only fieldID] [--out outfile]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--only", type="int", dest="onlyField")
-    parser.add_option("--out", dest="outFileName")
-    parser.set_defaults(outFileName=None, onlyField=-1)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -30,6 +28,22 @@ def main(argv=None):
  
      ratio(field, inFileName, options.outFileName, options.onlyField)
  
+
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--only", type="int", dest="onlyField")
+    parser.add_option("--out", dest="outFileName")
+
+    configParser = getConfigParser()
+    section = "ratio"
+    outFileName = getConfigOption(configParser, section, "outfile", None)
+    onlyField = getConfigIntOption(configParser, section, "fieldID", -1)
+
+    parser.set_defaults(outFileName=outFileName, onlyField=onlyField)
+
+    return parser
+
+
  def ratio(field, inFileName, outFileName=None, onlyField=-1):
  
      if inFileName is not None:
diff --git a/rdsmetadata.py b/rdsmetadata.py

index 1ac458b43540963680b8a9f0315c3648c49dc543..fc25dfdd024e56910e6f745df064e0b5bddfcccf 100755 (executable)
--- a/rdsmetadata.py
+++ b/rdsmetadata.py
@@ -10,9 +10,10 @@ except:
  
  import sys
  import optparse
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, getConfigIntOption
  
-print "%prog: version 2.7"
+print "rdsmetadata: version 2.8"
  
  
  def main(argv=None):
@@ -21,19 +22,7 @@ def main(argv=None):
  
      usage = "usage: python %prog rdsfile [propertyName1::propertyValue1] ... [propertyNameN::propertyValueN] [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--defaultcache", type="int", dest="cacheVal")
-    parser.add_option("--index", action="store_true", dest="buildIndex")
-    parser.add_option("--dropindex", action="store_true", dest="dropIndex")
-    parser.add_option("--nocount", action="store_false", dest="doCount")
-    parser.add_option("--complexity", action="store_true", dest="doComplexity")
-    parser.add_option("--reset", action="store_true", dest="resetFlags")
-    parser.add_option("--initrna", action="store_true", dest="rnaDataType")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.set_defaults(cacheVal=0, buildIndex=False, dropIndex=False, doCount=True,
-                        doComplexity=False, resetFlags=False, rnaDataType=False,
-                        cachePages=-1)
-
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 1:
@@ -55,6 +44,35 @@ def main(argv=None):
                  options.resetFlags, options.rnaDataType, options.cachePages)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--defaultcache", type="int", dest="cacheVal")
+    parser.add_option("--index", action="store_true", dest="buildIndex")
+    parser.add_option("--dropindex", action="store_true", dest="dropIndex")
+    parser.add_option("--nocount", action="store_false", dest="doCount")
+    parser.add_option("--complexity", action="store_true", dest="doComplexity")
+    parser.add_option("--reset", action="store_true", dest="resetFlags")
+    parser.add_option("--initrna", action="store_true", dest="rnaDataType")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "rdsmetadata"
+    cacheVal = getConfigIntOption(configParser, section, "cacheVal", 0)
+    buildIndex = getConfigBoolOption(configParser, section, "buildIndex", False)
+    dropIndex = getConfigBoolOption(configParser, section, "dropIndex", False)
+    doCount = getConfigBoolOption(configParser, section, "doCount", True)
+    doComplexity = getConfigBoolOption(configParser, section, "doComplexity", False)
+    resetFlags = getConfigBoolOption(configParser, section, "resetFlags", False)
+    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
+
+    parser.set_defaults(cacheVal=cacheVal, buildIndex=buildIndex, dropIndex=dropIndex, doCount=doCount,
+                        doComplexity=doComplexity, resetFlags=resetFlags, rnaDataType=rnaDataType,
+                        cachePages=cachePages)
+
+    return parser
+
+
  def rdsmetadata(datafile, propertyList=[], cacheVal=0, buildIndex=False,
                  dropIndex=False, doCount=True, doComplexity=False, resetFlags=False,
                  rnaDataType=False, cachePages=-1):
@@ -64,9 +82,9 @@ def rdsmetadata(datafile, propertyList=[], cacheVal=0, buildIndex=False,
          doCache = True
  
      if rnaDataType:
-        rds = readDataset(datafile, initialize=True, datasetType="RNA", verbose=True, cache=doCache)
+        rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType="RNA", verbose=True, cache=doCache)
      else:
-        rds = readDataset(datafile, verbose=True, reportCount=doCount, cache=doCache)
+        rds = ReadDataset.ReadDataset(datafile, verbose=True, reportCount=doCount, cache=doCache)
  
      if cachePages > rds.getDefaultCacheSize():
          rds.setDBcache(cachePages)
diff --git a/regionBins.py b/regionBins.py

index 2d1649bd6614066656a0ef72a75aa7a90a4ffffc..d6c858de23c670a8526459bf41ba25dc807ac78d 100755 (executable)
--- a/regionBins.py
+++ b/regionBins.py
@@ -10,14 +10,15 @@ except:
      print 'psyco not running'
  
  import sys
+import ReadDataset
+from commoncode import getMergedRegions, computeRegionBins
+
  print '%s: version 2.0' % sys.argv[0]
  
  if len(sys.argv) < 4:
      print 'usage: python %s regionfile rdsfile outfilename [-bins numbins] [-field fieldNum] [-raw] [-padregion bp] [-mergeregion bp] [-cache]' % sys.argv[0]
      sys.exit(1)
  
-from commoncode import *
-
  regionfilename = sys.argv[1]
  hitfile =  sys.argv[2]
  outfilename = sys.argv[3]
@@ -55,7 +56,7 @@ if '-bins' in sys.argv:
      binfield = sys.argv.index('-bins') + 1
      bins = int(sys.argv[binfield])
  
-hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
+hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
  readlen = hitRDS.getReadSize()
  normalizationFactor = 1.0
  if normalize:
@@ -65,7 +66,7 @@ if normalize:
  chromList = hitRDS.getChromosomes(fullChrom=False)
  chromList.sort()
  
-regionDict = getMergedRegions(regionfilename, maxDist = mergeregion, keepLabel = True, verbose = True, chromField = cField, pad=padregion)
+regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, keepLabel=True, verbose=True, chromField=cField, pad=padregion)
  
  hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  
@@ -76,6 +77,7 @@ for regionID in regionsBins:
      tagCount = 0.
      for binAmount in regionsBins[regionID]:
          tagCount += binAmount
+
      outfile.write('%s\t%s\t%.1f\t%d' % (regionID, regionID, tagCount, Len[gid]))
      for binAmount in gidBins[gid]:
              if normalizeBins:
diff --git a/regionCounts.py b/regionCounts.py

index 0104cc2f4bf97bb64797a1b6ce418a8b7e1b4528..ae005cb3146b686fd3934cd9a152c331900ebde4 100755 (executable)
--- a/regionCounts.py
+++ b/regionCounts.py
@@ -9,10 +9,13 @@ try:
  except:
      print 'psyco not running'
  
-import sys, string, optparse
-from commoncode import readDataset, getMergedRegions, findPeak, writeLog
+import sys
+import string
+import optparse
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
  
-versionString = "%prog: version 3.9"
+versionString = "regionCounts: version 3.10"
  print versionString
  
  def main(argv=None):
@@ -21,6 +24,25 @@ def main(argv=None):
  
      usage = "usage: python %prog regionfile rdsfile outfilename [options]"
  
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    regionfilename = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+                 options.useFullchrom, options.normalize, options.padregion,
+                 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
+                 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
+                 options.doRPKM, options.doLength, options.forceRegion)
+
+
+def getParser(usage):
      parser = optparse.OptionParser(usage=usage)
      parser.add_option("--markRDS", action="store_true", dest="flagRDS")
      parser.add_option("--chromField", type="int", dest="cField")
@@ -38,27 +60,33 @@ def main(argv=None):
      parser.add_option("--rpkm", action="store_true", dest="doRPKM")
      parser.add_option("--length", action="store_true", dest="doLength")
      parser.add_option("--force", action="store_true", dest="forceRegion")
-    parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True,
-                        padregion=0, mergeregion=0, merging=True, doUniqs=True,
-                        doMulti=True, doSplices=False, usePeak=False, cachePages=-1,
-                        logfilename="regionCounts.log", doRPKM=False, doLength=False,
-                        forceRegion=False)
-
-    (options, args) = parser.parse_args(argv[1:])
  
-    if len(args) < 3:
-        print usage
-        sys.exit(1)
-
-    regionfilename = args[0]
-    hitfile =  args[1]
-    outfilename = args[2]
-
-    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
-                 options.useFullchrom, options.normalize, options.padregion,
-                 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
-                 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
-                 options.doRPKM, options.doLength, options.forceRegion)
+    configParser = getConfigParser()
+    section = "regionCounts"
+    flagRDS = getConfigBoolOption(configParser, section, "flagRDS", False)
+    cField = getConfigIntOption(configParser, section, "cField", 1)
+    useFullchrom = getConfigBoolOption(configParser, section, "useFullchrom", False)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    padregion = getConfigIntOption(configParser, section, "padregion", 0)
+    mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
+    merging = getConfigBoolOption(configParser, section, "merging", True)
+    doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+    doMulti = getConfigBoolOption(configParser, section, "doMulti", True)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
+    logfilename = getConfigOption(configParser, section, "logfilename", "regionCounts.log")
+    doRPKM = getConfigBoolOption(configParser, section, "doRPKM", False)
+    doLength = getConfigBoolOption(configParser, section, "doLength", False)
+    forceRegion = getConfigBoolOption(configParser, section, "forceRegion", False)
+
+    parser.set_defaults(flagRDS=flagRDS, cField=cField, useFullchrom=useFullchrom, normalize=normalize,
+                        padregion=padregion, mergeregion=mergeregion, merging=merging, doUniqs=doUniqs,
+                        doMulti=doMulti, doSplices=doSplices, usePeak=usePeak, cachePages=cachePages,
+                        logfilename=logfilename, doRPKM=doRPKM, doLength=doLength,
+                        forceRegion=forceRegion)
+
+    return parser
  
  
  def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
@@ -91,7 +119,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
      labeltoRegionDict = {}
      regionCount = {}
  
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
      readlen = hitRDS.getReadSize()
      if cachePages > hitRDS.getDefaultCacheSize():
          hitRDS.setDBcache(cachePages)
@@ -112,10 +140,10 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
      for rchrom in regionDict:
          if forceRegion and rchrom not in chromList:
              print rchrom
-            for (label, start, stop, length) in regionDict[rchrom]:
-                regionCount[label] = 0
-                labelList.append(label)
-                labeltoRegionDict[label] = (rchrom, start, stop)
+            for region in regionDict[rchrom]:
+                regionCount[region.label] = 0
+                labelList.append(region.label)
+                labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
  
      for rchrom in chromList:
          regionList = []
@@ -133,25 +161,21 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
              rindex = 0
              dictLen = len(readDict[fullchrom])
  
-        for (label, start, stop, length) in regionDict[rchrom]:
+        for region in regionDict[rchrom]:
+            label = region.label
+            start = region.start
+            stop = region.stop
              regionCount[label] = 0
              labelList.append(label)
              labeltoRegionDict[label] = (rchrom, start, stop)
-
-        if useFullchrom:
-            fullchrom = rchrom
-        else:
-            fullchrom = "chr%s" % rchrom
-
-        for (label, rstart, rstop, length) in regionDict[rchrom]:
-            regionList.append((label, fullchrom, rstart, rstop))
+            regionList.append((label, fullchrom, start, stop))
              if usePeak:
                  readList = []
                  for localIndex in xrange(rindex, dictLen):
                      read = readDict[fullchrom][localIndex]
-                    if read[0] < rstart:
+                    if read["start"] < start:
                          rindex += 1
-                    elif rstart <= read[0] <= rstop:
+                    elif start <= read["start"] <= stop:
                          readList.append(read)
                      else:
                          break
@@ -160,16 +184,16 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
                      continue
  
                  readList.sort()
-                (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True)
+                peak = findPeak(readList, start, stop - start, readlen, doWeight=True)
                  try:
-                    topValue = smoothArray[topPos[0]]
+                    topValue = peak.smoothArray[peak.topPos[0]]
                  except:
-                    print "problem with %s %s" % (str(topPos), str(smoothArray))
+                    print "problem with %s %s" % (str(peak.topPos), str(peak.smoothArray))
                      continue
  
                  regionCount[label] += topValue
              else:
-                regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
  
          if flagRDS:
              hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
diff --git a/regionintersects.py b/regionintersects.py

index 340d2f88f0cda1177cf0b8c772d5e808c8e39f94..49a0e328af4dc8bf09bf3c7dc3a511e56b4e4a38 100755 (executable)
--- a/regionintersects.py
+++ b/regionintersects.py
@@ -8,10 +8,12 @@ try:
  except:
      pass
  
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, findPeak
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
  
-print "%prog: version 3.0"
+print "regionintersects: version 3.1"
  
  def main(argv=None):
      if not argv:
@@ -19,16 +21,7 @@ def main(argv=None):
  
      usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--reject1", dest="rejectOneName")
-    parser.add_option("--reject2", dest="rejectTwoName")
-    parser.add_option("--union", action="store_true", dest="trackReject")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--raw", action="store_false", dest="normalize")
-    parser.add_option("--verbose", action="store_true", dest="doVerbose")
-    parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False,
-                        doCache=False, normalize=True, doVerbose=False)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 5:
@@ -47,6 +40,31 @@ def main(argv=None):
                       options.doVerbose)
  
  
+def getParser(usage):
+    """Return the optparse parser for the regionintersects command line.
+
+    The option defaults are not hard-coded: each one is fetched from the
+    "regionintersects" section of the parser returned by getConfigParser().
+    The last argument of every getConfig*Option call is presumably the value
+    used when the setting is missing (helpers come from commoncode; confirm
+    there).
+
+    usage: usage string handed to optparse.OptionParser.
+    """
+    cmdParser = optparse.OptionParser(usage=usage)
+    optionTable = (
+        ("--reject1", {"dest": "rejectOneName"}),
+        ("--reject2", {"dest": "rejectTwoName"}),
+        ("--union", {"action": "store_true", "dest": "trackReject"}),
+        ("--cache", {"action": "store_true", "dest": "doCache"}),
+        ("--raw", {"action": "store_false", "dest": "normalize"}),
+        ("--verbose", {"action": "store_true", "dest": "doVerbose"}),
+    )
+    for optionString, keywords in optionTable:
+        cmdParser.add_option(optionString, **keywords)
+
+    config = getConfigParser()
+    section = "regionintersects"
+    defaults = {}
+    defaults["rejectOneName"] = getConfigOption(config, section, "rejectOneName", None)
+    defaults["rejectTwoName"] = getConfigOption(config, section, "rejectTwoName", None)
+    defaults["trackReject"] = getConfigBoolOption(config, section, "trackReject", False)
+    defaults["doCache"] = getConfigBoolOption(config, section, "doCache", False)
+    defaults["normalize"] = getConfigBoolOption(config, section, "normalize", True)
+    defaults["doVerbose"] = getConfigBoolOption(config, section, "doVerbose", False)
+    cmdParser.set_defaults(**defaults)
+
+    return cmdParser
+
+
  def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
                       outfilename, rejectOneName=None, rejectTwoName=None,
                       trackReject=False, doCache=False, normalize=True, doVerbose=False):
@@ -69,8 +87,8 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
      oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
      twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
  
-    oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache) 
-    twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache)
+    oneRDS = ReadDataset.ReadDataset(readOneName, verbose=doVerbose, cache=doCache) 
+    twoRDS = ReadDataset.ReadDataset(readTwoName, verbose=doVerbose, cache=doCache)
  
      if normalize:
          normalize1 = len(oneRDS) / 1000000.
@@ -88,36 +106,37 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
  
      numRegionsOne = 0
      numRegionsTwo = 0
+    commonChromosomeList = set(oneDict.keys())
      for rchrom in oneDict:
          numRegionsOne += len(oneDict[rchrom])
  
      for rchrom in twoDict:
+        commonChromosomeList.add(rchrom)
          numRegionsTwo += len(twoDict[rchrom])
  
      outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
  
-    for rchrom in oneDict:
-        if rchrom not in twoDict:
-            continue
-
-        print rchrom
+    for chromosome in commonChromosomeList:
+        print chromosome
          rindex = 0
          rindex2 = 0
-        fullchrom = "chr" + rchrom
+        fullchrom = "chr%s" % chromosome
          oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
          dictLen1 = len(oneReads[fullchrom])
          twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
          dictLen2 = len(twoReads[fullchrom])
-        chrom = rchrom
-        onePeaksDict[chrom] = []
-        oneFoundDict[chrom] = []
-        for (start, stop, length) in oneDict[chrom]:
+        onePeaksDict[chromosome] = []
+        oneFoundDict[chromosome] = []
+        for region in oneDict[chromosome]:
+            start = region.start
+            stop = region.stop
+            length = region.length
              readList = []
              for localIndex in xrange(rindex, dictLen1):
                  read = oneReads[fullchrom][localIndex]
-                if read[0] < start:
+                if read["start"] < start:
                      rindex += 1
-                elif start <= read[0] <= stop:
+                elif start <= read["start"] <= stop:
                      readList.append(read)
                  else:
                      break
@@ -127,17 +146,20 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
  
              readList.sort()
  
-            (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True)
-            onePeakScore = smoothArray[topPos[0]]
-            onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1))
+            peak = findPeak(readList, start, length, doWeight=True)
+            onePeakScore = peak.smoothArray[peak.topPos[0]]
+            onePeaksDict[chromosome].append((peak.topPos[0] + start, length/2, start, stop, peak.numHits/normalize1, onePeakScore/normalize1))
  
-        for (start, stop, length) in twoDict[chrom]:
+        for region in twoDict[chromosome]:
+            start = region.start
+            stop = region.stop
+            length = region.length
              readList2 = []
              for localIndex in xrange(rindex2, dictLen2):
                  read = twoReads[fullchrom][localIndex]
-                if read[0] < start:
+                if read["start"] < start:
                      rindex2 += 1
-                elif start <= read[0] <= stop:
+                elif start <= read["start"] <= stop:
                      readList2.append(read)
                  else:
                      break
@@ -146,45 +168,46 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
                  continue
  
              readList2.sort()
-            (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True)
+            peak2 = findPeak(readList2, start, length, doWeight=True)
+            numHits = peak2.numHits
              numHits /= normalize2
              twoIsCommon = False
-            twoPeak = topPos[0] + start
+            twoPeak = peak2.topPos[0] + start
              twoRadius = length/2
-            twoPeakScore = smoothArray[topPos[0]] / normalize2
-            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
+            twoPeakScore = peak2.smoothArray[peak2.topPos[0]] / normalize2
+            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
                  if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
                      if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict:
-                        oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits))
+                        oneFoundDict[chromosome].append((onePeak, oneRadius, ostart, ostop, ohits))
  
                      twoIsCommon = True
                      commonRegions += 1
-                    outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore)
+                    outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chromosome, ostart, ostop, ohits, opeakScore, chromosome, start, stop, numHits, twoPeakScore)
                      if doVerbose:
                          print outline
  
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
  
              if trackReject and not twoIsCommon:
                  twoRejectIndex += 1
-                outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore)
+                outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chromosome, start, stop, numHits, twoPeakScore)
                  if doReject:
-                    rejectTwo.write(outline + "\n")
+                    print >> rejectTwo, outline
                  else:
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
  
                  if doVerbose:
                      print outline
  
          if trackReject:
-            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
-                if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
+            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
+                if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chromosome]:
                      oneRejectIndex += 1
-                    outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore)
+                    outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chromosome, ostart, ostop, ohits, opeakScore)
                      if doReject:
-                        rejectOne.write(outline + "\n")
+                        print >> rejectOne, outline
                      else:
-                        outfile.write(outline + "\n")
+                        print >> outfile, outline
  
                      if doVerbose:
                          print outline
diff --git a/regiontobed.py b/regiontobed.py

index e6ce22a5e170f7a8ae515e349415b6ec2bb630b4..96624114a48545214d7ca3e2de17e960af9ee096 100755 (executable)
--- a/regiontobed.py
+++ b/regiontobed.py
@@ -12,9 +12,12 @@ try:
  except:
      pass
  
-import sys, math, optparse
+import sys
+import math
+import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
  
-print "%prog: version 3.1"
+print "regiontobed: version 3.2"
  
  
  def usage():
@@ -27,15 +30,7 @@ def main(argv=None):
  
      usage = __doc__
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--color", dest="color")
-    parser.add_option("--score", type="int", dest="scoreField")
-    parser.add_option("--narrowPeak", action="store_true", dest="doNarrow")
-    parser.add_option("--broadPeak", action="store_true", dest="doBroad")
-    parser.add_option("--itemRgb", action="store_true", dest="itemRGB")
-    parser.add_option("--nolabel", action="store_true", dest="noLabel")
-    parser.set_defaults(color="0,0,0", scoreField=None, doNarrow=False,
-                        doBroad=False, itemRGB=False, noLabel=False)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -51,6 +46,30 @@ def main(argv=None):
                  options.itemRGB, options.noLabel)
  
  
+def getParser(usage):
+    """Return the optparse parser for the regiontobed command line.
+
+    Defaults for every option are looked up in the "regiontobed" section of
+    the parser returned by getConfigParser(); the last argument of each
+    getConfig*Option call is presumably the fallback when the setting is
+    absent (helpers come from commoncode; confirm there).
+
+    NOTE(review): scoreField is fetched with getConfigOption although
+    --score is declared type="int".  optparse runs string defaults through
+    the option's type checker, so a string handed back by the config helper
+    should still reach callers as an int - confirm against commoncode.
+
+    usage: usage string handed to optparse.OptionParser.
+    """
+    cmdParser = optparse.OptionParser(usage=usage)
+    optionTable = (
+        ("--color", {"dest": "color"}),
+        ("--score", {"type": "int", "dest": "scoreField"}),
+        ("--narrowPeak", {"action": "store_true", "dest": "doNarrow"}),
+        ("--broadPeak", {"action": "store_true", "dest": "doBroad"}),
+        ("--itemRgb", {"action": "store_true", "dest": "itemRGB"}),
+        ("--nolabel", {"action": "store_true", "dest": "noLabel"}),
+    )
+    for optionString, keywords in optionTable:
+        cmdParser.add_option(optionString, **keywords)
+
+    config = getConfigParser()
+    section = "regiontobed"
+    defaults = {}
+    defaults["color"] = getConfigOption(config, section, "color", "0,0,0")
+    defaults["scoreField"] = getConfigOption(config, section, "scoreField", None)
+    defaults["doNarrow"] = getConfigBoolOption(config, section, "doNarrow", False)
+    defaults["doBroad"] = getConfigBoolOption(config, section, "doBroad", False)
+    defaults["itemRGB"] = getConfigBoolOption(config, section, "itemRGB", False)
+    defaults["noLabel"] = getConfigBoolOption(config, section, "noLabel", False)
+    cmdParser.set_defaults(**defaults)
+
+    return cmdParser
+
+
  def regiontobed(factorlabel, regionFileName, outFileName, color="0,0,0",
                  scoreField=None, doNarrow=False, doBroad=False, itemRGB=False,
                  noLabel=False):
diff --git a/rnaEditing.py b/rnaEditing.py

index 30de5a33da307441c684ebaa41aa5f3152962f89..ce9f4ea4f81c121e174433bbca134954a451f021 100644 (file)
--- a/rnaEditing.py
+++ b/rnaEditing.py
@@ -1,11 +1,7 @@
-"""
-Based on shell script provided by Ali.
-"""
-
  import sys
  import optparse
-from Erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
-from Erange.commoncode import countDuplicatesInList
+from erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
+from erange.commoncode import countDuplicatesInList, getConfigParser, getConfigOption
  
  
  def main(argv=None):
@@ -14,14 +10,7 @@ def main(argv=None):
  
      usage = "usage: python %prog dbfile snpsfile genome rpkmfile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--goprefix", dest="prefix")
-    parser.add_option("--novelsnp", dest="novelsnpoutfilename")
-    parser.add_option("--bedfile", dest="bedoutfilename")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--snpDB", action="append", dest="snpDBList",
-                      help="additional snp db files to check will be searched in order given")
-    parser.set_defaults(prefix=None, novelsnpoutfilename=None, bedoutfilename=None, cachePages=None, snpDBList=[])
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -33,6 +22,32 @@ def main(argv=None):
      genome = args[2]
      rpkmfilename = args[3]
  
+    rnaEditing(dbfile, hitfile, genome, rpkmfilename, options)
+
+
+def getParser(usage):
+    """Return the optparse parser for the rnaEditing command line.
+
+    Defaults for prefix, novelsnpoutfilename, bedoutfilename and cachePages
+    are looked up in the "rnaEditing" section of the parser returned by
+    getConfigParser(); snpDBList always starts as a new empty list.  The last
+    argument of each getConfigOption call is presumably the fallback when the
+    setting is absent (helper comes from commoncode; confirm there).
+
+    usage: usage string handed to optparse.OptionParser.
+    """
+    cmdParser = optparse.OptionParser(usage=usage)
+    optionTable = (
+        ("--goprefix", {"dest": "prefix"}),
+        ("--novelsnp", {"dest": "novelsnpoutfilename"}),
+        ("--bedfile", {"dest": "bedoutfilename"}),
+        ("--cache", {"type": "int", "dest": "cachePages"}),
+        ("--snpDB", {"action": "append", "dest": "snpDBList",
+                     "help": "additional snp db files to check will be searched in order given"}),
+    )
+    for optionString, keywords in optionTable:
+        cmdParser.add_option(optionString, **keywords)
+
+    config = getConfigParser()
+    section = "rnaEditing"
+    defaults = {}
+    defaults["prefix"] = getConfigOption(config, section, "prefix", None)
+    defaults["novelsnpoutfilename"] = getConfigOption(config, section, "novelsnpoutfilename", None)
+    defaults["bedoutfilename"] = getConfigOption(config, section, "bedoutfilename", None)
+    defaults["cachePages"] = getConfigOption(config, section, "cachePages", None)
+    defaults["snpDBList"] = []
+    cmdParser.set_defaults(**defaults)
+
+    return cmdParser
+
+
+def rnaEditing(dbfile, hitfile, genome, rpkmfilename, options):
      if options.cachePages is not None:
          doCache = True
      else:
diff --git a/rnafarPairs.py b/rnafarPairs.py

index d1baebd3e7e6e61278c4c141a84fe2396e323f28..4d70c4965ef77c2eea4f646e58526db83682ffa6 100755 (executable)
--- a/rnafarPairs.py
+++ b/rnafarPairs.py
@@ -13,26 +13,21 @@ try:
  except:
      pass
  
-import sys, time, optparse
-from commoncode import readDataset
-from cistematic.core.geneinfo import geneinfoDB
-from cistematic.genomes import Genome
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "%prog: version 3.6"
+    print "rnafarPairs: version 3.7"
      usage = "usage: python %prog genome goodfile rdsfile outfile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--verbose", action="store_true", dest="doVerbose",
-                      help="verbose output")
-    parser.add_option("--cache", action="store_true", dest="doCache",
-                      help="use cache")
-    parser.add_option("--maxDist", type="int", dest="maxDist",
-                      help="maximum distance")
-    parser.set_defaults(doVerbose=False, doCache=False, maxDist=500000)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 4:
@@ -45,7 +40,27 @@ def main(argv=None):
      outfilename = args[3]
  
      rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
-    
+
+
+def makeParser(usage=""):
+    """Return the optparse parser for the rnafarPairs command line.
+
+    Defaults for doVerbose, doCache and maxDist are looked up in the
+    "rnafarPairs" section of the parser returned by getConfigParser(); the
+    last argument of each getConfig*Option call is presumably the fallback
+    when the setting is absent (helpers come from commoncode; confirm there).
+
+    usage: usage string handed to optparse.OptionParser (empty by default).
+    """
+    cmdParser = optparse.OptionParser(usage=usage)
+    optionTable = (
+        ("--verbose", {"action": "store_true", "dest": "doVerbose", "help": "verbose output"}),
+        ("--cache", {"action": "store_true", "dest": "doCache", "help": "use cache"}),
+        ("--maxDist", {"type": "int", "dest": "maxDist", "help": "maximum distance"}),
+    )
+    for optionString, keywords in optionTable:
+        cmdParser.add_option(optionString, **keywords)
+
+    config = getConfigParser()
+    section = "rnafarPairs"
+    defaults = {}
+    defaults["doVerbose"] = getConfigBoolOption(config, section, "doVerbose", False)
+    defaults["doCache"] = getConfigBoolOption(config, section, "doCache", False)
+    defaults["maxDist"] = getConfigIntOption(config, section, "maxDist", 500000)
+    cmdParser.set_defaults(**defaults)
+
+    return cmdParser
+
  
  def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
      goodDict = {}
@@ -54,33 +69,25 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
          fields = line.split()
          goodDict[fields[0]] = line
  
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
-    rdsChromList = RDS.getChromosomes()
-
+    goodfile.close()
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
+    chromosomeList = RDS.getChromosomes()
      if doVerbose:
          print time.ctime()
  
      distinct = 0
      total = 0
      outfile = open(outfilename,"w")
-
-    idb = geneinfoDB()
-    if genome == "dmelanogaster":
-        geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
-    else:
-        geneinfoDict = idb.getallGeneInfo(genome)
-
-    hg = Genome(genome)
-    geneannotDict = hg.allAnnotInfo()
-
+    geneinfoDict = getGeneInfoDict(genome)
+    geneannotDict = getGeneAnnotDict(genome)
      assigned = {}
      farConnected = {}
-    for achrom in rdsChromList:
-        if achrom == "chrM":
+    for chromosome in chromosomeList:
+        if doNotProcessChromosome(chromosome):
              continue
  
-        print achrom
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
+        print chromosome
+        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
          if doVerbose:
              print len(uniqDict), time.ctime()    
  
@@ -88,82 +95,124 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
              readList = uniqDict[readID]
              if len(readList) == 2:
                  total += 1
-                (start1, flag1, pair1) = readList[0]
-                (start2, flag2, pair2) = readList[1]
-
-                if flag1 != flag2:
-                    dist = abs(start1 - start2)
-                    if flag1 != "NM" and flag2 != "NM" and dist < maxDist:
-                        geneID = ""
-                        saw1 = False
-                        saw2 = False
-                        if flag1 in goodDict:
-                            geneID = flag2
-                            farFlag = flag1
-                            saw1 = True
-
-                        if flag2 in goodDict:
-                            geneID = flag1
-                            farFlag = flag2
-                            saw2 = True
-
-                        if saw1 or saw2:
-                            total += 1
-
-                        if saw1 and saw2:
-                            if flag1 < flag2:
-                                geneID = flag1
-                                farFlag = flag2
-                            else:
-                                geneID = flag2
-                                farFlag = flag1
-
-                            if geneID in farConnected:
-                                farConnected[geneID].append(farFlag)
-                            else:
-                                farConnected[geneID] = [farFlag]
-                        elif geneID != "":
-                            try:
-                                if genome == "dmelanogaster":
-                                    symbol = geneinfoDict["Dmel_" + geneID][0][0]
-                                else:
-                                    symbol = geneinfoDict[geneID][0][0]
-                            except:
-                                try:
-                                    symbol = geneannotDict[(genome, geneID)][0]
-                                except:
-                                    symbol = "LOC" + geneID
-
-                            symbol = symbol.strip()
-                            symbol = symbol.replace(" ","|")
-                            symbol = symbol.replace("\t","|")
-                            if farFlag not in assigned:
-                                assigned[farFlag] = (symbol, geneID)
-                                print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
-                                outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
-                                distinct += 1
-
-    farIndex = 0
+                if processReads(readList[:2], maxDist):
+                    flags = (readList[0]["flag"], readList[1]["flag"])
+                    processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+                    total += processed
+                    distinct += distinctPairs
+
+    entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
+    distinct += entriesWritten
+    outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
+    outfile.close()
+    print "distinct: %d\ttotal: %d" % (distinct, total)
+    print time.ctime()
+
+
+def doNotProcessChromosome(chromosome):
+    """Return True when chromosome equals "chrM", False otherwise."""
+    skippedName = "chrM"
+    return chromosome == skippedName
+
+
+def processReads(reads, maxDist):
+    """Tell whether the first two entries of reads qualify for processing.
+
+    reads:   indexable holding at least two mappings, each providing the
+             keys "start" and "flag".
+    maxDist: exclusive upper bound on the absolute difference between the
+             two "start" values.
+
+    Returns True only when the two flags differ, neither flag is "NM" and
+    the starts are less than maxDist apart; False in every other case.
+    """
+    firstRead = reads[0]
+    secondRead = reads[1]
+    separation = abs(firstRead["start"] - secondRead["start"])
+    firstFlag = firstRead["flag"]
+    secondFlag = secondRead["flag"]
+
+    if firstFlag == secondFlag:
+        return False
+
+    if firstFlag == "NM" or secondFlag == "NM":
+        return False
+
+    return separation < maxDist
+
+
+def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+    """Handle one pair of read flags, writing an output line when warranted.
+
+    flags:         two-item sequence holding the flags of the two reads.
+    goodDict:      maps a flag to the text written for it; membership here is
+                   what makes a flag "good".
+    genome:        genome name; "dmelanogaster" switches the geneInfoDict key
+                   to "Dmel_<geneID>".
+    geneInfoDict:  first place the gene symbol is looked up ([key][0][0]).
+    geneAnnotDict: second place, keyed by (genome, geneID), item [0].
+    outfile:       object with a write() method that receives the new line.
+    assigned:      updated in place; maps a flag to (symbol, geneID).
+    farConnected:  updated in place; maps a flag to a list of linked flags.
+
+    Returns (total, distinct).  total is 1 when exactly one of the two flags
+    is in goodDict and 0 otherwise; distinct is 1 when a line was written
+    (the good flag had no entry in assigned yet) and 0 otherwise.
+    """
+    firstFlag, secondFlag = flags
+    firstIsGood = firstFlag in goodDict
+    secondIsGood = secondFlag in goodDict
+
+    if firstIsGood and secondIsGood:
+        # Both flags are good: only remember the link, keyed by the smaller flag.
+        if firstFlag < secondFlag:
+            geneID, farFlag = firstFlag, secondFlag
+        else:
+            geneID, farFlag = secondFlag, firstFlag
+
+        farConnected.setdefault(geneID, []).append(farFlag)
+        return 0, 0
+
+    if not (firstIsGood or secondIsGood):
+        return 0, 0
+
+    # Exactly one flag is good; the other one is used as the gene ID.
+    if secondIsGood:
+        farFlag, geneID = secondFlag, firstFlag
+    else:
+        farFlag, geneID = firstFlag, secondFlag
+
+    if genome == "dmelanogaster":
+        infoKey = "Dmel_%s" % geneID
+    else:
+        infoKey = geneID
+
+    try:
+        symbol = geneInfoDict[infoKey][0][0]
+    except (KeyError, IndexError):
+        try:
+            symbol = geneAnnotDict[(genome, geneID)][0]
+        except (KeyError, IndexError):
+            symbol = "LOC%s" % geneID
+
+    # Spaces and tabs inside the symbol become "|" in the output.
+    symbol = symbol.strip().replace(" ","|").replace("\t","|")
+
+    distinct = 0
+    if farFlag not in assigned:
+        assigned[farFlag] = (symbol, geneID)
+        goodLine = goodDict[farFlag]
+        print "%s %s %s" % (symbol, geneID, goodLine.strip())
+        outfile.write("%s %s %s" % (symbol, geneID, goodLine))
+        distinct = 1
+
+    return 1, distinct
+
+
+def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
+    """Write the entries still unassigned and return the pair-stage count.
+
+    Calls writeUnassignedPairsToFile first, then passes the first value it
+    returned on to writeUnassignedGoodReadsToFile as the starting index.
+    Only the second value returned by writeUnassignedPairsToFile is handed
+    back to the caller; whatever writeUnassignedGoodReadsToFile returns is
+    discarded.
+    """
+    farIndex, pairsWritten = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
+    writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile)
+
+    return pairsWritten
+
+
+def writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile):
+    total = 0
+    written = 0
      for farFlag in farConnected:
          geneID = ""
          symbol = ""
          idList = [farFlag] + farConnected[farFlag]
-        for oneID in idList:
-            if oneID in assigned:
-                (symbol, geneID) = assigned[oneID]
+        for ID in idList:
+            if ID in assigned:
+                (symbol, geneID) = assigned[ID]
  
          if geneID == "":
-            farIndex += 1
-            symbol = "FAR%d" % farIndex
-            geneID = -1 * farIndex
+            total += 1
+            symbol = "FAR%d" % total
+            geneID = -1 * total
  
-        for oneID in idList:
-            if oneID not in assigned:
-                print "%s %s %s" % (symbol, geneID, goodDict[oneID].strip())
-                outfile.write("%s %s %s" % (symbol, geneID, goodDict[oneID]))
-                distinct += 1
-                assigned[oneID] = (symbol, geneID)
+        for ID in idList:
+            if ID not in assigned:
+                print "%s %s %s" % (symbol, geneID, goodDict[ID].strip())
+                outfile.write("%s %s %s" % (symbol, geneID, goodDict[ID]))
+                written += 1
+                assigned[ID] = (symbol, geneID)
  
+    return total, written
+
+
+def writeUnassignedGoodReadsToFile(farIndex, goodDict, assigned, outfile):
      for farFlag in goodDict:
          if farFlag not in assigned:
              farIndex += 1
@@ -171,10 +220,6 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
              print line.strip()
              outfile.write(line)
  
-    outfile.write("#distinct: %d\ttotal: %d\n" % (distinct, total))
-    outfile.close()
-    print "distinct: %d\ttotal: %d" % (distinct, total)
-    print time.ctime()
  
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file
diff --git a/scatterfields.py b/scatterfields.py

index 60649ff5eb803f9b989e500592c7c499a4abbf41..32d623c277b02dbc2fc57d93a7ab896c914a60e4 100755 (executable)
--- a/scatterfields.py
+++ b/scatterfields.py
@@ -17,10 +17,11 @@ from pylab import *
  import math, cmath
  import sys
  import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
  
  alphaVal = 0.5
  
-print "%prog: version 3.1"
+print "scatterfields: version 3.2"
  
  def main(argv=None):
      if not argv:
@@ -28,28 +29,7 @@ def main(argv=None):
  
      usage = __doc__
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--xmin", type="float", dest="forcexmin")
-    parser.add_option("--ymin", type="float", dest="forceymin")
-    parser.add_option("--xmax", type="float", dest="forcexmax")
-    parser.add_option("--ymax", type="float", dest="forceymax")
-    parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
-    parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
-    parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
-    parser.add_option("--order", type="int", dest="fitOrder")
-    parser.add_option("--base", type="int", dest="base")
-    parser.add_option("--markGenes", dest="markFile")
-    parser.add_option("--markfold", type="float", dest="foldChange")
-    parser.add_option("--noregression", action="store_false", dest="doRegression")
-    parser.add_option("--large", action="store_true", dest="plotLarge")
-    parser.add_option("--markdiag", action="store_true", dest="markDiag")
-    parser.add_option("--title", type="int", dest="figtitle")
-    parser.add_option("--verbose", action="store_true", dest="verbose")
-    parser.set_defaults(forcexmin=0.0, forceymin=0.0, forcexmax=-1, forceymax=-1, doLogF1=False,
-                        doLogF2=False, doArcsinh=False, fitOrder=1, base=10, markFile=None,
-                        foldChange=None, doRegression=True, plotLarge=False, markDiag=False,
-                        figtitle="", verbose=False)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 6:
@@ -70,6 +50,68 @@ def main(argv=None):
                    options.markDiag, options.figtitle, options.verbose)
  
  
+def getParser(usage):
+    """Build the optparse parser for the scatterfields command line.
+
+    Each option's default is looked up in the "scatterfields" section of the
+    object returned by getConfigParser().  The last argument of every
+    getConfig*Option call is presumably the fallback used when the setting is
+    missing -- the helpers are imported from commoncode, confirm there.
+
+    usage: usage text passed to optparse.OptionParser.
+
+    Returns the configured optparse.OptionParser.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--xmin", type="float", dest="forcexmin")
+    parser.add_option("--ymin", type="float", dest="forceymin")
+    parser.add_option("--xmax", type="float", dest="forcexmax")
+    parser.add_option("--ymax", type="float", dest="forceymax")
+    parser.add_option("--doLogF1", action="store_true", dest="doLogF1")
+    parser.add_option("--doLogF2", action="store_true", dest="doLogF2")
+    parser.add_option("--arcsinh", action="store_true", dest="doArcsinh")
+    parser.add_option("--order", type="int", dest="fitOrder")
+    parser.add_option("--base", type="int", dest="base")
+    parser.add_option("--markGenes", dest="markFile")
+    parser.add_option("--markfold", type="float", dest="foldChange")
+    parser.add_option("--noregression", action="store_false", dest="doRegression")
+    parser.add_option("--large", action="store_true", dest="plotLarge")
+    parser.add_option("--markdiag", action="store_true", dest="markDiag")
+    # The title is free text (its fallback below is ""), so it must not be declared type="int".
+    parser.add_option("--title", dest="figtitle")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+
+    configParser = getConfigParser()
+    section = "scatterfields"
+    forcexmin = getConfigFloatOption(configParser, section, "forcexmin", 0.0)
+    forceymin = getConfigFloatOption(configParser, section, "forceymin", 0.0)
+    # --xmax/--ymax are float options, so their configured defaults are read as floats too.
+    forcexmax = getConfigFloatOption(configParser, section, "forcexmax", -1)
+    forceymax = getConfigFloatOption(configParser, section, "forceymax", -1)
+    doLogF1 = getConfigBoolOption(configParser, section, "doLogF1", False)
+    doLogF2 = getConfigBoolOption(configParser, section, "doLogF2", False)
+    doArcsinh = getConfigBoolOption(configParser, section, "doArcsinh", False)
+    fitOrder = getConfigIntOption(configParser, section, "fitOrder", 1)
+    base = getConfigIntOption(configParser, section, "base", 10)
+    markFile = getConfigOption(configParser, section, "markFile", None)
+    foldChange = getConfigOption(configParser, section, "foldChange", None)
+    if foldChange is not None:
+        # --markfold is a float option; convert a configured value so both sources agree.
+        foldChange = float(foldChange)
+    doRegression = getConfigBoolOption(configParser, section, "doRegression", True)
+    plotLarge = getConfigBoolOption(configParser, section, "plotLarge", False)
+    markDiag = getConfigBoolOption(configParser, section, "markDiag", False)
+    figtitle = getConfigOption(configParser, section, "figtitle", "")
+    verbose = getConfigBoolOption(configParser, section, "verbose", False)
+
+    parser.set_defaults(forcexmin=forcexmin, forceymin=forceymin, forcexmax=forcexmax, forceymax=forceymax, doLogF1=doLogF1,
+                        doLogF2=doLogF2, doArcsinh=doArcsinh, fitOrder=fitOrder, base=base, markFile=markFile,
+                        foldChange=foldChange, doRegression=doRegression, plotLarge=plotLarge, markDiag=markDiag,
+                        figtitle=figtitle, verbose=verbose)
+
+    return parser
+
+
  def scatterfields(infilename, xaxis, xField, yaxis, yField, outfilename, forcexmin=0.0, forceymin=0.0,
                    forcexmax=-1, forceymax=-1, doLogF1=False, doLogF2=False, doArcsinh=False, fitOrder=1,
                    base=10, markFile=None, foldChange=None, doRegression=True, plotLarge=False,
diff --git a/siteintersects.py b/siteintersects.py

index ba0f1cd679378e16fc52585fb3e93bf7b40ea739..4826a13bbb31824ffea9b187d383f42b35bbf682 100755 (executable)
--- a/siteintersects.py
+++ b/siteintersects.py
@@ -5,7 +5,7 @@
  
  import sys
  
-print "%s: version 2.0" % sys.argv[0]
+print "siteintersects: version 2.1"
  
  
  def main(argv=None):
diff --git a/stallCategory.py b/stallCategory.py

index 92cd5199c63be59aa2c8b2e39c6847a9ba307c23..5c6145cf46f475836b1b77e32405d219e9feea22 100755 (executable)
--- a/stallCategory.py
+++ b/stallCategory.py
@@ -11,8 +11,9 @@ except:
  
  import sys
  import optparse
+from commoncode import getConfigParser, getConfigOption, getConfigFloatOption
  
-print "%prog: version 1.1"
+print "stallCategory: version 1.2"
  
  
  def main(argv=None):
@@ -21,11 +22,7 @@ def main(argv=None):
  
      usage = "usage: python %prog stalledPercentFile1 stalledPercentFile2 transcriptFile [--out oufile] [--statout statoutfile] [--expression level]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--out", dest="outFileName")
-    parser.add_option("--statout", dest="statOutFileName")
-    parser.add_option("--expression", type="float", dest="expressionLevel")
-    parser.set_defaults(outFileName=None, statOutFileName=None, expressionLevel=0.9)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -39,6 +36,31 @@ def main(argv=None):
      stallCategory(infile1, infile2, transcriptFile, options.outFileName, options.statOutFileName, options.expressionLevel)
  
  
+def makeParser(usage=""):
+    """Build the optparse parser for the stallCategory command line.
+
+    The defaults of --out, --statout and --expression are looked up in the
+    "stallCategory" section of the object returned by getConfigParser().
+
+    Returns the configured optparse.OptionParser.
+    """
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--out", dest="outFileName")
+    parser.add_option("--statout", dest="statOutFileName")
+    parser.add_option("--expression", type="float", dest="expressionLevel")
+
+    configParser = getConfigParser()
+    section = "stallCategory"
+    defaults = {
+        "outFileName": getConfigOption(configParser, section, "outFileName", None),
+        "statOutFileName": getConfigOption(configParser, section, "statOutFileName", None),
+        "expressionLevel": getConfigFloatOption(configParser, section, "expressionLevel", 0.9),
+    }
+    parser.set_defaults(**defaults)
+
+    return parser
+
+
  def stallCategory(inFile1Name, inFile2Name, transcriptFileName, outFileName=None, statOutFileName=None, expressionLevel=0.9):
  
      infile1 = open(inFile1Name)
diff --git a/test/testAnalyzeGO.py b/test/testAnalyzeGO.py

index dacf4a2ad63c6841602ff2cc88cadf49cb126fbb..c9178a9e3db0d16d83c0238727920789f7e3d2d0 100644 (file)
--- a/test/testAnalyzeGO.py
+++ b/test/testAnalyzeGO.py
@@ -5,7 +5,7 @@ Created on Aug 26, 2010
  '''
  import unittest
  import os
-from Erange import analyzego
+from erange import analyzego
  
  
  class TestAnalyzeGO(unittest.TestCase):
diff --git a/test/testChksnp.py b/test/testChksnp.py

index 3f2e8ae95cf25a092f0bab78cb21b0026afb44ca..b41fe659421377de86e576a7644abf9e8c5971b6 100644 (file)
--- a/test/testChksnp.py
+++ b/test/testChksnp.py
@@ -6,7 +6,7 @@ Created on Aug 25, 2010
  import unittest
  import string
  import os
-from Erange import chksnp
+from erange import chksnp
  
  dbPath = "/Users/sau/work/snpdb/hg18"
  
diff --git a/test/testCommoncode.py b/test/testCommoncode.py

index 1ea4f803c2a7dcd7357dff35995d19ca357728a9..43eb96bc2a3153e7cbf0bb5719fa13320195512c 100644 (file)
--- a/test/testCommoncode.py
+++ b/test/testCommoncode.py
@@ -7,7 +7,7 @@ import unittest
  import os
  import string
  from array import array
-from Erange import commoncode
+from erange import commoncode
  from cistematic.genomes import Genome
  
  
@@ -277,7 +277,7 @@ class TestCommoncode(unittest.TestCase):
          result = ([], 0.0, array("f"), 0.0)
          self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
  
-        hitList= [[4, "+", 0.5]]
+        hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
          result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
          self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
          result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
@@ -287,11 +287,11 @@ class TestCommoncode(unittest.TestCase):
          result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
          self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
  
-        hitList= [[4, "+", 0.5]]
+        hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
          result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
          self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
  
-        hitList= [[4, "+", 0.5]]
+        hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
          result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
          self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
          result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
@@ -300,12 +300,26 @@ class TestCommoncode(unittest.TestCase):
  
      #TODO: write test
      def testGetBestShiftForRegion(self):
-        hitList = [[14, "-", 1.0], [16, "-", 1.0], [24, "+", 1.0],  [26, "+", 10.0]]
+        hitList = [{"start": 14, "sense": "-", "weight": 1.0},
+                   {"start": 16, "sense": "-", "weight": 1.0},
+                   {"start": 24, "sense": "+", "weight": 1.0},
+                   {"start": 26, "sense": "+", "weight": 10.0}
+        ]
          self.assertEquals(74, commoncode.getBestShiftForRegion(hitList, 0, 100))
          self.assertEquals(16, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=30))
          self.assertEquals(0, commoncode.getBestShiftForRegion(hitList, 0, 100, maxShift=10))
  
  
+    #TODO: write test
+    def testFindPeakSequenceArray(self):
+        pass
+
+
+    #TODO: write test
+    def testGetPeakPositionList(self):
+        pass
+
+
      #TODO: write test
      def testGetFeaturesByChromDict(self):
          firstFeatures = {"I": (4123, 4219, "Y74C9A.3", "R", "3UTR"),
@@ -396,42 +410,42 @@ class TestCommoncode(unittest.TestCase):
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[1, "+", 1.0]]}
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
          result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
                                "2": [("regionID2", 1, 1000, 1000, "F")]
          }
-        hitDict = {"1": [[1, "+", 1.0]],
-                   "2": [[1, "+", 1.0]]
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+                   "2": [{"start": 1, "sense": "+", "weight": 1.0}]
          }
          result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 80, "sense": "+", "weight": 0.5}]}
          result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [15, "+", 1.0]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 80, "sense": "+", "weight": 0.5}, {"start": 15, "sense": "+", "weight": 1.0}]}
          result = ({"regionID": [2.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [80, "+", 0.5], [200, "+", 2.0]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 80, "sense": "+", "weight": 0.5}, {"start": 200, "sense": "+", "weight": 2.0}]}
          result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[1, "+", 1.0]]}
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
          regionList = ["regionID"]
          result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[1, "+", 1.0]]}
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
          regionList = ["empty region"]
          result = ({"empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
@@ -439,8 +453,8 @@ class TestCommoncode(unittest.TestCase):
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
                                "2": [("regionID2", 1, 1000, 1000, "F")]
          }
-        hitDict = {"1": [[1, "+", 1.0]],
-                   "2": [[1, "+", 1.0]]
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+                   "2": [{"start": 1, "sense": "+", "weight": 1.0}]
          }
          regionList = ["regionID", "regionID2"]
          result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
@@ -449,8 +463,8 @@ class TestCommoncode(unittest.TestCase):
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
                                "2": [("regionID2", 1, 1000, 1000, "F")]
          }
-        hitDict = {"1": [[1, "+", 1.0]],
-                   "2": [[1, "+", 1.0]]
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+                   "2": [{"start": 1, "sense": "+", "weight": 1.0}]
          }
          regionList = ["empty region", "regionID2"]
          result = ({"regionID2": [1.0, 0.0, 0.0, 0.0], "empty region": [0.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
@@ -459,73 +473,73 @@ class TestCommoncode(unittest.TestCase):
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")],
                                "2": [("regionID2", 1, 1000, 1000, "F")]
          }
-        hitDict = {"1": [[1, "+", 1.0]],
-                   "2": [[1, "+", 1.0]]
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}],
+                   "2": [{"start": 1, "sense": "+", "weight": 1.0}]
          }
          regionList = ["regionID2"]
          result = ({"regionID2": [1.0, 0.0, 0.0, 0.0]}, {"regionID2": 1000, "regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[1, "+", 1.0]]}
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
          result = ({"regionID": [2.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, normalizedTag=2.0))
  
          regionsByChromDict = {"1": [(1, 100, "regionID", 100, "F")]}
-        hitDict = {"1": [[1, "+", 1.0]]}
+        hitDict = {"1": [{"start": 1, "sense": "+", "weight": 1.0}]}
          result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, defaultRegionFormat=False))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}]}
          fixedFirstBin = 20
          result = ({"regionID": [1.0, 0.0, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}]}
          fixedFirstBin = 5
          result = ({"regionID": [0.0, 1.0, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
          fixedFirstBin = 20
          result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[80, "+", 1.0], [85, "+", 0.5]]}
+        hitDict = {"1": [{"start": 80, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
          fixedFirstBin = 5
          result = ({"regionID": [0.0, 1.5, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, fixedFirstBin=fixedFirstBin))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
          binLength = 25
          result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
          binLength = 50
          result = ({"regionID": [1.0, 0.5, 0.0, 0.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [85, "+", 0.5]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 85, "sense": "+", "weight": 0.5}]}
          binLength = 15
          result = ({"regionID": [1.0, 0.0, 0.0, 0.5]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "F")]}
-        hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 40, "sense": "+", "weight": 0.7}, {"start": 85, "sense": "+", "weight": 0.5}]}
          binLength = 15
          result = ({"regionID": [1.0, 0.0, 0.7, 0.5]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen, binLength=binLength))
  
          regionsByChromDict = {"1": [("regionID", 1, 100, 100, "R")]}
-        hitDict = {"1": [[10, "+", 1.0], [40, "+", 0.7], [85, "+", 0.5]]}
+        hitDict = {"1": [{"start": 10, "sense": "+", "weight": 1.0}, {"start": 40, "sense": "+", "weight": 0.7}, {"start": 85, "sense": "+", "weight": 0.5}]}
          result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
          self.assertEquals(result, commoncode.computeRegionBins(regionsByChromDict, hitDict, bins, readlen))
          result = ({"regionID": [0.5, 0.0, 0.7, 1.0]}, {"regionID": 100})
diff --git a/test/testErange.py b/test/testErange.py

index d9392ff2fe435a1f87821bd5928e00943f6f6787..4455fff7211ff686832b81cb80ed6cc7adcfba03 100644 (file)
--- a/test/testErange.py
+++ b/test/testErange.py
@@ -14,6 +14,7 @@ import testAnalyzeGO
  import testChksnp
  import testCommoncode
  import testGeneMrnaCounts
+import testGeneMrnaCountsWeighted
  #import testGetFasta
  import testGetNovelSNPs
  import testGetSNPGeneInfo
@@ -24,13 +25,14 @@ import testmakebedfromrds
  import testMakeRdsFromBam
  import testMakeSNPTrack
  import testMarkLinkers
+import testPeak
  import testPeaksToRegion
  import testProcessVelvet
  import testReadDataset
  import testRnaAToIFilter
  import testRnaEditing
  import testRNAPATH
-import testTranscripts
+#import testTranscripts
  
  
  def main(argv=None):
@@ -42,6 +44,7 @@ def main(argv=None):
      suite.addTest(testChksnp.suite())
      suite.addTest(testCommoncode.suite())
      suite.addTest(testGeneMrnaCounts.suite())
+    suite.addTest(testGeneMrnaCountsWeighted.suite())
      #suite.addTest(testGetFasta.suite())
      suite.addTest(testGetNovelSNPs.suite())
      suite.addTest(testGetSNPGeneInfo.suite())
@@ -52,6 +55,7 @@ def main(argv=None):
      suite.addTest(testMakeRdsFromBam.suite())
      suite.addTest(testMakeSNPTrack.suite())
      suite.addTest(testMarkLinkers.suite())
+    suite.addTest(testPeak.suite())
      suite.addTest(testPeaksToRegion.suite())
      suite.addTest(testProcessVelvet.suite())
      suite.addTest(testReadDataset.suite())
diff --git a/test/testGeneMrnaCounts.py b/test/testGeneMrnaCounts.py

index 62f1649d8cb27be429c8b56e41c9c4a5a674eeba..4f2ea083dbfbc3034a72afcaff99896dddfe8cce 100644 (file)
--- a/test/testGeneMrnaCounts.py
+++ b/test/testGeneMrnaCounts.py
@@ -12,10 +12,10 @@ Located feature 728439 by:
  '''
  import unittest
  import os
-from Erange import geneMrnaCounts
+from erange import geneMrnaCounts
  from cistematic.core.geneinfo import geneinfoDB
  from cistematic.genomes import Genome
-from Erange.commoncode import readDataset
+from erange import ReadDataset
  
  
  class TestGeneMrnaCounts(unittest.TestCase):
@@ -25,7 +25,7 @@ class TestGeneMrnaCounts(unittest.TestCase):
      outfilename = "testGeneMrnaCounts.txt"
  
      def setUp(self):
-        self.rds = readDataset(self.testDBName, initialize=True, datasetType="RNA", verbose=False)
+        self.rds = ReadDataset.ReadDataset(self.testDBName, initialize=True, datasetType="RNA", verbose=False)
  
  
      def tearDown(self):
@@ -66,7 +66,7 @@ class TestGeneMrnaCounts(unittest.TestCase):
  
          outfile.close()
          os.remove(self.outfilename)
-        reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+        reads = self.rds.getReadsDict(withFlag=True)
          self.assertEquals("728439", reads["1"][0]["flag"])
  
          geneMrnaCounts.geneMrnaCounts(self.genomeName, self.testDBName, self.outfilename,
@@ -80,7 +80,7 @@ class TestGeneMrnaCounts(unittest.TestCase):
  
          outfile.close()
          os.remove(self.outfilename)
-        reads = self.rds.getReadsDict(withFlag=True, entryDict=True)
+        reads = self.rds.getReadsDict(withFlag=True)
          self.assertEquals("728439", reads["1"][0]["flag"])
  
  
diff --git a/test/testGeneMrnaCountsWeighted.py b/test/testGeneMrnaCountsWeighted.py

new file mode 100644 (file)

index 0000000..108a3b8
--- /dev/null
+++ b/test/testGeneMrnaCountsWeighted.py
@@ -0,0 +1,101 @@
+'''
+Created on Oct 20, 2010
+
+@author: sau
+'''
+import unittest
+from erange import geneMrnaCountsWeighted
+
+
+class TestGeneMrnaCountsWeighted(unittest.TestCase):
+    # Tests for the helper functions in erange.geneMrnaCountsWeighted.
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    #TODO: write test
+    def testMain(self):
+        pass
+
+
+    #TODO: write test
+    def testGeneMrnaCountsWeighted(self):
+        pass
+
+
+    def testDoNotProcessChromosome(self):
+        # True for an empty list or one without the chromosome, False once it is listed.
+        skip = geneMrnaCountsWeighted.doNotProcessChromosome
+        self.assertTrue(skip("chr1", []))
+        self.assertFalse(skip("chr1", ["chr1"]))
+        self.assertTrue(skip("chr1", ["chr2"]))
+
+
+    #TODO: write test
+    def testGetReadGIDs(self):
+        pass
+
+
+    def testGetGeneSymbolEmptyDicts(self):
+        # With both lookup dicts empty, "FARGene" is returned as is and "Gene" becomes "LOCGene".
+        getSymbol = geneMrnaCountsWeighted.getGeneSymbol
+        infoDict, annotDict = {}, {}
+        self.assertEquals("FARGene", getSymbol("FARGene", "hsapien", infoDict, annotDict))
+        self.assertEquals("LOCGene", getSymbol("Gene", "hsapien", infoDict, annotDict))
+
+
+    def testGetGeneSymbolFromInfoDict(self):
+        # The same geneinfo entry yields its first name for "hsapien" and its second for "celegans".
+        getSymbol = geneMrnaCountsWeighted.getGeneSymbol
+        infoDict = {"Gene": [("GeneName", "AltGeneName")]}
+        annotDict = {}
+        self.assertEquals("GeneName", getSymbol("Gene", "hsapien", infoDict, annotDict))
+        self.assertEquals("AltGeneName", getSymbol("Gene", "celegans", infoDict, annotDict))
+
+
+    def testGetGeneSymbolFromAnnotDict(self):
+        # The annotation is keyed by ("hsapien", "Gene"); the "celegans" lookup expects "LOCGene".
+        getSymbol = geneMrnaCountsWeighted.getGeneSymbol
+        infoDict = {}
+        annotDict = {("hsapien", "Gene"): ["GeneName", "AltGeneName"]}
+        self.assertEquals("GeneName", getSymbol("Gene", "hsapien", infoDict, annotDict))
+        self.assertEquals("LOCGene", getSymbol("Gene", "celegans", infoDict, annotDict))
+
+
+    #TODO: write test
+    def testWriteCountsToFile(self):
+        pass
+
+
+    def testGetTagCount(self):
+        # Expected counts: 0.0 when the gene's read entry is empty, then 0.5 / 1.0 for the fixtures below.
+        getCount = geneMrnaCountsWeighted.getTagCount
+        self.assertEquals(0.0, getCount({}, "gene", {"gene": ""}, {}))
+
+        uniqueCounts = {"gene": 1, "related1": 1, "related2": 1}
+        readsByGid = {"gene": ["read1"]}
+        gidsByRead = {"read1": ["related1", "related2"]}
+        self.assertEquals(0.5, getCount(uniqueCounts, "gene", readsByGid, gidsByRead))
+        self.assertEquals(0.5, getCount({}, "gene", readsByGid, gidsByRead))
+
+        uniqueCounts["gene"] = 2
+        self.assertEquals(1.0, getCount(uniqueCounts, "gene", readsByGid, gidsByRead))
+
+
+def suite():
+    # Collect every test method of TestGeneMrnaCountsWeighted into one suite.
+    testSuite = unittest.TestSuite()
+    testSuite.addTest(unittest.makeSuite(TestGeneMrnaCountsWeighted))
+
+    return testSuite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
+\ No newline at end of file
diff --git a/test/testGetFasta.py b/test/testGetFasta.py

index cb71685b0089c13deda115c290734ffa28de7c28..9ea5408ba70336c04a3f729dc0cbbe2f51c3acff 100644 (file)
--- a/test/testGetFasta.py
+++ b/test/testGetFasta.py
@@ -5,9 +5,9 @@ Created on Aug 27, 2010
  '''
  import unittest
  import os
-from Erange import getfasta
-#from Erange import ReadDataset
-from Erange.commoncode import readDataset
+from erange import getfasta
+#from erange import ReadDataset
+from erange import ReadDataset
  
  testDBName = "testRDS.rds"
  
@@ -133,7 +133,7 @@ class TestGetFasta(unittest.TestCase):
      # need to check to see if the issue might be with commoncode.findPeak as there is a lot of questionable
      # logic in that one
      def testGetRegionUsingRDS(self):
-        rds = readDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
+        rds = ReadDataset.ReadDataset(testDBName, initialize=True, datasetType="DNA", verbose=False)
          rds.insertMetadata([("readsize", "100")])
          rdsEntryList = [("testRead", "chr1", 10, 100, "+", 1.0, "", "")]
          rds.insertUniqs(rdsEntryList)
diff --git a/test/testGetSNPGeneInfo.py b/test/testGetSNPGeneInfo.py

index ed33674422474a554067cf2950ee021c7708d5f8..5053d33a1b95ce47dd5379c39bcd415ff93b577e 100644 (file)
--- a/test/testGetSNPGeneInfo.py
+++ b/test/testGetSNPGeneInfo.py
@@ -4,7 +4,7 @@ Created on Aug 26, 2010
  @author: sau
  '''
  import unittest
-from Erange import getSNPGeneInfo
+from erange import getSNPGeneInfo
  
  
  class TestGetSNPGeneInfo(unittest.TestCase):
diff --git a/test/testGetSNPs.py b/test/testGetSNPs.py

index 68ef8c06ce26cdcd0595637382b7cc581f77461e..81b4d2cad69c6455adef42f6481fa207cde95509 100644 (file)
--- a/test/testGetSNPs.py
+++ b/test/testGetSNPs.py
@@ -4,14 +4,14 @@ Created on Jun 4, 2010
  @author: sau
  '''
  import os, unittest
-from Erange.commoncode import readDataset
-from Erange import getSNPs
+from erange import ReadDataset
+from erange import getSNPs
  
  
  class TestGetSNPs(unittest.TestCase):
  
      def setUp(self):
-        self.rdsDNA = readDataset("testDNARDSForUnitTests.rds", True, "DNA", verbose=True)
+        self.rdsDNA = ReadDataset.ReadDataset("testDNARDSForUnitTests.rds", True, "DNA", verbose=True)
  
          uniqueInsertList = [("uniqueID1", "chr1", 10, 20, "+", 1.0, "", ""),
                              ("uniqueID2", "chr1", 100, 200, "+", 1.0, "", ""),
@@ -26,6 +26,7 @@ class TestGetSNPs(unittest.TestCase):
  
          self.rdsDNA.insertUniqs(uniqueInsertList)
          self.rdsDNA.insertMulti(multiInsertList)
+        self.rdsDNA.insertMetadata([("readsize", 100)])
  
  
      def tearDown(self):
diff --git a/test/testMakeBamFromRds.py b/test/testMakeBamFromRds.py

index 8c0df53ca4532c08b71c90a5d24f3c0fdc72bf45..a6878802fe7ae344e26c275fd47ad26928b49316 100644 (file)
--- a/test/testMakeBamFromRds.py
+++ b/test/testMakeBamFromRds.py
@@ -4,7 +4,7 @@ Created on Jun 4, 2010
  @author: sau
  '''
  import unittest
-from Erange import MakeBamFromRds
+from erange import MakeBamFromRds
  
  
  class TestMakeBamFromRds(unittest.TestCase):
@@ -18,6 +18,17 @@ class TestMakeBamFromRds(unittest.TestCase):
          pass
  
  
+    #TODO: finish test
+    def testFixSpliceSense(self):
+        seq = "NNNGTAGNNN"
+        startRight = 7
+        stopLeft = 3
+        sense, count = MakeBamFromRds.fixSpliceSense(seq, startRight, stopLeft)
+        self.assertEquals("+", sense)
+        self.assertEquals(0, count)
+
+
+    #TODO: finish test
      def testGetMismatches(self):
          mismatchString = "3A10T"
          self.assertEqual(mismatchString, MakeBamFromRds.getMismatches("A3G, T10A"))
diff --git a/test/testMakeGraphs.py b/test/testMakeGraphs.py

index 567e5d258fb6c8fbe5dcf16384891366648b06f3..dba92388ebef735dbe17978d90b92eb3b79087aa 100644 (file)
--- a/test/testMakeGraphs.py
+++ b/test/testMakeGraphs.py
@@ -5,7 +5,7 @@ Created on Jul 28, 2010
  '''
  
  import os, unittest
-from Erange import makeGraphs
+from erange import makeGraphs
  
  testFileName = "/tmp/testEdgeFileForUnitTests.txt"
  
diff --git a/test/testMakeRdsFromBam.py b/test/testMakeRdsFromBam.py

index 4b4da1a34487a0be992f044eee3a0b47e8fc2158..34f28d0da911ead3578a196dab2aeae799a4d67c 100644 (file)
--- a/test/testMakeRdsFromBam.py
+++ b/test/testMakeRdsFromBam.py
@@ -4,18 +4,10 @@ Created on Jun 10, 2010
  @author: sau
  '''
  import unittest
-from Erange import MakeRdsFromBam
+from erange import MakeRdsFromBam
  
-class TestMakeRdsFromBam(unittest.TestCase):
-
-
-    def testGetSpliceBounds(self):
-        start, startR, stopL, stopR = MakeRdsFromBam.getSpliceBounds(0, 10, [(1,2), (3,6), (1,2)])
  
-        self.assertEqual(start, 0, "incorrect start position for 262")
-        self.assertEqual(startR, 8, "incorrect right start position for 262")
-        self.assertEqual(stopL, 2, "incorrect left stop position for 262")
-        self.assertEqual(stopR, 10, "incorrect right stop position for 262")
+class TestMakeRdsFromBam(unittest.TestCase):
  
  
      def testGetMismatches(self):
@@ -47,6 +39,47 @@ class TestMakeRdsFromBam(unittest.TestCase):
          self.assertEquals(resultString, MakeRdsFromBam.getMismatches("badMismatchTagData", querySequence, "+"))
  
  
+    #TODO: write test
+    def testGetPairedReadNumberSuffix(self):
+        pass
+
+
+    #TODO: write test
+    def testGetParser(self):
+        pass
+
+
+    #TODO: write test
+    def testGetRDSEntry(self):
+        pass
+
+
+    #TODO: write test
+    def testGetRDSSpliceEntry(self):
+        pass
+
+
+    def testGetReadSense(self):
+        reverse = False
+        self.assertEqual("+", MakeRdsFromBam.getReadSense(reverse))
+        reverse = True
+        self.assertEqual("-", MakeRdsFromBam.getReadSense(reverse))
+
+
+    def testGetSpliceBounds(self):
+        start, startR, stopL, stopR = MakeRdsFromBam.getSpliceBounds(0, 10, [(1,2), (3,6), (1,2)])
+
+        self.assertEqual(start, 0, "incorrect start position for 262")
+        self.assertEqual(startR, 8, "incorrect right start position for 262")
+        self.assertEqual(stopL, 2, "incorrect left stop position for 262")
+        self.assertEqual(stopR, 10, "incorrect right stop position for 262")
+
+
+    #TODO: write test
+    def testIsPairedRead(self):
+        pass
+
+
      def testIsSpliceEntry(self):
          self.assertTrue(MakeRdsFromBam.isSpliceEntry([(1,6), (3, 4), (1, 2)]))
          self.assertFalse(MakeRdsFromBam.isSpliceEntry([(1,6), (2, 4), (1, 2)]))
@@ -54,6 +87,16 @@ class TestMakeRdsFromBam(unittest.TestCase):
          self.assertFalse(MakeRdsFromBam.isSpliceEntry(""))
  
  
+    #TODO: write test
+    def testMain(self):
+        pass
+
+
+    #TODO: write test
+    def testMakeRDSFromBAM(self):
+        pass
+
+
  def suite():
      suite = unittest.TestSuite()
      suite.addTest(unittest.makeSuite(TestMakeRdsFromBam))
diff --git a/test/testMakeSNPTrack.py b/test/testMakeSNPTrack.py

index b52b5461698e9b404717a83ea651752d00e66726..d8c5c05cbff24851fecd00c0382b8a8f168b18db 100644 (file)
--- a/test/testMakeSNPTrack.py
+++ b/test/testMakeSNPTrack.py
@@ -4,7 +4,7 @@ Created on Aug 25, 2010
  @author: sau
  '''
  import unittest
-from Erange import makeSNPtrack
+from erange import makeSNPtrack
  
  
  class TestMakeSNPTrack(unittest.TestCase):
diff --git a/test/testMarkLinkers.py b/test/testMarkLinkers.py

index 775b2e0b845661d2690cb4ca72245af4635fe65e..742a7a8f310351d8d7a1deeb8acd41259e808580 100644 (file)
--- a/test/testMarkLinkers.py
+++ b/test/testMarkLinkers.py
@@ -5,7 +5,7 @@ Created on Sep 15, 2010
  '''
  import unittest
  import os
-from Erange.chiapet import markLinkers
+from erange.chiapet import markLinkers
  
  
  class TestMarkLinkers(unittest.TestCase):
diff --git a/test/testPeak.py b/test/testPeak.py

new file mode 100644 (file)

index 0000000..cba620e
--- /dev/null
+++ b/test/testPeak.py
@@ -0,0 +1,52 @@
+'''
+Created on Oct 29, 2010
+
+@author: sau
+'''
+import unittest
+from array import array
+from erange import Peak
+
+
+class TestPeak(unittest.TestCase):
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testInitPeak(self):
+        topPos = 1
+        numHits = 2
+        smoothArray = array("f", [0.] * 10)
+        numPlus = 3
+        peak = Peak.Peak(topPos, numHits, smoothArray, numPlus)
+        self.assertEquals(1, peak.topPos)
+        self.assertEquals(2, peak.numHits)
+
+    def testProperties(self):
+        topPos = 1
+        numHits = 2
+        smoothArray = array("f", [0.] * 10)
+        numPlus = 3
+        peak = Peak.Peak(topPos, numHits, smoothArray, numPlus)
+        peak.topPos = 10
+        self.assertEquals(10, peak.topPos)
+        peak.numHits = 20
+        self.assertEquals(20, peak.numHits)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestPeak))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
+\ No newline at end of file
diff --git a/test/testPeaksToRegion.py b/test/testPeaksToRegion.py

index dd16d8693d94722dd727c165a3b3da3fac8769b7..350999c10e4fc7deb9a98792922c1d4a2dba45e7 100644 (file)
--- a/test/testPeaksToRegion.py
+++ b/test/testPeaksToRegion.py
@@ -5,7 +5,7 @@ Created on Oct 4, 2010
  '''
  import unittest
  import os
-from Erange import peakstoregion
+from erange import peakstoregion
  
  inFileName = "testPeaksToRegionInFile.txt"
  outFileName = "testPeaksToRegionOutFile.txt"
diff --git a/test/testProcessVelvet.py b/test/testProcessVelvet.py

index 99ef5a3677e7d81fbdd00f5df0d261666ab7cc30..b657c62de566ebf85bc0251db5ab3e20eccf6d4b 100644 (file)
--- a/test/testProcessVelvet.py
+++ b/test/testProcessVelvet.py
@@ -5,7 +5,7 @@ Created on Sep 15, 2010
  '''
  import unittest
  import os
-from Erange.rnapath import processvelvet
+from erange.rnapath import processvelvet
  
  
  class TestProcessVelvet(unittest.TestCase):
diff --git a/test/testRNAPATH.py b/test/testRNAPATH.py

index e217ff16e5f1a4909ef3b7a778d05f79e898f2f4..ec8c1e34334cdb5cbba58aa98df3edcf0c8d8b61 100644 (file)
--- a/test/testRNAPATH.py
+++ b/test/testRNAPATH.py
@@ -5,7 +5,7 @@ Created on Sep 10, 2010
  '''
  import unittest
  import os
-from Erange.rnapath import RNAPATH
+from erange.rnapath import RNAPATH
  
  compDict = {"A": "T",
              "T": "A",
diff --git a/test/testReadDataset.py b/test/testReadDataset.py

index 3ac8f54e5f11c9ad6ab632eb0ee142cbe0c273bd..a2e65bc647d168c10c7baaed3172257f64425df2 100644 (file)
--- a/test/testReadDataset.py
+++ b/test/testReadDataset.py
@@ -6,7 +6,7 @@ Created on Jul 21, 2010
  import unittest
  import os
  import sqlite3 as sqlite
-from Erange import ReadDataset
+from erange import ReadDataset
  
  testDBName = "testRDS.rds"
  rnaTestDBName = "testRDSRNA.rds"
diff --git a/test/testRegion.py b/test/testRegion.py

new file mode 100644 (file)

index 0000000..f1ffd52
--- /dev/null
+++ b/test/testRegion.py
@@ -0,0 +1,44 @@
+import unittest
+from erange import Region
+
+
+class TestRegion(unittest.TestCase):
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        pass
+
+
+    def testRegion(self):
+        factor = "foo"
+        region = Region.Region(factor, "index", "chrom", "start", "stop", "numReads", "foldRatio", "multiP", "peakDescription", "shift")
+        self.assertEquals(factor, region.factor)
+
+
+    def testDirectionalRegion(self):
+        factor = "foo"
+        region = Region.DirectionalRegion(factor, "index", "chrom", "start", "stop", "numReads", "foldRatio", "multiP", "plusP", "leftP", "peakDescription", "shift")
+        self.assertEquals(factor, region.factor)
+
+
+    def testPrintDeirectionalRegionWithShift(self):
+        factor = "foo"
+        region = Region.DirectionalRegion(factor, 1, "chrom", 10, 100, 1.0, 0.5, 0.4, 0.3, 0.2, "peakDescription", 9)
+        result = "foo1\tchrom\t10\t100\t1.0\t0.5\t0.4\t0.3\t0.2\tpeakDescription\t9"
+        self.assertEquals(result, region.printRegionWithShift())
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestRegion))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
+\ No newline at end of file
diff --git a/test/testRnaAToIFilter.py b/test/testRnaAToIFilter.py

index d2fdfa52bc425165982c5a3e325056fbf0fd2fa6..cadb7759647b3aac2a6ac082d04ed66ec3b71436 100644 (file)
--- a/test/testRnaAToIFilter.py
+++ b/test/testRnaAToIFilter.py
@@ -4,7 +4,7 @@ Created on Aug 25, 2010
  @author: sau
  '''
  import unittest
-from Erange import rnaAToIFilter
+from erange import rnaAToIFilter
  
  
  class TestRnaAToIFilter(unittest.TestCase):
diff --git a/test/testRnaEditing.py b/test/testRnaEditing.py

index 5eb4a0d8eac37401032f4a7bf606288eb34badfc..f0b5dc6b46261697d090a8533cf4f094de3cfeb3 100644 (file)
--- a/test/testRnaEditing.py
+++ b/test/testRnaEditing.py
@@ -4,7 +4,7 @@ Created on Aug 23, 2010
  @author: sau
  '''
  import unittest
-from Erange import rnaEditing
+from erange import rnaEditing
  
  
  class TestRnaEditing(unittest.TestCase):
diff --git a/test/testTranscripts.py b/test/testTranscripts.py

index cf401a74a01af1b8d7dd2fd853b46dcbe2b8760b..4fed488574fddfb1919b23ca2d6cea0d2121a886 100644 (file)
--- a/test/testTranscripts.py
+++ b/test/testTranscripts.py
@@ -5,7 +5,7 @@ Created on Oct 4, 2010
  '''
  import unittest
  import os
-from Erange import transcripts
+from erange import transcripts
  
  inFileName = "testTranscriptsInFile.txt"
  outFileName = "testTranscriptsOutFile.txt"
diff --git a/test/testmakebedfromrds.py b/test/testmakebedfromrds.py

index f11ccd6e401ba06303e70149bb94b8cd1459025c..5ba160dba0e2ef42d9b78c55b1792a39c4f5e3d1 100644 (file)
--- a/test/testmakebedfromrds.py
+++ b/test/testmakebedfromrds.py
@@ -4,7 +4,7 @@ Created on Jun 4, 2010
  @author: sau
  '''
  import unittest
-from Erange import makebedfromrds
+from erange import makebedfromrds
  
  
  class TestMakeBedFromRds(unittest.TestCase):
diff --git a/transcripts.py b/transcripts.py

index 53b6aea2843acadcd93bbe64df414cb6903eb9e7..b3cd572f2ce62e4c3c3a1906980bb96fc2a5d9f9 100755 (executable)
--- a/transcripts.py
+++ b/transcripts.py
@@ -8,23 +8,18 @@
             where transcriptome size is in Gbp, cell count is in arbitrary units and efficiency is a fraction
  """
  
-import sys, optparse
+import sys
+import optparse
+from commoncode import getConfigParser, getConfigFloatOption
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "%prog: version 3.0"
+    print "transcripts: version 3.1"
      usage = "usage: python %prog rpkmFile outFile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--transcriptome", type="float", dest="tSize",
-                      help="transcriptome size in Gbp [default 200000.0]")
-    parser.add_option("--cells", type="float", dest="cellCount",
-                      help="arbitrary units [default 1e6]")
-    parser.add_option("--efficiency", type="float", dest="efficiency",
-                      help="fraction [default 0.3]")
-    parser.set_defaults(tSize=200000.0, cellCount=1e6, efficiency=0.3)
+    parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -37,6 +32,26 @@ def main(argv=None):
      transcripts(infile, outfile, options.tSize, options.cellCount, options.efficiency)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--transcriptome", type="float", dest="tSize",
+                      help="transcriptome size in Gbp [default 200000.0]")
+    parser.add_option("--cells", type="float", dest="cellCount",
+                      help="arbitrary units [default 1e6]")
+    parser.add_option("--efficiency", type="float", dest="efficiency",
+                      help="fraction [default 0.3]")
+
+    configParser = getConfigParser()
+    section = "transcripts"
+    tSize = getConfigFloatOption(configParser, section, "tSize", 200000.0)
+    cellCount = getConfigFloatOption(configParser, section, "cellCount", 1e6)
+    efficiency = getConfigFloatOption(configParser, section, "efficiency", 0.3)
+
+    parser.set_defaults(tSize=tSize, cellCount=cellCount, efficiency=efficiency)
+
+    return parser
+
+
  def transcripts(infilename, outfilename, tSize=200000, cellCount=1e6, efficiency=0.3):
      infile = open(infilename)
      outfile = open(outfilename, "w")
@@ -46,6 +61,7 @@ def transcripts(infilename, outfilename, tSize=200000, cellCount=1e6, efficiency
          transcripts = rpkm * tSize
          transPerCell = transcripts / cellCount / efficiency
          outfile.write("%s\t%.1f\t%.1f\n" % (fields[0], transcripts, transPerCell))
+
      infile.close()
      outfile.close()
  
diff --git a/trimreads.py b/trimreads.py

index d246e1512a94601dfcaeb6bb4c3473dbe9755b07..88f8851f8d5d88a07f6433fbd34c6110cfe5f166 100755 (executable)
--- a/trimreads.py
+++ b/trimreads.py
@@ -5,10 +5,12 @@
  #  Created by Ali Mortazavi on 8/12/08.
  #
  
-import sys, optparse
+import sys
+import optparse
  from cistematic.core import complement
+from commoncode import getConfigParser, getConfigBoolOption, getConfigOption
  
-print "%prog: version 2.1"
+print "trimreads: version 2.2"
  
  def main(argv=None):
      if not argv:
@@ -16,13 +18,7 @@ def main(argv=None):
  
      usage = "usage: python %prog length infile outfile [--fastq] [--fromback] [--paired] [--flip] [--filter maxN]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--fastq", action="store_true", dest="fastq")
-    parser.add_option("--fromback", action="store_true", dest="fromBack")
-    parser.add_option("--paired", action="store_true", dest="paired")
-    parser.add_option("--flip", action="store_true", dest="flipseq")
-    parser.add_option("--filter", type="int", dest="maxN")
-    parser.set_defaults(fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 3:
@@ -37,6 +33,27 @@ def main(argv=None):
      trimreads(length, infile, outfile, options.fastq, options.fromBack, options.paired, options.flipseq, options.maxN)
  
  
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--fastq", action="store_true", dest="fastq")
+    parser.add_option("--fromback", action="store_true", dest="fromBack")
+    parser.add_option("--paired", action="store_true", dest="paired")
+    parser.add_option("--flip", action="store_true", dest="flipseq")
+    parser.add_option("--filter", type="int", dest="maxN")
+
+    configParser = getConfigParser()
+    section = "trimreads"
+    fastq = getConfigBoolOption(configParser, section, "fastq", False)
+    fromBack = getConfigBoolOption(configParser, section, "fromBack", False)
+    paired = getConfigBoolOption(configParser, section, "paired", False)
+    flipseq = getConfigBoolOption(configParser, section, "flipseq", False)
+    maxN = getConfigOption(configParser, section, "maxN", None)
+
+    parser.set_defaults(fastq=fastq, fromBack=fromBack, paired=paired, flipseq=flipseq, maxN=maxN)
+
+    return parser
+
+
  def trimreads(length, inFileName, outFileName, fastq=False, fromBack=False, paired=False, flipseq=False, maxN=None):
      infile = open(inFileName)
      outfile = open(outFileName, "w")
diff --git a/utrChanges.py b/utrChanges.py

index d95d18ceea423fa373ea408b3e22bc39092ff7f3..bb193f2f8045f653f38c6bfdd3b6ef51d0a2e010 100755 (executable)
--- a/utrChanges.py
+++ b/utrChanges.py
@@ -13,7 +13,7 @@ import sys
  from commoncode import getMergedRegions, getLocusByChromDict
  from cistematic.genomes import Genome
  
-print "%s: version 1.3" % sys.argv[0]
+print "utrChanges: version 1.4"
  
  
  def main(argv=None):
@@ -37,8 +37,8 @@ def utrChanges(genome, acceptfile, outFileName):
  
      hg = Genome(genome)
  
-    origLocusByChromDict = getLocusByChromDict(hg, keepSense = True)
-    newLocusByChromDict = getLocusByChromDict(hg, additionalRegionsDict = acceptDict, keepSense = True)
+    origLocusByChromDict = getLocusByChromDict(hg, keepSense=True)
+    newLocusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
  
      new3utr = 0
      new5utr = 0
diff --git a/weighMultireads.py b/weighMultireads.py

index ed27edf77b6c3d2912b00b0c4c64d6fb7f50167a..f4b1691a07a08b15988539f75f1fbb2a62e148b2 100755 (executable)
--- a/weighMultireads.py
+++ b/weighMultireads.py
@@ -12,10 +12,15 @@ try:
  except:
      pass
  
-from commoncode import readDataset
-import sys, time, string, optparse
+import sys
+import time
+import string
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, getConfigOption
  
-print "%prog: version 3.1"
+
+print "weighMultireads: version 3.3"
  
  def main(argv=None):
      if not argv:
@@ -23,13 +28,7 @@ def main(argv=None):
  
      usage = "usage: python %s rdsfile [--radius bp] [--noradius] [--usePairs maxDist] [--verbose] [--cache pages]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--radius", type="int", dest="radius")
-    parser.add_option("--noradius", action="store_false", dest="doRadius")
-    parser.add_option("--usePairs", type="int", dest="pairDist")
-    parser.add_option("--verbose", action="store_true", dest="verbose")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.set_defaults(radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None)
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 1:
@@ -41,260 +40,294 @@ def main(argv=None):
      weighMultireads(rdsfile, options.radius, options.doRadius, options.pairDist, options.verbose, options.cachePages)
  
  
-def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--radius", type="int", dest="radius")
+    parser.add_option("--noradius", action="store_false", dest="doRadius")
+    parser.add_option("--usePairs", type="int", dest="pairDist")
+    parser.add_option("--verbose", action="store_true", dest="verbose")
+    parser.add_option("--cache", type="int", dest="cachePages")
  
-    if radius is not None:
-        doRadius = True
-    else:
-        radius = 100
+    configParser = getConfigParser()
+    section = "weighMultireads"
+    radius = getConfigOption(configParser, section, "radius", None)
+    doRadius = getConfigBoolOption(configParser, section, "doRadius", True)
+    pairDist = getConfigOption(configParser, section, "pairDist", None)
+    verbose = getConfigBoolOption(configParser, section, "verbose", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    
+    parser.set_defaults(radius=radius, doRadius=doRadius, pairDist=pairDist, verbose=verbose, cachePages=cachePages)
  
-    usePairs = False
-    if pairDist is not None:
-        usePairs = True
+    return parser
+
+
+def weighMultireads(rdsfile, radius=None, doRadius=True, pairDist=None, verbose=False, cachePages=None):
  
-    tooFar = pairDist * 10
-    
-    doCache = False
      if cachePages is not None:
          doCache = True
      else:
+        doCache = False
          cachePages = 1
  
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
-    readlen = RDS.getReadSize()
-    halfreadlen = readlen / 2
-
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
      if cachePages > RDS.getDefaultCacheSize():
          RDS.setDBcache(cachePages)
  
      if verbose:
          print time.ctime()
  
-    multiIDs = RDS.getReadIDs(uniqs=False,multi=True)
+    multiIDs = RDS.getReadIDs(uniqs=False, multi=True)
      if verbose:
          print "got multiIDs ", time.ctime()
  
-    fixedPair = 0
      fixedReads = []
-    if usePairs:
-        print "doing pairs with pairDist = %d" % pairDist
-        uidDict = {}
-        midDict = {}
-        jointList = []
-        bothMultiList = []
-        mainIDList = []
-        guDict = {}
-        muDict = {}
-
-        if RDS.dataType == "RNA":
-            uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=True)
-        else:
-            uniqIDs = RDS.getReadIDs(uniqs=True,multi=False,splices=False)
+    if pairDist is not None:
+        fixedReads = reweighUsingPairs(RDS, pairDist, multiIDs, verbose)
  
-        if verbose:
-            print "got uniqIDs ", time.ctime()
+    if radius is not None:
+        doRadius = True
+    else:
+        radius = 100
  
-        for readID in uniqIDs:
-            (mainID, pairID) = readID.split("/")
-            try:
-                uidDict[mainID].append(pairID)
-            except:
-                uidDict[mainID] = [pairID]
-                mainIDList.append(mainID)
+    if doRadius:
+        reweighUsingRadius(RDS, radius, multiIDs, fixedReads, verbose)
  
-        if verbose:
-            print "uidDict all ", len(uidDict), time.ctime()
+    if doCache:
+        RDS.saveCacheDB(rdsfile)
  
-        for mainID in mainIDList:
-            if len(uidDict[mainID]) == 2:
-                del uidDict[mainID]
+    if verbose:
+        print "finished", time.ctime()
  
-        if verbose:
-            print "uidDict first candidates ", len(uidDict), time.ctime()
  
-        for readID in multiIDs:
-            (frontID, multiplicity) = readID.split("::")
-            (mainID, pairID) = frontID.split("/")
-            try:
-                if pairID not in midDict[mainID]:
-                    midDict[mainID].append(pairID)
-            except:
-                midDict[mainID] = [pairID]
+def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
+    fixedPair = 0
+    tooFar = pairDist * 10
+    readlen = RDS.getReadSize()
+    fixedReads = []
+    print "doing pairs with pairDist = %d" % pairDist
+    hasSplices = RDS.dataType == "RNA"
+    uniqIDs = RDS.getReadIDs(uniqs=True, multi=False, splices=hasSplices)
+
+    if verbose:
+        print "got uniqIDs ", time.ctime()
  
-        if verbose:
-            print "all multis ", len(midDict), time.ctime()
+    jointList, bothMultiList = getReadIDLists(uniqIDs, multiIDs, verbose)
+    uniqDict = getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose)
+    if verbose:
+        print "guDict actual ", len(uniqDict), time.ctime()
  
-        mainIDList = uidDict.keys()
-        for mainID in mainIDList:
-            if mainID not in midDict:
-                del uidDict[mainID]
+    multiDict = getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose)
+    if verbose:
+        print "muDict actual ", len(multiDict), time.ctime()
+
+    RDS.setSynchronousPragma("OFF")
+    for readID in jointList:
+        try:
+            ustart = uniqDict[readID]["start"]
+            ustop = ustart + readlen
+        except KeyError:
+            ustart = uniqDict[readID]["startL"]
+            ustop = uniqDict[readID]["stopR"]
+
+        uniqReadChrom = uniqDict[readID]["chrom"]
+        multiReadList = multiDict[readID]
+        numMultiReads = len(multiReadList)
+        bestMatch = [tooFar] * numMultiReads
+        found = False
+        for index in range(numMultiReads):
+            mstart = multiReadList[index]["start"]
+            multiReadChrom = multiReadList[index]["chrom"]
+            mpair = multiReadList[index]["pairID"]
+            if uniqReadChrom != multiReadChrom:
+                continue
  
-        if verbose:
-            print "uidDict actual candidates ", len(uidDict), time.ctime()
+            if abs(mstart - ustart) < pairDist:
+                bestMatch[index] = abs(mstart - ustart)
+                found = True
+            elif abs(mstart - ustop) < pairDist:
+                bestMatch[index] = abs(mstart - ustop)
+                found = True
  
-        for readID in midDict:
-            listLen = len(midDict[readID])
-            if listLen == 1:
-                if readID in uidDict:
-                    jointList.append(readID)
-            elif listLen == 2:
-                bothMultiList.append(readID)
+        if found:
+            theMatch = -1
+            theDist = tooFar
+            reweighList = []
+            for index in range(numMultiReads):
+                if theDist > bestMatch[index]:
+                    theMatch = index
+                    theDist = bestMatch[index]
+
+            theID = string.join([readID, mpair], "/")
+            for index in range(numMultiReads):
+                if index == theMatch:
+                    score = 1 - ((numMultiReads - 1) / (100. * numMultiReads))
+                else:
+                    score = 1 / (100. * numMultiReads)
+
+                start = multiReadList[index][0]
+                chrom = "chr%s" % multiReadList[index][1]
+                reweighList.append((round(score,3), chrom, start, theID))
+
+            #TODO: Is this right? If match index is 0 are we doing nothing?
+            if theMatch > 0:
+                RDS.reweighMultireads(reweighList)
+                fixedPair += 1
+                if verbose and fixedPair % 10000 == 1:
+                    print "fixed %d" % fixedPair
+                    print uniqDict[readID]
+                    print multiDict[readID]
+                    print reweighList
+
+                fixedReads.append(theID)
+
+    RDS.setSynchronousPragma("ON")
+
+    print "fixed %d pairs" % fixedPair
+    print time.ctime()
+
+    return fixedReads
+
+
+def getReadIDLists(uniqIDs, multiIDs, verbose=False):
+    uidDict = {}
+    mainIDList = []
+    for readID in uniqIDs:
+        (mainID, pairID) = readID.split("/")
+        try:
+            uidDict[mainID].append(pairID)
+        except:
+            uidDict[mainID] = [pairID]
+            mainIDList.append(mainID)
  
-        if verbose:
-            print "joint ", len(jointList), time.ctime()
-            print "bothMulti ", len(bothMultiList), time.ctime()
+    if verbose:
+        print "uidDict all ", len(uidDict), time.ctime()
  
-        del uidDict
-        del midDict
-        del mainIDList
-        del uniqIDs
+    for mainID in mainIDList:
+        if len(uidDict[mainID]) == 2:
+            del uidDict[mainID]
  
-        uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
-        if verbose:
-            print "got uniq dict ", len(uniqDict), time.ctime()
-
-        if RDS.dataType == "RNA":
-            spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
-            if verbose:
-                print "got splice dict ", len(spliceDict), time.ctime()
-
-        for readID in jointList:
-            try:
-                guDict[readID] = uniqDict[readID][0]
-            except:
-                if RDS.dataType == "RNA":
-                    guDict[readID] = spliceDict[readID][0]
-
-        del uniqDict
-        del spliceDict
-        if verbose:
-            print "guDict actual ", len(guDict), time.ctime()
+    if verbose:
+        print "uidDict first candidates ", len(uidDict), time.ctime()
+
+    midDict = {}
+    for readID in multiIDs:
+        (frontID, multiplicity) = readID.split("::")
+        (mainID, pairID) = frontID.split("/")
+        try:
+            if pairID not in midDict[mainID]:
+                midDict[mainID].append(pairID)
+        except:
+            midDict[mainID] = [pairID]
  
-        multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
-        if verbose:
-            print "got multi dict ", len(multiDict), time.ctime()
+    if verbose:
+        print "all multis ", len(midDict), time.ctime()
  
-        for readID in jointList:
-            muDict[readID] = multiDict[readID]
+    mainIDList = uidDict.keys()
+    for mainID in mainIDList:
+        if mainID not in midDict:
+            del uidDict[mainID]
  
-        for readID in bothMultiList:
-            muDict[readID] = multiDict[readID]
+    if verbose:
+        print "uidDict actual candidates ", len(uidDict), time.ctime()
+
+    jointList = []
+    bothMultiList = []
+    for readID in midDict:
+        listLen = len(midDict[readID])
+        if listLen == 1:
+            if readID in uidDict:
+                jointList.append(readID)
+        elif listLen == 2:
+            bothMultiList.append(readID)
  
-        del multiDict
-        if verbose:
-            print "muDict actual ", len(muDict), time.ctime()
-
-        RDS.setSynchronousPragma("OFF")
-        for readID in jointList:
-            try:
-                (ustart, uchrom, upair) = guDict[readID]
-                ustop = ustart + readlen
-            except:
-                (ustart, lstop, rstart, ustop, uchrom, upair) = guDict[readID]
-
-            muList = muDict[readID]
-            muLen = len(muList)
-            bestMatch = [tooFar] * muLen
-            found = False
-            for index in range(muLen):
-                (mstart, mchrom, mpair) = muList[index]
-                if uchrom != mchrom:
-                    continue
-
-                if abs(mstart - ustart) < pairDist:
-                    bestMatch[index] = abs(mstart - ustart)
-                    found = True
-                elif abs(mstart - ustop) < pairDist:
-                    bestMatch[index] = abs(mstart - ustop)
-                    found = True
-
-            if found:
-                theMatch = -1
-                theDist = tooFar
-                reweighList = []
-                for index in range(muLen):
-                    if theDist > bestMatch[index]:
-                        theMatch = index
-                        theDist = bestMatch[index]
-
-                theID = string.join([readID, mpair], "/")
-                for index in range(muLen):
-                    if index == theMatch:
-                        score = 1 - (muLen - 1) / (100. * (muLen))
-                    else:
-                        score = 1 / (100. * muLen)
-
-                    start = muList[index][0]
-                    chrom = "chr%s" % muList[index][1]
-                    reweighList.append((round(score,3), chrom, start, theID))
-
-                if theMatch > 0:
-                    RDS.reweighMultireads(reweighList)
-                    fixedPair += 1
-                    if verbose and fixedPair % 10000 == 1:
-                        print "fixed %d" % fixedPair
-                        print guDict[readID]
-                        print muDict[readID]
-                        print reweighList
-
-                    fixedReads.append(theID)
-
-        RDS.setSynchronousPragma("ON")
-
-        del guDict
-        del muDict
-        print "fixed %d pairs" % fixedPair
-        print time.ctime()
+    if verbose:
+        print "joint ", len(jointList), time.ctime()
+        print "bothMulti ", len(bothMultiList), time.ctime()
  
-    skippedReads = 0
-    if doRadius:
-        print "doing uniq read radius with radius = %d" % radius
-        multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
-        print "got multiDict"
-        RDS.setSynchronousPragma("OFF")
-        rindex = 0
-        for readID in multiIDs:
-            theID = readID
-            if theID in fixedReads:
-                skippedReads += 1
-                continue
+    return jointList, bothMultiList
  
-            if "::" in readID:
-                (readID, multiplicity) = readID.split("::")
-
-            scores = []
-            coords = []
-            for read in multiDict[readID]:
-                (start, weight, rID, chrom) = read
-                achrom = "chr%s" % chrom
-                regionStart = start + halfreadlen - radius
-                regionStop = start + halfreadlen + radius 
-                uniqs = RDS.getCounts(achrom, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
-                scores.append(uniqs + 1)
-                coords.append((achrom, start, theID))
-
-            total = float(sum(scores))
-            reweighList = []
-            for index in range(len(scores)):
-                reweighList.append((round(scores[index]/total,2), coords[index][0], coords[index][1], coords[index][2]))
  
-            RDS.reweighMultireads(reweighList)
-            rindex += 1
-            if rindex % 10000 == 0:
-                print rindex
+def getUniqAndSpliceReadsFromReadIDs(RDS, jointList, verbose=False):
+    """ Map each read ID in jointList to its first unique (or splice) read entry.
+
+    RDS is queried once for its unique reads keyed by read ID and, only when
+    RDS.dataType is "RNA", once more for its splices keyed by read ID.
+    For every ID in jointList the first entry found under that ID is stored,
+    looking in the unique reads first and in the splices second.
+
+    Returns a dict of readID -> first matching entry.  For non-RNA datasets an
+    ID that is missing from the unique reads is silently left out of the
+    result.  For RNA datasets an ID that is missing from both the unique reads
+    and the splices raises KeyError from the splice lookup.
+
+    When verbose is true the size of each fetched dict is printed together
+    with a timestamp.
+    """
+    uniqReadsDict = {}
+    uniqDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
+    if verbose:
+        print "got uniq dict ", len(uniqDict), time.ctime()
  
-        RDS.setSynchronousPragma("ON")
+    if RDS.dataType == "RNA":
+        spliceDict = RDS.getSplicesDict(noSense=True, withChrom=True, withPairID=True, readIDDict=True)
          if verbose:
-            print "skipped ", skippedReads
+            print "got splice dict ", len(spliceDict), time.ctime()
  
-        print "reweighted ", rindex
+    for readID in jointList:
+        try:
+            uniqReadsDict[readID] = uniqDict[readID][0]
+        except KeyError:
+            # spliceDict only exists for RNA datasets, so the fallback is
+            # guarded by the same dataType test that created it.
+            if RDS.dataType == "RNA":
+                uniqReadsDict[readID] = spliceDict[readID][0]
+
+    return uniqReadsDict
  
-    if doCache:
-        RDS.saveCacheDB(rdsfile)
  
+def getMultiReadsFromReadIDs(RDS, jointList, bothMultiList, verbose=False):
+    """ Return the multiread entries for the IDs in jointList and bothMultiList.
+
+    All multireads are fetched from RDS keyed by read ID, and the entries for
+    the requested IDs are copied into a new dict (readID -> whatever
+    RDS.getReadsDict stored under that ID).  An ID present in both lists
+    simply ends up in the result once.
+
+    Raises KeyError if any requested ID has no multiread entry, since the
+    lookups are not guarded.
+
+    When verbose is true the size of the fetched dict is printed together
+    with a timestamp.
+    """
+    multiReadSubsetDict = {}
+    multiDict = RDS.getReadsDict(noSense=True, withChrom=True, withPairID=True, doUniqs=False, doMulti=True, readIDDict=True)
      if verbose:
-        print "finished", time.ctime()
-    
+        print "got multi dict ", len(multiDict), time.ctime()
+
+    for readID in jointList:
+        multiReadSubsetDict[readID] = multiDict[readID]
+
+    for readID in bothMultiList:
+        multiReadSubsetDict[readID] = multiDict[readID]
+
+    return multiReadSubsetDict
+
+
+def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=None, verbose=False):
+    """ Reweigh multireads by the number of unique reads near each candidate location.
+
+    For every ID in multiIDs that is not listed in readsToSkip, each location
+    of that multiread gets a score of one plus the unique-read count that
+    RDS.getCounts reports for the window
+    [start + halfreadlen - radius, start + halfreadlen + radius].
+    The scores are normalised so they sum to one (rounded to two decimals) and
+    handed to RDS.reweighMultireads as
+    (weight, "chr<chrom>", start, original multiread ID) tuples.
+
+    RDS         - dataset object providing getReadSize, getReadsDict, getCounts,
+                  reweighMultireads and setSynchronousPragma.
+    radius      - half-width of the window searched around the read midpoint.
+    multiIDs    - iterable of multiread IDs; an ID may carry a "::<multiplicity>"
+                  suffix, which is stripped before the multiread lookup.
+    readsToSkip - optional iterable of multiread IDs (as they appear in
+                  multiIDs) that must not be reweighed.  None means skip nothing.
+    verbose     - when true, also print how many reads were skipped.
+
+    Prints progress every 10000 reweighed reads and a final total.
+    Returns None.
+    """
+    # Hash the skip IDs once: the membership test below runs for every
+    # multiread, and scanning a list each time is quadratic on large datasets.
+    if readsToSkip is None:
+        skipIDs = frozenset()
+    else:
+        skipIDs = frozenset(readsToSkip)
+
+    skippedReads = 0
+    readlen = RDS.getReadSize()
+    halfreadlen = readlen / 2
+    print "doing uniq read radius with radius = %d" % radius
+    multiDict = RDS.getReadsDict(noSense=True, withWeight=True, withChrom=True, withID=True, doUniqs=False, doMulti=True, readIDDict=True)
+    print "got multiDict"
+    RDS.setSynchronousPragma("OFF")
+    reweighedCount = 0
+    try:
+        for originalMultiReadID in multiIDs:
+            if originalMultiReadID in skipIDs:
+                skippedReads += 1
+                continue
+
+            # The lookup key is the ID without its "::multiplicity" suffix;
+            # the reweigh call still uses the original, suffixed ID.
+            readID = originalMultiReadID
+            if "::" in readID:
+                (readID, multiplicity) = readID.split("::")
+
+            scores = []
+            coords = []
+            for read in multiDict[readID]:
+                start = read["start"]
+                chromosome = "chr%s" % read["chrom"]
+                regionStart = start + halfreadlen - radius
+                regionStop = start + halfreadlen + radius
+                uniqs = RDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=False, splices=False, reportCombined=True)
+                # The +1 keeps every location at a non-zero weight and the
+                # total non-zero even when no unique reads are nearby.
+                scores.append(uniqs + 1)
+                coords.append((chromosome, start, originalMultiReadID))
+
+            total = float(sum(scores))
+            reweighList = []
+            for (score, coord) in zip(scores, coords):
+                reweighList.append((round(score / total, 2),) + coord)
+
+            RDS.reweighMultireads(reweighList)
+            reweighedCount += 1
+            if reweighedCount % 10000 == 0:
+                print reweighedCount
+    finally:
+        # Restore the pragma even if a lookup or update fails part-way, so the
+        # dataset is not left with synchronous writes switched off.
+        RDS.setSynchronousPragma("ON")
+
+    if verbose:
+        print "skipped ", skippedReads
+
+    print "reweighted ", reweighedCount
+
+
  
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file
author	Sean Upchurch <sau@caltech.edu>
	Tue, 30 Nov 2010 22:26:37 +0000 (14:26 -0800)
committer	Sean Upchurch <sau@caltech.edu>
	Tue, 30 Nov 2010 22:26:37 +0000 (14:26 -0800)
MakeBamFromRds.py		patch \| blob \| history
MakeRdsFromBam.py		patch \| blob \| history
Peak.py	[new file with mode: 0644]	patch \| blob
ReadDataset.py		patch \| blob \| history
Region.py	[new file with mode: 0644]	patch \| blob
altSpliceCounts.py		patch \| blob \| history
analyzego.py		patch \| blob \| history
bedtoregion.py		patch \| blob \| history
binstocdf.py		patch \| blob \| history
buildMatrix.py		patch \| blob \| history
buildrmaskdb.py		patch \| blob \| history
buildsnpdb.py		patch \| blob \| history
checkrmask.py		patch \| blob \| history
chkSNPrmask.py		patch \| blob \| history
chksnp.py		patch \| blob \| history
colsum.py		patch \| blob \| history
combineRPKMs.py		patch \| blob \| history
combinerds.py		patch \| blob \| history
commoncode.py		patch \| blob \| history
crossmatch.py		patch \| blob \| history
distalPairs.py		patch \| blob \| history
docs/README.build-rds		patch \| blob \| history
docs/README.chip-seq		patch \| blob \| history
docs/README.rna-seq		patch \| blob \| history
docs/README.rnapath		patch \| blob \| history
docs/RNA-seq.analysisSteps.txt		patch \| blob \| history
docs/buildMatrix.sh		patch \| blob \| history
docs/partition.sh		patch \| blob \| history
docs/regionCounts.sh		patch \| blob \| history
docs/runRNAPairedAnalysis.sh		patch \| blob \| history
docs/runSNPAnalysis.sh		patch \| blob \| history
docs/runStandardAnalysis.sh		patch \| blob \| history
docs/runStrandedAnalysis.sh		patch \| blob \| history
farPairs.py		patch \| blob \| history
featureIntersects.py		patch \| blob \| history
findMotifs.py		patch \| blob \| history
findall.py		patch \| blob \| history
fraction.py		patch \| blob \| history
geneDownstreamBins.py		patch \| blob \| history
geneLocusBins.py		patch \| blob \| history
geneLocusCounts.py		patch \| blob \| history
geneLocusPeaks.py		patch \| blob \| history
geneMrnaCounts.py		patch \| blob \| history
geneMrnaCountsWeighted.py		patch \| blob \| history
geneNeighbors.py		patch \| blob \| history
geneStallingBins.py		patch \| blob \| history
geneStartBins.py		patch \| blob \| history
geneUpstreamBins.py		patch \| blob \| history
getGOgenes.py		patch \| blob \| history
getNovelSNPs.py		patch \| blob \| history
getSNPGeneInfo.py		patch \| blob \| history
getSNPs.py		patch \| blob \| history
getallNRSE.py		patch \| blob \| history
getallgenes.py		patch \| blob \| history
getallsites.py		patch \| blob \| history
getfasta.py		patch \| blob \| history
getgosig.py		patch \| blob \| history
getmers.py		patch \| blob \| history
getsplicefa.py		patch \| blob \| history
gointersects.py		patch \| blob \| history
hepg2.rds	[deleted file]	patch \| blob \| history
intersects.py		patch \| blob \| history
listGeneFeatures.py		patch \| blob \| history
makeGraphs.py		patch \| blob \| history
makeSNPtrack.py		patch \| blob \| history
makebedfromrds.py		patch \| blob \| history
makerdsfrombed.py		patch \| blob \| history
makerdsfromblat.py		patch \| blob \| history
makerdsfrombowtie.py		patch \| blob \| history
makerdsfromeland2.py		patch \| blob \| history
makesitetrack.py		patch \| blob \| history
makewiggle.py		patch \| blob \| history
normalizeExpandedExonic.py		patch \| blob \| history
normalizeFinalExonic.py		patch \| blob \| history
partition.py		patch \| blob \| history
peakstoregion.py		patch \| blob \| history
plotbardist.py		patch \| blob \| history
plotnomogram.py		patch \| blob \| history
plotprofile.py		patch \| blob \| history
predictSpliceCount.py		patch \| blob \| history
profilebins.py		patch \| blob \| history
ratio.py		patch \| blob \| history
rdsmetadata.py		patch \| blob \| history
regionBins.py		patch \| blob \| history
regionCounts.py		patch \| blob \| history
regionintersects.py		patch \| blob \| history
regiontobed.py		patch \| blob \| history
rnaEditing.py		patch \| blob \| history
rnafarPairs.py		patch \| blob \| history
scatterfields.py		patch \| blob \| history
siteintersects.py		patch \| blob \| history
stallCategory.py		patch \| blob \| history
test/testAnalyzeGO.py		patch \| blob \| history
test/testChksnp.py		patch \| blob \| history
test/testCommoncode.py		patch \| blob \| history
test/testErange.py		patch \| blob \| history
test/testGeneMrnaCounts.py		patch \| blob \| history
test/testGeneMrnaCountsWeighted.py	[new file with mode: 0644]	patch \| blob
test/testGetFasta.py		patch \| blob \| history
test/testGetSNPGeneInfo.py		patch \| blob \| history
test/testGetSNPs.py		patch \| blob \| history
test/testMakeBamFromRds.py		patch \| blob \| history
test/testMakeGraphs.py		patch \| blob \| history
test/testMakeRdsFromBam.py		patch \| blob \| history
test/testMakeSNPTrack.py		patch \| blob \| history
test/testMarkLinkers.py		patch \| blob \| history
test/testPeak.py	[new file with mode: 0644]	patch \| blob
test/testPeaksToRegion.py		patch \| blob \| history
test/testProcessVelvet.py		patch \| blob \| history
test/testRNAPATH.py		patch \| blob \| history
test/testReadDataset.py		patch \| blob \| history
test/testRegion.py	[new file with mode: 0644]	patch \| blob
test/testRnaAToIFilter.py		patch \| blob \| history
test/testRnaEditing.py		patch \| blob \| history
test/testTranscripts.py		patch \| blob \| history
test/testmakebedfromrds.py		patch \| blob \| history
transcripts.py		patch \| blob \| history
trimreads.py		patch \| blob \| history
utrChanges.py		patch \| blob \| history
weighMultireads.py		patch \| blob \| history