Release version for Erange 4.0a
author Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
committer Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
34 files changed:
MakeBamFromRds.py
ReadDataset.py
altSpliceCounts.py
bedtoregion.py
binstocdf.py
buildMatrix.py
cdfdist.py
colsum.py
combineRPKMs.py
combinerds.py
commoncode.py
distalPairs.py
farPairs.py
featureIntersects.py
findall.py
geneDownstreamBins.py
geneLocusBins.py
geneMrnaCountsWeighted.py
getNovelSNPs.py
makerdsfromblat.py
makerdsfrombowtie.py
makerdsfromeland2.py
predictSpliceCount.py
siteintersects.py
test/testBedToRegion.py [new file with mode: 0644]
test/testBinsToCdf.py [new file with mode: 0644]
test/testCdfDist.py [new file with mode: 0644]
test/testChksnp.py
test/testColsum.py [new file with mode: 0644]
test/testCommoncode.py
test/testErange.py
test/testFeatureIntersects.py [new file with mode: 0644]
test/testMakeSiteTrack.py [new file with mode: 0644]
weighMultireads.py

index bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59..730c20be6fb147f84e58b7f80485199e0975ad3a 100644 (file)
@@ -22,13 +22,13 @@ import pysam
 import ReadDataset
 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
 
+VERSION = "1.0"
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    verstring = "makeBamFromRds: version 1.0"
-    print verstring
+    print "makeBamFromRds: version %s" % VERSION
 
     doPairs = False
     
@@ -134,7 +134,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
     for chrom in chromRemoveList:
         chromList.remove(chrom)
 
-    header = {"HD": {"VN": "1.0"}}
+    header = {"HD": {"VN": VERSION}}
     if referenceSequenceList:
         header["SQ"] = referenceSequenceList
 
index 5ff60e2e6954c862888c5807495ef49142e14fa3..56e28a8995b734735ace06d1d990ac9b46753580 100644 (file)
@@ -153,42 +153,42 @@ class ReadDataset():
             self.cachedDB = ""
 
 
-    def attachDB(self, filename, asname):
+    def attachDB(self, filename, dbName):
         """ attach another database file to the readDataset.
         """
-        stmt = "attach '%s' as %s" % (filename, asname)
+        stmt = "attach '%s' as %s" % (filename, dbName)
         self.execute(stmt)
 
 
-    def detachDB(self, asname):
+    def detachDB(self, dbName):
         """ detach a database file to the readDataset.
         """
-        stmt = "detach %s" % (asname)
+        stmt = "detach %s" % (dbName)
         self.execute(stmt)
 
 
-    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+    def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
         """ import into current RDS the table (with columns destcolumns,
             with default all columns) from the database file asname,
             using the column specification of ascolumns (default all).
         """
-        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
         if flagged != "":
             stmt += " where flag = '%s' " % flagged
 
         self.executeCommit(stmt)
 
 
-    def getTables(self, asname=""):
+    def getTables(self, dbName=""):
         """ get a list of table names in a particular database file.
         """
         resultList = []
         sql = self.getSqlCursor()
 
-        if asname != "":
-            asname += "."
+        if dbName != "":
+            dbName = "%s." % dbName
 
-        stmt = "select name from %ssqlite_master where type='table'" % asname
+        stmt = "select name from %ssqlite_master where type='table'" % dbName
         sql.execute(stmt)
         results = sql.fetchall()
 
@@ -291,11 +291,15 @@ class ReadDataset():
         if "readsize" not in metadata:
             raise ReadDatasetError("no readsize parameter defined")
         else:
-            mysize = metadata["readsize"]
-            if "import" in mysize:
-                mysize = mysize.split()[0]
+            readSize = metadata["readsize"]
+            if "import" in readSize:
+                readSize = readSize.split()[0]
 
-            return int(mysize)
+            readSize = int(readSize)
+            if readSize < 0:
+                raise ReadDatasetError("readsize is negative")
+
+            return readSize
 
 
     def getDefaultCacheSize(self):
@@ -317,11 +321,9 @@ class ReadDataset():
                 if row["chrom"] not in results:
                     results.append(row["chrom"])
             else:
-                if  len(row["chrom"][3:].strip()) < 1:
-                    continue
-
-                if row["chrom"][3:] not in results:
-                    results.append(row["chrom"][3:])
+                shortName = row["chrom"][3:]
+                if len(shortName.strip()) > 0 and shortName not in results:
+                    results.append(shortName)
 
         results.sort()
 
@@ -333,32 +335,17 @@ class ReadDataset():
         """ returns the maximum coordinate for reads on a given chromosome.
         """
         maxCoord = 0
-        sql = self.getSqlCursor()
 
         if doUniqs:
-            try:
-                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
-                maxCoord = int(sql.fetchall()[0][0])
-            except:
-                print "couldn't retrieve coordMax for chromosome %s" % chrom
+            maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
 
         if doSplices:
-            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
-            try:
-                spliceMax = int(sql.fetchall()[0][0])
-                if spliceMax > maxCoord:
-                    maxCoord = spliceMax
-            except:
-                pass
+            spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
+            maxCoord = max(spliceMax, maxCoord)
 
         if doMulti:
-            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
-            try:
-                multiMax = int(sql.fetchall()[0][0])
-                if multiMax > maxCoord:
-                    maxCoord = multiMax
-            except:
-                pass
+            multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
+            maxCoord = max(multiMax, maxCoord)
 
         if verbose:
             print "%s maxCoord: %d" % (chrom, maxCoord)
@@ -366,6 +353,19 @@ class ReadDataset():
         return maxCoord
 
 
+    def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
+        maxCoord = 0
+        sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
+        sql = self.getSqlCursor()
+        try:
+            sql.execute(sqlStatement)
+            maxCoord = int(sql.fetchall()[0][0])
+        except:
+            print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+        return maxCoord
+
+
     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
@@ -378,67 +378,14 @@ class ReadDataset():
         
         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
         """
-        whereClause = []
-        resultsDict = {}
-
-        if chrom != "" and chrom != self.memChrom:
-            whereClause.append("chrom = '%s'" % chrom)
-
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
-            else:
-                whereClause.append("flag = '%s'" % flag)
-
-        if start > -1:
-            whereClause.append("start > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stop < %d" % stop)
-
-        if len(readLike) > 0:
-            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
-            whereClause.append(readIDClause)
-
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
-
-        if strand in ["+", "-"]:
-            whereClause.append("sense = '%s'" % strand)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
 
-        groupBy = []
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
         if findallOptimize:
-            selectClause = ["select start, sense, sum(weight)"]
-            groupBy = ["GROUP BY start, sense"]
+            selectQuery = "select start, sense, sum(weight)"
         else:
-            selectClause = ["select ID, chrom, start, readID"]
-            if bothEnds:
-                selectClause.append("stop")
-
-            if not noSense:
-                selectClause.append("sense")
-
-            if withWeight:
-                selectClause.append("weight")
-
-            if withFlag:
-                selectClause.append("flag")
+            selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 
-            if withMismatch:
-                selectClause.append("mismatch")
-
-        if limit > 0 and not combine5p:
-            groupBy.append("LIMIT %d" % limit)
-
-        selectQuery = string.join(selectClause, ",")
-        groupQuery = string.join(groupBy)
+        groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
         if doUniqs:
             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
             if doMulti:
@@ -498,6 +445,7 @@ class ReadDataset():
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
+        resultsDict = {}
         if findallOptimize:
             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
             if self.memBacked:
@@ -562,20 +510,17 @@ class ReadDataset():
         return resultsDict
 
 
-    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
-                       flag="", withWeight=False, withFlag=False, withMismatch=False,
-                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
-                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
-                       stop=-1, strand=""):
-        """ returns a dictionary of spliced reads in a variety of
-        formats and which can be restricted by chromosome or custom-flag.
-        Returns unique spliced reads for now.
-        """
-        whereClause = []
-        resultsDict = {}
+    def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
+        if splice:
+            startText = "startL"
+            stopText = "stopR"
+        else:
+            startText = "start"
+            stopText = "stop"
 
+        whereClause = []
         if chrom != "" and chrom != self.memChrom:
-            whereClause = ["chrom = '%s'" % chrom]
+            whereClause.append("chrom = '%s'" % chrom)
 
         if flag != "":
             if flagLike:
@@ -584,25 +529,37 @@ class ReadDataset():
             else:
                 whereClause.append("flag = '%s'" % flag)
 
+        if start > -1:
+            whereClause.append("%s > %d" % (startText, start))
+
+        if stop > -1:
+            whereClause.append("%s < %d" % (stopText, stop))
+
+        if len(readLike) > 0:
+            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
+            whereClause.append(readIDClause)
+
         if hasMismatch:
             whereClause.append("mismatch != ''")
 
-        if strand != "":
+        if strand in ["+", "-"]:
             whereClause.append("sense = '%s'" % strand)
 
-        if start > -1:
-            whereClause.append("startL > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stopR < %d" % stop)
-
         if len(whereClause) > 0:
             whereStatement = string.join(whereClause, " and ")
             whereQuery = "where %s" % whereStatement
         else:
             whereQuery = ""
 
-        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+        return whereQuery
+
+
+    def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+
+        selectClause = [baseSelect]
+        if bothEnds:
+            selectClause.append("stop")
+
         if not noSense:
             selectClause.append("sense")
 
@@ -615,7 +572,36 @@ class ReadDataset():
         if withMismatch:
             selectClause.append("mismatch")
 
-        selectQuery = string.join(selectClause, " ,")
+        selectQuery = string.join(selectClause, ",")
+
+        return selectQuery
+
+
+    def getReadGroupQuery(self, findallOptimize, limit, combine5p):
+        groupBy = []
+        if findallOptimize:
+            groupBy = ["GROUP BY start, sense"]
+
+        if limit > 0 and not combine5p:
+            groupBy.append("LIMIT %d" % limit)
+
+        groupQuery = string.join(groupBy)
+
+        return groupQuery
+
+
+    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+                       flag="", withWeight=False, withFlag=False, withMismatch=False,
+                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
+                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+                       stop=-1, strand=""):
+        """ returns a dictionary of spliced reads in a variety of
+        formats and which can be restricted by chromosome or custom-flag.
+        Returns unique spliced reads for now.
+        """
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
+        selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
+        selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
         if self.memBacked:
             sql = self.memcon.cursor()
         else:
@@ -625,6 +611,7 @@ class ReadDataset():
         sql.execute(stmt)
         currentReadID = ""
         currentChrom = ""
+        resultsDict = {}
         for row in sql:
             pairID = 0
             readID = row["readID"]
@@ -796,10 +783,6 @@ class ReadDataset():
         """ get readID's.
         """
         stmt = []
-        limitPart = ""
-        if limit > 0:
-            limitPart = "LIMIT %d" % limit
-
         if uniqs:
             stmt.append("select readID from uniqs")
 
@@ -814,6 +797,10 @@ class ReadDataset():
         else:
             selectPart = ""
 
+        limitPart = ""
+        if limit > 0:
+            limitPart = "LIMIT %d" % limit
+
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
         if self.memBacked:
             sql = self.memcon.cursor()
@@ -845,12 +832,11 @@ class ReadDataset():
                 print "getting mismatches from chromosome %s" % (achrom)
 
             snpDict[achrom] = []
-            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if useSplices and self.dataType == "RNA":
                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                 spliceIDList = spliceDict.keys()
-                for k in spliceIDList:
-                    spliceEntry = spliceDict[k][0]
+                for spliceID in spliceIDList:
+                    spliceEntry = spliceDict[spliceID][0]
                     startpos = spliceEntry["startL"]
                     lefthalf = spliceEntry["stopL"]
                     rightstart = spliceEntry["startR"]
@@ -881,6 +867,7 @@ class ReadDataset():
 
                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 
+            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if achrom not in hitDict.keys():
                 continue
 
@@ -910,7 +897,7 @@ class ReadDataset():
 
 
     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
-                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+                        useSplices=False, normalizationFactor=1.0, trackStrand=False,
                         keepStrand="both", shiftValue=0):
         """return a profile of the chromosome as an array of per-base read coverage....
             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
@@ -925,8 +912,8 @@ class ReadDataset():
         dataType = metadata["dataType"]
         scale = 1. / normalizationFactor
         shift = {}
-        shift['+'] = int(shiftValue)
-        shift['-'] = -1 * int(shiftValue)
+        shift["+"] = int(shiftValue)
+        shift["-"] = -1 * int(shiftValue)
 
         if cstop > 0:
             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
index 12077c1ec7d9d7e3ba499c67c99734a8a9a54157..c59dc9249b6c82ded0466f872e98a9900e702c46 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 print "altSpliceCounts: version 3.7"
 
@@ -56,13 +56,13 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
     stopDict = {}
     resultDict = {}
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     readlen = hitRDS.getReadSize()
     hitDict = hitRDS.getSplicesDict(noSense=True)
-    outfile = open(outfilename,'w')
+    outfile = open(outfilename, "w")
 
     for chrom in hitDict:
         startDict[chrom] = []
@@ -155,7 +155,7 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
 
         resultDict[chrom].sort()
         for line in resultDict[chrom]:
-            outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n'  % line)
+            outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n"  % line)
             alternative += 1
 
         print chrom, maxIndex, spliceEvent, altSpliceEvent
index 3bcd554e5406a09f31ecc988e8c60733616d0dd5..a6a344d549b907b98343d731b3a348dd9802d0ec 100755 (executable)
@@ -21,14 +21,16 @@ def main(argv=None):
 def bedToRegion(factor, infilename, outfilename):
     index = 1
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
     for line in infile:
-        if 'track' in line:
+        if "track" in line:
             continue
+
         fields = line.split()
-        line = string.join(fields, '\t')
-        outfile.write('%s%d\t%s\n' % (factor, index, line))
+        line = string.join(fields, "\t")
+        outfile.write("%s%d\t%s\n" % (factor, index, line))
         index += 1
+
     infile.close()
     outfile.close()
 
index 63aa9558b8848183d4528bb16149cd0ced38aadd..27b48e22e8ecde5b4fd9c817ff17d4ab75dcbd13 100755 (executable)
@@ -1,4 +1,5 @@
 import sys
+import string
 
 print "binstocdf: version 1.1"
 
@@ -6,19 +7,19 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s infile outfile' % sys.argv[0]
+    if len(argv) < 3:
+        print "usage: python %s infile outfile" % sys.argv[0]
         sys.exit(1)
 
-    infilename = argv[0]
-    outfilename = argv[1]
+    infilename = argv[1]
+    outfilename = argv[2]
 
     binToCDF(infilename, outfilename)
 
 
 def binToCDF(infilename, outfilename):
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
 
     for line in infile:
         fields = line.strip().split()
@@ -30,14 +31,15 @@ def binToCDF(infilename, outfilename):
             outfile.write(line)
             continue
 
-        outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3]))
-        cum = 0
+        outputFields = fields[:4]
+        runningTotal = 0
         for bin in fields[4:]:
-            cum += int(bin)
-            percent = 100 * cum / total
-            outfile.write('\t%d' % percent)
+            runningTotal += int(bin)
+            percent = 100 * runningTotal / total
+            outputFields.append("%d" % percent)
 
-        outfile.write('\n')
+        outputLine = string.join(outputFields, "\t")
+        outfile.write("%s\n" % outputLine)
 
     infile.close()
     outfile.close()
index c7b6dd0c801025552c365003cf9067b2550c6b0c..a49120cf574f9c0213bd569edfe8507f923435a2 100755 (executable)
@@ -78,7 +78,7 @@ def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
     if header.strip() == "":
         header = "#\t"
 
-    outfile.write( "%s\t%s\n" % (header, colID))
+    outfile.write("%s\t%s\n" % (header, colID))
 
     values = []
     min = 20000000000.
index 71662442afff0bdeb68fb5604307f72b8cedda91..52963b7445a1e28c8f251c94a6220f5fe1dc92f6 100755 (executable)
@@ -8,11 +8,15 @@ def main(argv=None):
         print "usage: python %s bins percent infile" % sys.argv[0]
         sys.exit(1)
 
-    bins = int(argv[0])
-    percent = int(argv[1])
-    infilename = argv[2]
+    bins = int(argv[1])
+    percent = int(argv[2])
+    infilename = argv[3]
 
-    cdfDist(bins, percent, infilename)
+    printCdfDist(bins, percent, infilename)
+
+
+def printCdfDist(bins, percent, infilename):
+    print cdfDist(bins, percent, infilename)
 
 
 def cdfDist(bins, percent, infilename):
@@ -30,7 +34,7 @@ def cdfDist(bins, percent, infilename):
             index += 1
 
     infile.close()
-    print binsList
+    return binsList
 
 
 if __name__ == "__main__":
index f6d1ff968c5260e35e876fb79a82ac201b088585..643aaeaa82b8bc0c444e33ae909d4f5859e2b1d7 100755 (executable)
--- a/colsum.py
+++ b/colsum.py
@@ -28,7 +28,7 @@ def colsum(fieldID, filename):
                 count += float(fields[fieldID])
             else:
                 count += int(fields[fieldID])
-        except ValueError:
+        except (ValueError, IndexError):
             pass
 
     infile.close()
index ead4e1b81d0b37335fbf00908f1aea615cec33dd..77f30d6fc2acb2a9424634a138a03ee30e314f91 100755 (executable)
@@ -51,23 +51,8 @@ def makeParser(usage=""):
 
 def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
 
-    firstDict = {}
-    firstfile = open(firstfileName)
-    for line in firstfile:
-        fields = line.strip().split()
-        firstDict[fields[1]] = fields[-1]
-
-    firstfile.close()
-
-    expandedDict = {}
-    gidDict = {}
-    expandedfile = open(expandedfileName)
-    for line in expandedfile:
-        fields = line.strip().split()
-        expandedDict[fields[1]] = fields[-1]
-        gidDict[fields[1]] = fields[0]
-
-    expandedfile.close()
+    firstDict = getRPKMDict(firstfileName)
+    gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
@@ -97,5 +82,23 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.close()
 
 
+def getRPKMDict(rpkmFileName, getGIDDict=False):
+    gidDict = {}
+    rpkmDict = {}
+    rpkmFile = open(rpkmFileName)
+    for line in rpkmFile:
+        fields = line.strip().split()
+        rpkmDict[fields[1]] = fields[-1]
+        if getGIDDict:
+            gidDict[fields[1]] = fields[0]
+
+    rpkmFile.close()
+
+    if getGIDDict:
+        return gidDict, rpkmDict
+    else:
+        return rpkmDict
+
+
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4c826b04ed6c668a3d5b2a66ee4b842789a483a0..2878423a9716a0e4f31d0a1738a0770339c8ed6c 100755 (executable)
@@ -10,7 +10,9 @@ except:
     pass
 
 import sys
+import optparse
 import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
 
 print "combinerds: version 1.2"
 
@@ -19,97 +21,107 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0]
-        #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n'
+    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+    parser = makeParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
         sys.exit(1)
 
-    doCache = False
-    cachePages = -1
-    if '--cache' in argv:
-        doCache = True
-        try:
-            cachePages =  int(argv[sys.argv.index('-cache') + 1])
-        except: 
-            pass
-
-    datafile = argv[1]
-    infileList = []
-    for index in range(2, len(argv)):
-        if argv[index][0] == '-':
-            break
-        infileList.append(sys.argv[index])
+    datafile = args[0]
+    infileList = args[1:]
 
-    print "destination RDS: %s" % datafile
+    combinerds(datafile, infileList, options.tableList, options.withFlag, options.doIndex, options.cachePages, options.doInit, options.initRNA)
 
-    if '--initrna' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA')
-    elif '--init' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True)
 
-    withFlag = ''
-    if '--flag' in argv:
-        withFlag = argv[sys.argv.index('-flag') + 1]
-        print "restrict to flag = %s" % withFlag
+def makeParser(usage=""):
+    # usage text is supplied by the caller (main)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--table", action="append", dest="tableList")
+    parser.add_option("--init", action="store_true", dest="doInit")
+    parser.add_option("--initrna", action="store_true", dest="initRNA")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--flag", dest="withFlag")
+
+    configParser = getConfigParser()
+    section = "combinerds"
+    doInit = getConfigBoolOption(configParser, section, "doInit", False)
+    initRNA = getConfigBoolOption(configParser, section, "initRNA", False)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+
+    parser.set_defaults(tableList=[], doInit=doInit, initRNA=initRNA, doIndex=doIndex, cachePages=cachePages,
+                        withFlag=withFlag)
+
+    return parser
+
 
-    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache)
+def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, cachePages=None, doInit=False, initRNA=False):
 
+    print "destination RDS: %s" % datafile
+    datasetType="DNA"
+    if initRNA:
+        doInit = True
+        datasetType="RNA"
+
+    doCache = False
+    if cachePages is not None:
+        doCache = True
+    else:
+        cachePages = -1
+
+    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache, initialize=doInit, datasetType=datasetType)
     if cachePages > rds.getDefaultCacheSize():
         rds.setDBcache(cachePages)
-        cacheVal = cachePages
     else:
-        cacheVal = rds.getDefaultCacheSize()
-
-    doIndex = False
-    if '--index' in argv:
-        doIndex = True
+        cachePages = rds.getDefaultCacheSize()
 
-    tableList = []
-    if '--table' in argv:
-        tableList.append(argv[argv.index('-table') + 1])
-    else:
+    if tableList == []:
         tableList = rds.getTables()
 
-    combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache)
-
+    if withFlag != "":
+        print "restrict to flag = %s" % withFlag
 
-def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False):
     metaDict = rds.getMetadata()
     if "numberImports" not in metaDict:
         origIndex = 0
-        rds.insertMetadata([("numberImports", str(0))])
+        rds.insertMetadata([("numberImports", "0")])
     else:
         origIndex = int(metaDict["numberImports"])
 
     index = origIndex
     for inputfile in infileList:
-        asname = "input" + str(index)
-        rds.attachDB(inputfile,asname)
+        dbName = "input%s" % str(index)
+        rds.attachDB(inputfile, dbName)
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
-            ascols = "*"
+            dbColumns = "*"
             if table == "uniqs":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "multi":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "splices":
-                ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname
-            elif table == "metadata":
-                ascols = "name, value || ' (import_%d)'" % index
-                rds.importFromDB(asname, table, ascols)
+                dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
 
-            if table != "metadata":
-                rds.importFromDB(asname, table, ascols, withFlag)
+            if table == "metadata":
+                dbColumns = "name, value || ' (import_%d)'" % index
+                rds.importFromDB(dbName, table, dbColumns)
+            else:
+                rds.importFromDB(dbName, table, dbColumns, withFlag)
 
-        rds.detachDB(asname)
-        rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))])
+        rds.detachDB(dbName)
+        rds.insertMetadata([("import_%s" % str(index), "%s %s" % (inputfile, str(tableList)))])
         index += 1
 
     rds.updateMetadata("numberImports", index, origIndex)
     if doIndex:
         print "building index...."
-        if cacheVal > 0:
-            rds.buildIndex(cacheVal)
+        if cachePages > 0:
+            rds.buildIndex(cachePages)
         else:
             rds.buildIndex()
 
index 821936a30bc2524f0aa1664de1a66ecb8ae47fd6..f526daacca0dc91245e71205363216bc17eba2e9 100755 (executable)
@@ -17,6 +17,9 @@ import Region
 commoncodeVersion = 5.6
 currentRDSversion = 2.0
 
+class ErangeError(Exception):
+    pass
+
 
 def getReverseComplement(base):
     revComp = {"A": "T",
@@ -65,12 +68,12 @@ def getGeneAnnotDict(genome, inRAM=False):
     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 
 
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
-    hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+    genome = Genome(genomeName, inRAM=inRAM)
     if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
+        genome.extendFeatures(extendGenome, replace=replaceModels)
 
-    geneannotDict = hg.allAnnotInfo()
+    geneannotDict = genome.allAnnotInfo()
 
     return geneannotDict
 
@@ -189,11 +192,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     mergeCount = 0
     chromField = int(chromField)
     count = 0
-    #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
-    #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
-    #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
-    #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
-    #      be merged with A but that will exit the loop and the merge with C will be missed.
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -336,7 +334,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     """
 
     if shift == "auto":
-        shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+        shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 
     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 
@@ -346,7 +344,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
     topPos = getPeakPositionList(smoothArray, length)
-    peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+    peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 
     if leftPlus:
         numLeftPlus = 0
@@ -366,12 +364,12 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     return peak
 
 
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
     bestShift = 0
     lowestScore = 20000000000
     for testShift in xrange(maxShift + 1):
         shiftArray = array("f", [0.] * length)
-        for read in hitList:
+        for read in readList:
             currentpos = read["start"] - start
             if read["sense"] == "+":
                 currentpos += testShift
@@ -381,7 +379,7 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
-            if doWeight:
+            if useWeight:
                 weight = read["weight"]
             else:
                 weight = 1.0
@@ -432,7 +430,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left
             hitIndex += 1
             currentpos += 1
 
-        while hitIndex < readlen and  currentpos < length:
+        while hitIndex < readlen and currentpos < length:
             seqArray[currentpos] += weight
             hitIndex += 1
             currentpos += 1
@@ -450,7 +448,7 @@ def getPeakPositionList(smoothArray, length):
         if topNucleotide < smoothArray[currentpos]:
             topNucleotide = smoothArray[currentpos]
             peakList = [currentpos]
-        elif topNucleotide  == smoothArray[currentpos]:
+        elif topNucleotide == smoothArray[currentpos]:
             peakList.append(currentpos)
 
     return peakList
@@ -562,7 +560,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         return featuresByChromDict
 
 
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
     """ return a dictionary of gene loci. Can be used to retrieve additional
@@ -594,27 +592,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
         return locusByChromDict
 
-    genome = genomeObject.genome
-    featuresDict = genomeObject.getallGeneFeatures()
-    if len(additionalRegionsDict) > 0:
-        sortList = []
-        for chrom in additionalRegionsDict:
-            for region in additionalRegionsDict[chrom]:
-                label = region.label
-                if label not in sortList:
-                    sortList.append(label)
-
-                if label not in featuresDict:
-                    featuresDict[label] = []
-                    sense = "+"
-                else:
-                    sense = featuresDict[label][0][-1]
-
-                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
-        for gid in sortList:
-            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
-
+    genomeName = genome.genome
+    featuresDict = getGeneFeatures(genome, additionalRegionsDict)
     for gid in featuresDict:
         featureList = featuresDict[gid]
         newFeatureList = []
@@ -647,34 +626,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstop = newFeatureList[0][0] + lengthCDS
+            if 0 < lengthCDS < glen:
+                gstop = newFeatureList[0][0] + lengthCDS
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstart = newFeatureList[-1][1] + lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstart = newFeatureList[-1][1] + lengthCDS
         else:
             if not useCDS and upstream > 0:
                 if upstreamSpanTSS:
@@ -692,36 +665,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstart = newFeatureList[-1][-1] - lengthCDS
+            if 0 < lengthCDS < glen:
+                gstart = newFeatureList[-1][-1] - lengthCDS
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstop = newFeatureList[0][0] - lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstop = newFeatureList[0][0] - lengthCDS
 
-        glen = abs(gstop - gstart)
         if chrom not in locusByChromDict:
             locusByChromDict[chrom] = []
 
@@ -736,6 +702,30 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
     return locusByChromDict
 
 
+def getGeneFeatures(genome, additionalRegionsDict):
+    featuresDict = genome.getallGeneFeatures()
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    return featuresDict
+
+
 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                       binLength=-1):
@@ -798,7 +788,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                 rlen = regionTuple[lengthField]
                 try:
                     rsense = regionTuple[senseField]
-                except:
+                except IndexError:
                     rsense = "F"
 
                 if tagStart > stop:
index 5bc25321b25b8a30e9b09511cabbee4355010b71..be403109110c3fb40f7bba831f6c2ea48e496536 100755 (executable)
@@ -85,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
 
     print time.ctime()
 
-    if doSplices:
-        print "getting splices"
-        splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
-        print "got splices"
-
     print "getting uniq reads"    
     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
     print "got uniqs"
 
     if doSplices:
-        for readID in splicesDict:
-            theRead = splicesDict[readID]
-            read0 = theRead[0]
-            del read0[1]
-            try:
-                uniqDict[readID].append(read0)
-            except:
-                if len(theRead) == 4:
-                    read2 = theRead[2]
-                    del read2[1]
-                    uniqDict[readID] = [read0,read2]
+        addSplicesToUniqReads(RDS, uniqDict)
 
     if doVerbose:
         print len(uniqDict), time.ctime()
 
     outfile = open(outfilename,"w")
-
     diffChrom = 0
     distal = 0
     total = 0
@@ -132,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
                     continue
                 else:
                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print diffChrom, outline
             else:
                 dist = abs(start1 - start2)
-
                 if minDist < dist < maxDist:
                     distal += 1
                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print distal, outline
 
@@ -157,5 +140,22 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
     print time.ctime()
 
 
+def addSplicesToUniqReads(RDS, uniqDict):
+    print "getting splices"
+    splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+    print "got splices"
+    for readID in splicesDict:
+        theRead = splicesDict[readID]
+        read0 = theRead[0]
+        del read0[1]
+        try:
+            uniqDict[readID].append(read0)
+        except:
+            if len(theRead) == 4:
+                read2 = theRead[2]
+                del read2[1]
+                uniqDict[readID] = [read0,read2]
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 00cc91844b23084d712573261e996524739c9870..712e60eff9815f9728b6a03983385f83d96bc6f5 100644 (file)
@@ -14,8 +14,9 @@ except:
 import sys
 import time
 import optparse
+import string
 import ReadDataset
-from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
 
 print "farPairs: version 1.4"
 
@@ -38,14 +39,13 @@ def main(argv=None):
     outfilename = args[1]
     outbedname = args[2]
 
-    farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+    farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
              options.cachePages, options.minDist, options.maxDist, options.minCount,
              options.label)
 
 
 def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--verbose", action="store_true", dest="doVerbose")
     parser.add_option("--minDist", type="int", dest="minDist")
@@ -55,7 +55,6 @@ def getParser(usage):
 
     configParser = getConfigParser
     section = "farPairs"
-    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
@@ -63,15 +62,33 @@ def getParser(usage):
     minCount = getConfigIntOption(configParser, section, "minCount", 2)
     label = getConfigOption(configParser, section, "label", None)
 
-    parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+    parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
 
     return parser
 
 
-def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
 
+    flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
+
+    outfile = open(outfilename, "w")
+    for region in flagDict:
+        regionConnections = countDuplicatesInList(flagDict[region])
+        for (connectedRegion, count) in regionConnections:
+            if count >= minCount:
+                outline = "%s\t%s\t%d" % (region, connectedRegion, count)
+                print >> outfile, outline
+                if doVerbose:
+                    print outline
+
+    outfile.close()
+    if doVerbose:
+        print "finished: ", time.ctime()
+
+
+def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
     doCache = False
     if cachePages is not None:
         doCache = True
@@ -86,11 +103,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
 
     if doVerbose:
         print time.ctime()
-
-    total = 0
-    outfile = open(outfilename, "w")
+    
     outbed = open(outbedname, "w")
     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+    outbed.close()
 
     readlen = RDS.getReadSize()
     flagDict = {}
@@ -98,84 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
         if doNotProcessChromosome(chromosome):
             continue
 
-        print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
-        if doVerbose:
-            print len(uniqDict), time.ctime()
-
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                start1 = readList[0]["start"]
-                flag1 = readList[0]["flag"]
-                start2 = readList[1]["start"]
-                flag2 = readList[1]["flag"]
-
-                if flag1 != flag2:
-                    dist = abs(start1 - start2)
-                    startList = [start1, start2]
-                    stopList = [start1 + readlen, start2 + readlen]
-                    startList.sort()
-                    stopList.sort()
-                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
-                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
-                        outbed.write(outputLine)
-                        if doVerbose:
-                            print flag1, flag2, dist
-
-                        try:
-                            flagDict[flag1].append((flag2, start1, start2))
-                        except KeyError:
-                            flagDict[flag1] = [(flag2, start1, start2)]
-
-                        try:
-                            flagDict[flag2].append((flag1, start1, start2))
-                        except KeyError:
-                            flagDict[flag2] = [(flag2, start1, start2)]
+        writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
 
     print "%d connected regions" % len(flagDict)
 
-    for region in flagDict:
-        flagDict[region].sort()
-        regionConnections = {}
-        for (region2, start1, start2) in flagDict[region]:
+    return flagDict
+
+
+def doNotProcessChromosome(chrom):
+    return chrom == "chrM"
+
+
+def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
+    outbed = open(outbedname, "a")
+    print chromosome
+    uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+    if doVerbose:
+        print len(uniqDict), time.ctime()
+
+    for readID in uniqDict:
+        readList = uniqDict[readID]
+        if readsAreFarPair(readList, minDist, maxDist):
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            startList = [start1, start2]
+            startList.sort()
+            outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
+            outbed.write(outputLine)
+            flag1 = readList[0]["flag"]
+            flag2 = readList[1]["flag"]
+            if doVerbose:
+                print flag1, flag2, abs(start1 - start2)
+
             try:
-                regionConnections[region2] += 1
+                flagDict[flag1].append(flag2)
             except KeyError:
-                regionConnections[region2] = 1
+                flagDict[flag1] = [flag2]
 
-        for region2 in regionConnections:
-            if regionConnections[region2] >= minCount:
-                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
-                if doVerbose:
-                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+            try:
+                flagDict[flag2].append(flag1)
+            except KeyError:
+                flagDict[flag2] = [flag1]
 
-    outfile.close()
     outbed.close()
-    if doVerbose:
-        print "finished: ", time.ctime()
 
 
-def doNotProcessChromosome(chrom):
-    return chrom == "chrM"
+def readsAreFarPair(readList, minDist, maxDist):
+    isFarPair = False
+    if len(readList) == 2:
+        flag1 = readList[0]["flag"]
+        flag2 = readList[1]["flag"]
+        if flag1 != flag2 and flag1 != "" and flag2 != "":
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            dist = abs(start1 - start2)
+            if minDist < dist < maxDist:
+                isFarPair = True
 
+    return isFarPair
 
-def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
-    readSizes = "%d" % (stopList[0] - startList[0])
-    readCoords = "0"
+
+def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
+    sizes = ["%d" % readlen]
+    coords = ["0"]
     leftStart = startList[0] - 1
-    rightStop = stopList[-1]
+    rightStop = startList[-1]
     for index in range(1, numPieces):
-        readSizes += ",%d" % (stopList[index] - startList[index] + 1)
-        readCoords += ",%d" % (startList[index] - startList[0])
+        sizes.append("%d" % (readlen + 1))
+        coords.append("%d" % (startList[index] - startList[0]))
 
     if rsense == "+":
-        senseCode = plusSense
+        senseCode = plusSenseColor
     else:
-        senseCode = minusSense
+        senseCode = minusSenseColor
 
+    readSizes = string.join(sizes, ",")
+    readCoords = string.join(coords, ",")
     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+
     return outline
 
 
index cd357cc072c5cc7aa4460371439ef59c428873f5..c24b79edb312fae6c0b19f0d7e3d3ffb45d12984 100755 (executable)
@@ -38,7 +38,7 @@ def main(argv=None):
 
 def makeParser(usage=""):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cistype", action="store_false", dest="cistype")
+    parser.add_option("--cistype", dest="cistype")
     parser.add_option("--radius", type="int", dest="radius")
 
     configParser = getConfigParser()
@@ -52,10 +52,19 @@ def makeParser(usage=""):
 
 
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
-    tabfile = open(tabFileName)
-    previous = ""
+    
+    posList = getPositionList(tabFileName)
+    feats = featuresIntersecting("human", posList, radius, cistype)
+    featkeys = feats.keys()
+    featkeys.sort()
+    for (chrom, pos) in featkeys:
+        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+
 
+def getPositionList(tabFileName):
+    previous = ""
     posList = []
+    tabfile = open(tabFileName)
     for line in tabfile:
         fields = line.split("\t")
         current = fields[0]
@@ -66,11 +75,7 @@ def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
         chrom = fields[1][3:]
         posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
 
-    feats = featuresIntersecting("human", posList, radius, cistype)
-    featkeys = feats.keys()
-    featkeys.sort()
-    for (chrom, pos) in featkeys:
-        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+    return posList
 
 
 if __name__ == "__main__":
index d20608f1997e2e6a0fd84d9169c78025485e4b57..ec75efb54f3680c892578a38811c1bd241c37241 100755 (executable)
@@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
-
-    if not doTrim:
-        trimString = "none"
+    doControl = False
+    if mockfile is not None:
+        doControl = True
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
-    doControl = False
-    if mockfile is not None:
-        doControl = True
-
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
@@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     else:
         pValueType = "self"
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        doCache = False
-        cachePages = -1
-
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
@@ -242,6 +225,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -258,13 +247,22 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
+    if trimValue is not None:
+        trimValue = float(trimValue) / 100.
+        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+    else:
+        trimValue = 0.1
+        trimString = "10%"
+
+    if not doTrim:
+        trimString = "none"
+
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
-
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
@@ -464,7 +462,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
@@ -535,7 +533,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
@@ -558,7 +556,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
-            if normalize:
+            if normalizeToRPM:
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
@@ -567,7 +565,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
                 #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -577,8 +575,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
-                    numLeft = peak.numLeft
-                    if normalize:
+                    numLeft = peak.numLeftPlus
+                    if normalizeToRPM:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
@@ -605,11 +603,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeft
-                        if normalize:
+                        numLeft = trimPeak.numLeftPlus
+                        if normalizeToRPM:
                             sumAll /= rdsSampleSize
 
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
@@ -619,16 +617,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         except:
                             continue
 
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             sumMulti /= rdsSampleSize
 
                         try:
@@ -759,4 +755,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift,
     return footer
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4ce97e4fc1380517447153e59fba80e5531e3029..91db4a1a44870602383a072d7a4585568026f165 100755 (executable)
@@ -14,7 +14,7 @@ import sys
 import optparse
 import ReadDataset
 from cistematic.genomes import Genome
-from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
 
 print "geneDownstreamBins: version 2.1"
 
@@ -53,8 +53,6 @@ def makeParser(usage=""):
 
 
 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
-    bins = 10
-    standardMinThresh = standardMinDist / bins
 
     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     normalizationFactor = 1.0
@@ -66,95 +64,114 @@ def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCac
     geneinfoDict = getGeneInfoDict(genome, cache=True)
     hg = Genome(genome)
     featuresDict = hg.getallGeneFeatures()
-
     outfile = open(outfilename, "w")
-
     gidList = hg.allGIDs()
     gidList.sort()
     for gid in gidList:
-        symbol = "LOC" + gid
-        geneinfo = ""
-        featureList = []
         try:
-            geneinfo = geneinfoDict[gid]
-            featureList = featuresDict[gid]
-            symbol = geneinfo[0][0]
-        except:
+            featuresList = featuresDict[gid]
+        except KeyError:
             print gid
 
-        if len(featureList) == 0:
+        try:
+            binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
+        except ErangeError:
             continue
 
-        newfeatureList = []
-        for (ftype, chrom, start, stop, fsense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
+        tagCount *= normalizationFactor
+        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
+        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
+        for binAmount in binList:
+            outfile.write("\t%.2f" % binAmount)
 
-        if chrom not in hitDict:
-            continue
+        outfile.write("\n")
 
-        newfeatureList.sort()
-        if len(newfeatureList) < 1:
-            continue
+    outfile.close()
 
-        glen = standardMinDist
-        if fsense == "F":
-            nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
 
-            if glen < 1:
-                glen = 1
+def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
 
-            gstart = newfeatureList[-1][1]
-        else:
-            nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
+    symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
+    geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
+    if geneLength < standardMinDist:
+        raise ErangeError("gene length less than minimum")
 
-            if glen < 1:
-                glen = 1
+    binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
+    if tagCount < 2:
+        raise ErangeError("tag count less than minimum")
 
-            gstart = newfeatureList[0][0] - glen
-            if gstart < 0:
-                gstart = 0
+    return binList, symbol, geneLength, tagCount
 
-        tagCount = 0
-        if glen < standardMinDist:
-            continue
 
-        binList = [0.] * bins
-        for read in hitDict[chrom]:
-            tagStart = read["start"]
-            weight = read["weight"]
-            tagStart -= gstart
-            if tagStart >= glen:
-                break
-
-            if tagStart > 0:
-                tagCount += weight
-                if fsense == "F":
-                    # we are relying on python's integer division quirk
-                    binID = tagStart / standardMinThresh 
-                    binList[binID] += weight
-                else:
-                    rdist = glen - tagStart
-                    binID = rdist / standardMinThresh 
-                    binList[binID] += weight
-
-        if tagCount < 2:
-            continue
+def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
+    if len(featureList) == 0:
+        raise ErangeError("no features found")
 
-        tagCount *= normalizationFactor
-        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
-        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
-        for binAmount in binList:
-            outfile.write("\t%.2f" % binAmount)
+    symbol = "LOC%s" % gid
+    geneinfo = ""
+    try:
+        geneinfo = geneinfoDict[gid]
+        symbol = geneinfo[0][0]
+    except KeyError:
+        print gid
 
-        outfile.write("\n")
+    newfeatureList = []
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+
+    if len(newfeatureList) < 1:
+        raise ErangeError("no features found")
+
+    if chrom not in chromosomeList:
+        raise ErangeError("chromosome not found in reads")
+
+    newfeatureList.sort()
+
+    return symbol, newfeatureList, sense, chrom
 
-    outfile.close()
 
+def getGeneStats(genome, gid, minDistance, featureList, sense):
+    geneLength = minDistance
+    if sense == "F":
+        nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = featureList[-1][1]
+    else:
+        nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = max(featureList[0][0] - geneLength, 0)
+
+    if nextGene < geneLength * 2:
+        geneLength = nextGene / 2
+
+    geneLength = max(geneLength, 1)
+
+    return geneStart, geneLength
+
+
+def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
+    tagCount = 0
+    bins = 10
+    standardMinThresh = standardMinDist / bins
+    binList = [0.] * bins
+    for read in readList:
+        tagStart = read["start"]
+        if tagStart >= geneLength:
+            break
+
+        tagStart -= geneStart
+        weight = read["weight"]
+        if tagStart > 0:
+            tagCount += weight
+            if sense == "F":
+                # we are relying on python's integer division quirk
+                binID = tagStart / standardMinThresh 
+            else:
+                rdist = geneLength - tagStart
+                binID = rdist / standardMinThresh
+
+            binList[binID] += weight
+
+    return binList, tagCount
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 6a1efc48e270d92a8453e8fcb6907db00185ed92..49745dc0c1361c2a8dbe4736c9c8630396fd93e4 100755 (executable)
@@ -12,6 +12,7 @@ except:
 
 import sys
 import optparse
+import string
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
@@ -92,68 +93,78 @@ def getParser(usage):
 
     return parser
 
+
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
 
-    outfile = open(outfilename,'w')
 
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except KeyError:
                 pass
         else:
             symbol = gid
+
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
         for binAmount in gidBins[gid]:
             if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
     outfile.close()
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 38e853aec6172be26b448066692193f1537b6926..5299d27d26c23f8729063eb31f0da390730b5ce5 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 import sys
 import optparse
@@ -68,12 +68,6 @@ def makeParser(usage=""):
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
@@ -206,7 +200,6 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
-        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -271,4 +264,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4091281c8fe0f5410090b23db7276496fe49d6ed..e95d72693c76df13871da4cf809c99e43c41d36a 100755 (executable)
@@ -68,7 +68,6 @@ def doNotProcessLine(line):
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
-    #TODO: refactor naming. is fields[8] rpkm?
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
@@ -93,4 +92,4 @@ def getNovelSNPInfo(genome, snpEntry, hg):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 37576ca867fb2283f9a5d497c0bbca7d06191f7e..17520401c52b5d153404aa4312a800017e9af3bb 100755 (executable)
@@ -20,6 +20,9 @@ import ReadDataset
 verstring = "makerdsfromblat: version 3.10"
 print verstring
 
+NUM_HEADER_LINES = 5
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -98,24 +101,15 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
                     verbose=False, cachePages=100000, geneDataFileName="",
                     propertyList=[]):
 
-    delimiter = "|"
-    minIntron = 10
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
+    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
     if forceRNA:
         print "forcing datatype to RNA"
         dataType = "RNA"
 
-    if dataType == "RNA":
-        genedatafile = open(geneDataFileName)
-
-    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
-
     geneDict = {}
     mapDict = {}
     if dataType == "RNA" and not forceRNA:
+        genedatafile = open(geneDataFileName)
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -164,9 +158,10 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
 
     # make some assumptions based on first read
     infile = open(filename, "r")
-    for arg in range(6):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
+    line = infile.readline()
     fields = line.split()
     readsize = int(fields[10])
     pairedTest = fields[9][-2:]
@@ -186,8 +181,9 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
         rds.insertMetadata([("blat_mapped", "True")])
 
     minReadScore = readsize - readsize/25 - 1
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -199,9 +195,12 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
     index = uIndex = mIndex = sIndex = lIndex = 0
     bestScore = 0
     # skip headers
-    for arg in range(5):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
+    insertSize = 100000
+    delimiter = "|"
+    minIntron = 10
     for line in infile:
         lIndex += 1
         fields = line.strip().split()
index 14be260cdf0721b0514098a22d3a07423756c41b..eadb66b83d59e140b0402a9ea76dd351219abad0 100755 (executable)
@@ -87,29 +87,13 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
                       propertyList=[]):
 
-    delimiter = "|"
+    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
 
+    geneDict = {}
     dataType = "DNA"
     if genedatafilename is not None:
         dataType = "RNA"
         genedatafile = open(genedatafilename)
-
-
-    forcePair = False
-    if forceID is not None:
-        forcePair = True
-    else:
-        forceID = 0
-
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
-    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
-
-    geneDict = {}
-    mapDict = {}
-    if dataType == "RNA":
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -130,7 +114,6 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                 totalLength += exonLengths[index]
 
             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
 
         genedatafile.close()
 
@@ -164,12 +147,17 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     fields = line.split()
     readsize = len(fields[5])
     pairedTest = fields[0][-2:]
+    forcePair = False
+    if forceID is not None:
+        forcePair = True
+    else:
+        forceID = 0
+
     paired = False
     if pairedTest in ["/1", "/2"] or forcePair:
         print "assuming reads are paired"
         paired = True
 
-
     print "read size: %d bp" % readsize
     if init:
         rds.insertMetadata([("readsize", readsize)])
@@ -184,8 +172,9 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
 
     infile.close()
 
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -195,6 +184,8 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     mInsertList = []
     sInsertList = []
     index = uIndex = mIndex = sIndex = lIndex = 0
+    delimiter = "|"
+    insertSize = 100000
     for line in infile:
         lIndex += 1
         if stripSpace:
index 66209eef973f36072ba964aac73f8579685aed99..b11848d7a93e3968d7b76b286fa415d04ffa078d 100755 (executable)
@@ -106,8 +106,6 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     insertSize = 100000
 
     geneDict = {}
-    mapDict = {}
-    seenSpliceList = []
     if dataType == 'RNA':
         genedatafile = open(geneDataFileName)
         for line in genedatafile:
@@ -120,17 +118,11 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
             chrom = fields[1]
             sense = fields[2]
             chromstarts = fields[8][:-1].split(',')
-            chromstops = fields[9][:-1].split(',')
-            exonLengths = []
-            totalLength = 0
             for index in range(blockCount):
                 chromstarts[index] = int(chromstarts[index])
-                chromstops[index] = int(chromstops[index])
-                exonLengths.append(chromstops[index] - chromstarts[index])
-                totalLength += exonLengths[index]
 
-            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
+            geneDict[uname] = (sense, blockCount, chrom, chromstarts)
+
         genedatafile.close()
 
     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
@@ -322,6 +314,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     print '%d unique reads' % index
     infile.close()
 
+    seenSpliceList = []
     if dataType == 'RNA':
         print 'mapping splices...'
         index = 0
@@ -441,7 +434,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
                     print fields
                     continue
 
-                (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+                (sense, blockCount, chrom, chromstarts) = geneDict[model]
                 if extended:
                     if 'F' in thepos:
                         rsense = '+'
index bfd6e1c48b3d4a449ea03d8f0191f0abd63e9d07..2df39a9412ab6794a1c437bead4b476c482e3918 100755 (executable)
@@ -8,6 +8,14 @@ import sys
 from cistematic.genomes import Genome
 
 
+class GeneSymbolAndCount():
+
+    def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
+        self.symbol = symbol
+        self.uniqueCount = uniqueCount
+        self.spliceCount = spliceCount
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -31,53 +39,65 @@ def main(argv=None):
 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
     hg = Genome(genome)
 
-    gidDict = {}
-    gidList = []
-    uniqueCountDict = {}
-    spliceCountDict = {}
+    gidDict = getGeneData(uniquefilecount, splicefilecount)
+    gidList = gidDict.keys()
+    gidList.sort()
+
+    outfile = open(outfilename, "w")
+    for gid in gidList:
+        featureList = hg.getGeneFeatures((genome, gid))
+        featuresizesum, splicearea = getStats(featureList, splicelead)
+        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+        geneData = gidDict[gid]
+        uniqueCount = geneData.uniqueCount
+        expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
+
+        # this p-value is based on the observed unique count, not the expected total count
+        # nor the multi-read adjusted count
+        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
+        symbol = geneData.symbol
+        spliceCount = geneData.spliceCount
+        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
+        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
 
+    outfile.close()
+
+
+def getGeneData(uniquefilecount, splicefilecount):
+    gidDict = {}
     uniquefile = open(uniquefilecount)
     for line in uniquefile:
         fields = line.strip().split()
-        gidDict[fields[0]] = fields[1]
-        gidList.append(fields[0])
-        uniqueCountDict[fields[0]] = int(fields[2])
+        geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
+        gidDict[fields[0]] = geneData
 
+    uniquefile.close()
     splicefile = open(splicefilecount)
     for line in splicefile:
         fields = line.strip().split()
-        spliceCountDict[fields[0]] = int(fields[2])
+        gidDict[fields[0]].spliceCount = int(fields[2])
 
-    outfile = open(outfilename,'w')
+    splicefile.close()
 
-    gidList.sort()
-    for gid in gidList:
-        symbol = gidDict[gid]
-        featureList = hg.getGeneFeatures((genome, gid))
-        newfeatureList = []
-        featuresizesum = 0
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
-                featuresizesum += stop - start + 1
+    return gidDict
 
-        if featuresizesum < 1:
-            featuresizesum = 1
 
-        splicearea = (len(newfeatureList) - 1) * splicelead
-        if splicearea < splicelead:
-            splicearea = 0
+def getStats(featureList, splicelead):
+    newfeatureList = []
+    featuresizesum = 0
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+            featuresizesum += stop - start + 1
 
-        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
-        expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+    if featuresizesum < 1:
+        featuresizesum = 1
 
-        # this p-value is based on the observed unique count, not the expected total count
-        # nor the multi-read adjusted count
-        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
-        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
-        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+    splicearea = (len(newfeatureList) - 1) * splicelead
+    if splicearea < splicelead:
+        splicearea = 0
 
-    outfile.close()
+    return featuresizesum, splicearea
 
 
 if __name__ == "__main__":
index 4826a13bbb31824ffea9b187d383f42b35bbf682..e012bb49436c92f50a1d8722182d0cd72a754897 100755 (executable)
@@ -4,10 +4,29 @@
 #
 
 import sys
+from commoncode import regionsOverlap
 
 print "siteintersects: version 2.1"
 
 
+class Site():
+    def __init__(self, line, doExpanded=False):
+        fields = line.strip().split()
+        if doExpanded:
+            self.chrom = fields[1][3:]
+            self.start = int(fields[2])
+            self.stop = int(fields[3])
+            self.rest = fields[4:]
+        else:
+            (chromosome, pos) = fields[0].split(":")
+            self.chrom = chromosome[3:]
+            (start, stop) = pos.split("-")
+            self.start = int(start)
+            self.stop = int(stop)
+            self.rest = fields[1:]
+
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -33,115 +52,114 @@ def main(argv=None):
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
 
-    siteDict = {}
-    file1Dict = {}
+    siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+    commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
 
-    infile1count = 0
-    infile = open(sitefilename1)
+    if doReject and siteRejectFilename is not None:
+        writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+    print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
+    siteDict = {}
+    rejectSiteDict = {}
+    processedLineCount = 0
+    infile = open(sitefilename)
     infile.readline()
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
-        infile1count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            start = int(start)
-            stop = int(stop)
-            rest = fields[1:]
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
+        rest = site.fieldList
 
         try:
             siteDict[chrom].append((start, stop, rest))
-        except:
+        except KeyError:
             siteDict[chrom] = [(start, stop, rest)]
 
         if doReject:
-            file1Dict[str((chrom, start, stop, rest))] = line
+            rejectSiteDict[str((chrom, start, stop, rest))] = line
 
     infile.close()
+    print "file1: %d" % processedLineCount
 
-    print "file1: %d" % infile1count
+    return siteDict, rejectSiteDict
 
-    infile2count = 0
-    infile = open(sitefilename2)
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
+    processedLineCount = 0
+    infile = open(siteFilename)
     infile.readline()
 
     commonSites = 0
-    unique2List = []
+    if doReject and rejectFilename is not None:
+        rejectfile = open(rejectFilename, "w")
+    else:
+        doReject=False
+
     outfile = open(outfilename, "w")
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
-        infile2count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            rest = str(fields[1:])
-
-        start = int(start)
-        stop = int(stop)
-        mid = start + abs(stop - start)/2
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
         if chrom not in siteDict:
             if doReject:
-                unique2List.append(line)
-                continue
+                rejectfile.write(line)
+
+            continue
 
-        twoNotCommon = True
+        siteNotCommon = True
         for (rstart, rstop, rline) in siteDict[chrom]:
-            rsize = abs(rstart - rstop) /2
-            rmid = rstart + abs(rstop - rstart)/2
-            if abs(mid - rmid) < rsize:
+            if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
-                if twoNotCommon:
-                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
-                    twoNotCommon = False
+                if siteNotCommon:
+                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
+                    siteNotCommon = False
 
                 try:
                     if doReject:
-                        del file1Dict[str((chrom, rstart, rstop, rline))]
-                except:
+                        del rejectSiteDict[str((chrom, rstart, rstop, rline))]
+                except KeyError:
                     pass
 
-        if doReject and twoNotCommon:
-            unique2List.append(line)
+        if doReject and siteNotCommon:
+            rejectfile.write(line)
 
-    outfile.close()
+    if doReject:
+        rejectfile.close()
 
-    print "file2: %d" % infile2count
+    outfile.close()
 
-    if doReject:
-        reject1file = open(reject1filename, "w")
-        reject2file = open(reject2filename, "w")
+    print "file2: %d" % processedLineCount
 
-        for key in file1Dict:
-            reject1file.write(file1Dict[key])
+    return commonSites
 
-        for line in unique2List:
-            reject2file.write(line)
 
-        reject1file.close()
-        reject2file.close()
+def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
+    rejectfile = open(siteRejectFilename, "w")
 
-    print commonSites
+    for key in rejectSiteDict:
+        rejectfile.write(rejectSiteDict[key])
 
+    rejectfile.close()
+    
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
diff --git a/test/testBedToRegion.py b/test/testBedToRegion.py
new file mode 100644 (file)
index 0000000..b5b4c03
--- /dev/null
@@ -0,0 +1,65 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import bedtoregion
+
+
+class TestBedToRegion(unittest.TestCase):
+
+    testBedFile = "erangeTestBedFile"
+    testRegionFile = "erangeTestRegionFile"
+
+
+    def setUp(self):
+        bedfile = open(self.testBedFile, "w")
+        bedfile.write("tab\tdelimited\tfields\n")
+        bedfile.write("space delimited fields\n")
+        bedfile.write("track\n")
+        bedfile.write("line after track will not be processed\n")
+        bedfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testBedFile)
+        except OSError:
+            print "bed file does not exist"
+
+        try:
+            os.remove(self.testRegionFile)
+        except OSError:
+            print "region file does not exist"
+
+
+    def testBedToRegion(self):
+        bedtoregion.bedToRegion("regionLabel", self.testBedFile, self.testRegionFile)
+        resultFile = open(self.testRegionFile)
+        regionList = resultFile.readlines()
+        self.assertEquals(2, len(regionList))
+        self.assertEquals("regionLabel1\ttab\tdelimited\tfields\n", regionList[0])
+        self.assertEquals("regionLabel2\tspace\tdelimited\tfields\n", regionList[1])
+
+
+    def testMain(self):
+        argv = ["bedtoregion"]
+        self.assertRaises(SystemExit, bedtoregion.main, argv)
+        argv = ["bedtoregion", "regionLabel", self.testBedFile, self.testRegionFile]
+        bedtoregion.main(argv)
+        resultFile = open(self.testRegionFile)
+        self.assertEquals(2, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBedToRegion))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testBinsToCdf.py b/test/testBinsToCdf.py
new file mode 100644 (file)
index 0000000..688e688
--- /dev/null
@@ -0,0 +1,66 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import binstocdf
+
+
+class TestBinsToCdf(unittest.TestCase):
+
+    testInputFile = "erangeTestBinFile"
+    testOutputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        binfile = open(self.testInputFile, "w")
+        binfile.write("field1\tfield2\t10\tfield4\t2\t3\n")
+        binfile.write("field1\tfield2\t20\tfield4\t1\t4\n")
+        binfile.write("field1\tfield2\t0\tfield4\n")
+        binfile.write("too short\n")
+        binfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "bin file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        binstocdf.binToCDF(self.testInputFile, self.testOutputFile)
+        resultFile = open(self.testOutputFile)
+        resultList = resultFile.readlines()
+        self.assertEquals(3, len(resultList))
+        self.assertEquals("field1\tfield2\t10\tfield4\t20\t50\n", resultList[0])
+        self.assertEquals("field1\tfield2\t20\tfield4\t5\t25\n", resultList[1])
+        self.assertEquals("field1\tfield2\t0\tfield4\n", resultList[2])
+
+
+    def testMain(self):
+        argv = ["binstocdf"]
+        self.assertRaises(SystemExit, binstocdf.main, argv)
+        argv = ["binstocdf", self.testInputFile, self.testOutputFile]
+        binstocdf.main(argv)
+        resultFile = open(self.testOutputFile)
+        self.assertEquals(3, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBinsToCdf))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testCdfDist.py b/test/testCdfDist.py
new file mode 100644 (file)
index 0000000..9048749
--- /dev/null
@@ -0,0 +1,58 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import cdfdist
+
+
+class TestCdfDist(unittest.TestCase):
+
+    testInputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1 30 60\n")
+        cdffile.write("line2 90 99\n")
+        cdffile.write("line3 5 80\n")
+        cdffile.write("line4 10 14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testCdfDist(self):
+        bins = 2
+        self.assertEquals([2, 2], cdfdist.cdfDist(bins, 10, self.testInputFile))
+        self.assertEquals([1, 2], cdfdist.cdfDist(bins, 50, self.testInputFile))
+        self.assertEquals([1, 0], cdfdist.cdfDist(bins, 89, self.testInputFile))
+        self.assertEquals([0, 1], cdfdist.cdfDist(bins, 91, self.testInputFile))
+
+
+    def testMain(self):
+        bins = 2
+        percent = 50
+        argv = ["cdfdist"]
+        self.assertRaises(SystemExit, cdfdist.main, argv)
+        argv = ["cdfdist", bins, percent, self.testInputFile]
+        cdfdist.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestCdfDist))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index b41fe659421377de86e576a7644abf9e8c5971b6..0fbbb7b2835ad2e7b8c52f38fdb4f116268444bc 100644 (file)
@@ -23,6 +23,8 @@ class TestChksnp(unittest.TestCase):
 
     snpDB = "%s/dbSNP128.db" % dbPath
     altSnpDB = "%s/snp129cDNA.db" % dbPath
+    inputFileName = "testChkSNP_input.txt"
+    outputFileName = "testChkSNP_output.txt"
 
     def setUp(self):
         pass
@@ -33,8 +35,7 @@ class TestChksnp(unittest.TestCase):
 
 
     def testChkSNPFile(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -42,10 +43,8 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName)
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName)
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -53,10 +52,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+        os.remove(self.outputFileName)
 
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB])
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName, snpDBList=[self.altSnpDB])
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -65,13 +64,12 @@ class TestChksnp(unittest.TestCase):
         self.assertEquals(result, line)
         outfile.close()
 
-        os.remove(inputFileName)
-        os.remove(outputFileName)
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
 
 
     def testMain(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -79,11 +77,9 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        argv = ["chksnp", self.snpDB, inputFileName, outputFileName]
+        argv = ["chksnp", self.snpDB, self.inputFileName, self.outputFileName]
         chksnp.main(argv)
-        outfile = open(outputFileName, "r")
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -91,7 +87,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
+
 
     def testChkSNP(self):
         snpPropertiesList = []
diff --git a/test/testColsum.py b/test/testColsum.py
new file mode 100644 (file)
index 0000000..5c149e7
--- /dev/null
@@ -0,0 +1,56 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import colsum
+
+
+class TestColsum(unittest.TestCase):
+
+    testInputFile = "erangeTestColsumFile"
+
+
+    def setUp(self):
+        colfile = open(self.testInputFile, "w")
+        colfile.write("line1 30 60.5\n")
+        colfile.write("line2 90 99\n")
+        colfile.write("line3 5 80\n")
+        colfile.write("line4 10 1\n")
+        colfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "colsum file does not exist"
+
+
+    def testColsum(self):
+        self.assertEquals(0, colsum.colsum(0, self.testInputFile))
+        self.assertEquals(135, colsum.colsum(1, self.testInputFile))
+        self.assertEquals(240.5, colsum.colsum(2, self.testInputFile))
+        self.assertEquals(0, colsum.colsum(3, self.testInputFile))
+
+
+    def testMain(self):
+        field = 1
+        argv = ["colsum"]
+        self.assertRaises(SystemExit, colsum.main, argv)
+        argv = ["colsum", field, self.testInputFile]
+        colsum.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestColsum))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index 43eb96bc2a3153e7cbf0bb5719fa13320195512c..ce8be55e8c9d9aed892a8eba023af0e79b812a8d 100644 (file)
@@ -8,6 +8,7 @@ import os
 import string
 from array import array
 from erange import commoncode
+from erange import Region
 from cistematic.genomes import Genome
 
 
@@ -122,8 +123,10 @@ class TestCommoncode(unittest.TestCase):
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         testfile.write(regionEntry)
         testfile.close()
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegions("regionTestFile"))
+        result = commoncode.getMergedRegions("regionTestFile")
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
         os.remove("regionTestFile")
 
 
@@ -132,108 +135,277 @@ class TestCommoncode(unittest.TestCase):
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(5, 25, 20)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5))
-        result = {"1": [(12, 18, 6)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2))
-        result = {"chr1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=5)
+        self.assertEquals(5, result["1"][0].start)
+        self.assertEquals(25, result["1"][0].stop)
+        self.assertEquals(20, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=-2)
+        self.assertEquals(12, result["1"][0].start)
+        self.assertEquals(18, result["1"][0].stop)
+        self.assertEquals(6, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, fullChrom=True)
+        self.assertEquals(10, result["chr1"][0].start)
+        self.assertEquals(20, result["chr1"][0].stop)
+        self.assertEquals(10, result["chr1"][0].length)
 
         regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 40, 30)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10), (15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False))
-        result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2", result["1"][1].label)
 
         regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1\tspacer", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2\tspacer2", result["1"][1].label)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 2040, 2030)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000))
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5))
-        result = {"1": [(2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, maxDist=3000)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(2030, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=5)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        result = commoncode.getMergedRegionsFromList(regionList, returnTop=1)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"2": [(15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+        result = commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
         self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+
         regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)],
-                  "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)]
-        }
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(88, result["2"][0].stop)
+        self.assertEquals(73, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("4", result["2"][0].label)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals(32, result["1"][1].peakPos)
+        self.assertEquals(17, result["1"][1].peakHeight)
+        self.assertEquals("3", result["1"][1].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+        self.assertEquals(65, result["2"][1].start)
+        self.assertEquals(88, result["2"][1].stop)
+        self.assertEquals(23, result["2"][1].length)
+        self.assertEquals(72, result["2"][1].peakPos)
+        self.assertEquals(7, result["2"][1].peakHeight)
+        self.assertEquals("4", result["2"][1].label)
 
         regionList = ["# comment"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         #Test fails - the header line is required if there are fields after the peak which isn't so good
         #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
 
@@ -274,28 +446,61 @@ class TestCommoncode(unittest.TestCase):
     #TODO: write test
     def testFindPeak(self):
         hitList = []
-        result = ([], 0.0, array("f"), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
+        result = commoncode.findPeak(hitList, 0, 0)
+        self.assertEquals([], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f"), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
-        result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto"))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, doWeight=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(0.5, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.5, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+        self.assertEquals(6, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(3, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+        self.assertEquals(3, result.shift)
 
 
     #TODO: write test
@@ -354,10 +559,13 @@ class TestCommoncode(unittest.TestCase):
         }
         self.assertEquals(result, complementDict)
 
-        regionDict = {"I": [("new feature", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new feature"
+        regionDict = {"I": [additionalRegion]}
         featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
         result = (100, 150, "new feature", "+", "custom")
         self.assertEquals(result, featureDict["I"][0])
+        
 
 
     def testGetLocusByChromDict(self):
@@ -382,7 +590,9 @@ class TestCommoncode(unittest.TestCase):
             self.assertTrue(chrom in self.celegansChroms)
             self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
 
-        regionDict = {"I": [("new region", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new region"
+        regionDict = {"I": [additionalRegion]}
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
         self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
index 4455fff7211ff686832b81cb80ed6cc7adcfba03..d993fad8ba7e7f1dd208d6760d465842538ddba7 100644 (file)
@@ -11,8 +11,13 @@ Created on Sep 8, 2010
 import sys
 import unittest
 import testAnalyzeGO
+import testBedToRegion
+import testBinsToCdf
+import testCdfDist
 import testChksnp
+import testColsum
 import testCommoncode
+import testFeatureIntersects
 import testGeneMrnaCounts
 import testGeneMrnaCountsWeighted
 #import testGetFasta
@@ -23,6 +28,7 @@ import testMakeBamFromRds
 import testmakebedfromrds
 #import testMakeGraphs
 import testMakeRdsFromBam
+import testMakeSiteTrack
 import testMakeSNPTrack
 import testMarkLinkers
 import testPeak
@@ -41,8 +47,13 @@ def main(argv=None):
 
     suite = unittest.TestSuite()
     suite.addTest(testAnalyzeGO.suite())
+    suite.addTest(testBedToRegion.suite())
+    suite.addTest(testBinsToCdf.suite())
+    suite.addTest(testCdfDist.suite())
     suite.addTest(testChksnp.suite())
+    suite.addTest(testColsum.suite())
     suite.addTest(testCommoncode.suite())
+    suite.addTest(testFeatureIntersects.suite())
     suite.addTest(testGeneMrnaCounts.suite())
     suite.addTest(testGeneMrnaCountsWeighted.suite())
     #suite.addTest(testGetFasta.suite())
@@ -53,6 +64,7 @@ def main(argv=None):
     suite.addTest(testmakebedfromrds.suite())
     #suite.addTest(testMakeGraphs.suite())
     suite.addTest(testMakeRdsFromBam.suite())
+    suite.addTest(testMakeSiteTrack.suite())
     suite.addTest(testMakeSNPTrack.suite())
     suite.addTest(testMarkLinkers.suite())
     suite.addTest(testPeak.suite())
diff --git a/test/testFeatureIntersects.py b/test/testFeatureIntersects.py
new file mode 100644 (file)
index 0000000..9633f8c
--- /dev/null
@@ -0,0 +1,73 @@
+
+import unittest
+import os
+from erange import featureIntersects
+
+
+class TestFeatureIntersects(unittest.TestCase):
+
+    testInputFile = "erangeTestFeatureIntersects"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1\tchr1\t30\t60\n")
+        cdffile.write("line2\tchr1\t90\t99\n")
+        cdffile.write("line3\tchr1\t5\t80\n")
+        cdffile.write("line4\tchr2\t10\t14\n")
+        cdffile.write("line4\tnot to be processed\n")
+        cdffile.write("line5\tchr2\t10\t14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "position file does not exist"
+
+
+    #TODO: write test
+    def testFeatureIntersects(self):
+        pass
+
+
+    #TODO: write test
+    def testGetPositionList(self):
+        pass
+
+
+    # exercises main(): missing-argument usage exit, then a normal run
+    def testMain(self):
+        argv = ["featureIntersects"]
+        self.assertRaises(SystemExit, featureIntersects.main, argv)
+        argv = ["featureIntersects", self.testInputFile]
+        featureIntersects.main(argv)
+
+
+    def testMakeParser(self):
+        parser = featureIntersects.makeParser("")
+        argv = []
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual([], args)
+        argv = [self.testInputFile]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual(self.testInputFile, args[0])
+        self.assertEqual("TFBSCONSSITES", options.cistype)
+        self.assertEqual(100, options.radius)
+        argv = [self.testInputFile, "--cistype", "test", "--radius", "50"]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual("test", options.cistype)
+        self.assertEqual(50, options.radius)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestFeatureIntersects))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeSiteTrack.py b/test/testMakeSiteTrack.py
new file mode 100644 (file)
index 0000000..7e609e7
--- /dev/null
@@ -0,0 +1,94 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import makesitetrack
+
+
+class TestMakeSiteTrack(unittest.TestCase):
+
+    testInputFile = "erangeTestSiteFile"
+    testOutputFile = "erangeTestBedFile"
+
+
+    def setUp(self):
+        self.createCompactInputFile()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "bed file does not exist"
+
+
+    def createStandardInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("name\tchr1\t100\t175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def createCompactInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("chr1:100-175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def testMakeSiteTrack(self):
+        self.createCompactInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile)
+        bedfile = open(self.testOutputFile)
+        header = 'track name="" visibility=4 itemRgb="On"\n'
+        self.assertEquals(header, bedfile.readline())
+        result = "chr1\t100\t176\t-1\t11\t+\t-\t-\t0,0,0\n"
+        self.assertEquals(result, bedfile.readline())
+
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, append=True, noHeader=True)
+        bedfile = open(self.testOutputFile)
+        header = bedfile.readline()
+        firstEntry = bedfile.readline()
+        self.assertEquals(firstEntry, bedfile.readline())
+
+
+    def testNonCompactMakeSiteTrack(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        self.createStandardInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, compact=False, color="255,255,255")
+        bedfile = open(self.testOutputFile)
+        bedfile.readline()
+        result = "chr1\t100\t176\t-1\t1.0\t+\t-\t-\t255,255,255\n"
+        self.assertEquals(result, bedfile.readline())
+
+
+    def testMain(self):
+        self.createCompactInputFile()
+        argv = ["makesitetrack"]
+        self.assertRaises(SystemExit, makesitetrack.main, argv)
+        argv = ["makesitetrack", self.testInputFile, self.testOutputFile]
+        makesitetrack.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeSiteTrack))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index f4b1691a07a08b15988539f75f1fbb2a62e148b2..9fd041d4462abb552eef87ebd82430177106231f 100755 (executable)
@@ -168,7 +168,6 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
-            #TODO: Is this right? If match index is 0 are we doing nothing?
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1
@@ -330,4 +329,4 @@ def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)