Release version for Erange 4.0a
author: Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
committer: Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
34 files changed:
MakeBamFromRds.py
ReadDataset.py
altSpliceCounts.py
bedtoregion.py
binstocdf.py
buildMatrix.py
cdfdist.py
colsum.py
combineRPKMs.py
combinerds.py
commoncode.py
distalPairs.py
farPairs.py
featureIntersects.py
findall.py
geneDownstreamBins.py
geneLocusBins.py
geneMrnaCountsWeighted.py
getNovelSNPs.py
makerdsfromblat.py
makerdsfrombowtie.py
makerdsfromeland2.py
predictSpliceCount.py
siteintersects.py
test/testBedToRegion.py [new file with mode: 0644]
test/testBinsToCdf.py [new file with mode: 0644]
test/testCdfDist.py [new file with mode: 0644]
test/testChksnp.py
test/testColsum.py [new file with mode: 0644]
test/testCommoncode.py
test/testErange.py
test/testFeatureIntersects.py [new file with mode: 0644]
test/testMakeSiteTrack.py [new file with mode: 0644]
weighMultireads.py

index bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59..730c20be6fb147f84e58b7f80485199e0975ad3a 100644 (file)
@@ -22,13 +22,13 @@ import pysam
 import ReadDataset
 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
 
 import ReadDataset
 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
 
+VERSION = "1.0"
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    verstring = "makeBamFromRds: version 1.0"
-    print verstring
+    print "makeBamFromRds: version %s" % VERSION
 
     doPairs = False
     
 
     doPairs = False
     
@@ -134,7 +134,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
     for chrom in chromRemoveList:
         chromList.remove(chrom)
 
     for chrom in chromRemoveList:
         chromList.remove(chrom)
 
-    header = {"HD": {"VN": "1.0"}}
+    header = {"HD": {"VN": VERSION}}
     if referenceSequenceList:
         header["SQ"] = referenceSequenceList
 
     if referenceSequenceList:
         header["SQ"] = referenceSequenceList
 
index 5ff60e2e6954c862888c5807495ef49142e14fa3..56e28a8995b734735ace06d1d990ac9b46753580 100644 (file)
@@ -153,42 +153,42 @@ class ReadDataset():
             self.cachedDB = ""
 
 
             self.cachedDB = ""
 
 
-    def attachDB(self, filename, asname):
+    def attachDB(self, filename, dbName):
         """ attach another database file to the readDataset.
         """
         """ attach another database file to the readDataset.
         """
-        stmt = "attach '%s' as %s" % (filename, asname)
+        stmt = "attach '%s' as %s" % (filename, dbName)
         self.execute(stmt)
 
 
         self.execute(stmt)
 
 
-    def detachDB(self, asname):
+    def detachDB(self, dbName):
         """ detach a database file to the readDataset.
         """
         """ detach a database file to the readDataset.
         """
-        stmt = "detach %s" % (asname)
+        stmt = "detach %s" % (dbName)
         self.execute(stmt)
 
 
         self.execute(stmt)
 
 
-    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+    def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
         """ import into current RDS the table (with columns destcolumns,
             with default all columns) from the database file asname,
             using the column specification of ascolumns (default all).
         """
         """ import into current RDS the table (with columns destcolumns,
             with default all columns) from the database file asname,
             using the column specification of ascolumns (default all).
         """
-        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
         if flagged != "":
             stmt += " where flag = '%s' " % flagged
 
         self.executeCommit(stmt)
 
 
         if flagged != "":
             stmt += " where flag = '%s' " % flagged
 
         self.executeCommit(stmt)
 
 
-    def getTables(self, asname=""):
+    def getTables(self, dbName=""):
         """ get a list of table names in a particular database file.
         """
         resultList = []
         sql = self.getSqlCursor()
 
         """ get a list of table names in a particular database file.
         """
         resultList = []
         sql = self.getSqlCursor()
 
-        if asname != "":
-            asname += "."
+        if dbName != "":
+            dbName = "%s." % dbName
 
 
-        stmt = "select name from %ssqlite_master where type='table'" % asname
+        stmt = "select name from %ssqlite_master where type='table'" % dbName
         sql.execute(stmt)
         results = sql.fetchall()
 
         sql.execute(stmt)
         results = sql.fetchall()
 
@@ -291,11 +291,15 @@ class ReadDataset():
         if "readsize" not in metadata:
             raise ReadDatasetError("no readsize parameter defined")
         else:
         if "readsize" not in metadata:
             raise ReadDatasetError("no readsize parameter defined")
         else:
-            mysize = metadata["readsize"]
-            if "import" in mysize:
-                mysize = mysize.split()[0]
+            readSize = metadata["readsize"]
+            if "import" in readSize:
+                readSize = readSize.split()[0]
 
 
-            return int(mysize)
+            readSize = int(readSize)
+            if readSize < 0:
+                raise ReadDatasetError("readsize is negative")
+
+            return readSize
 
 
     def getDefaultCacheSize(self):
 
 
     def getDefaultCacheSize(self):
@@ -317,11 +321,9 @@ class ReadDataset():
                 if row["chrom"] not in results:
                     results.append(row["chrom"])
             else:
                 if row["chrom"] not in results:
                     results.append(row["chrom"])
             else:
-                if  len(row["chrom"][3:].strip()) < 1:
-                    continue
-
-                if row["chrom"][3:] not in results:
-                    results.append(row["chrom"][3:])
+                shortName = row["chrom"][3:]
+                if  len(shortName.strip()) > 0 and shortName not in results:
+                    results.append(shortName)
 
         results.sort()
 
 
         results.sort()
 
@@ -333,32 +335,17 @@ class ReadDataset():
         """ returns the maximum coordinate for reads on a given chromosome.
         """
         maxCoord = 0
         """ returns the maximum coordinate for reads on a given chromosome.
         """
         maxCoord = 0
-        sql = self.getSqlCursor()
 
         if doUniqs:
 
         if doUniqs:
-            try:
-                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
-                maxCoord = int(sql.fetchall()[0][0])
-            except:
-                print "couldn't retrieve coordMax for chromosome %s" % chrom
+            maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
 
         if doSplices:
 
         if doSplices:
-            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
-            try:
-                spliceMax = int(sql.fetchall()[0][0])
-                if spliceMax > maxCoord:
-                    maxCoord = spliceMax
-            except:
-                pass
+            spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
+            maxCoord = max(spliceMax, maxCoord)
 
         if doMulti:
 
         if doMulti:
-            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
-            try:
-                multiMax = int(sql.fetchall()[0][0])
-                if multiMax > maxCoord:
-                    maxCoord = multiMax
-            except:
-                pass
+            multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
+            maxCoord = max(multiMax, maxCoord)
 
         if verbose:
             print "%s maxCoord: %d" % (chrom, maxCoord)
 
         if verbose:
             print "%s maxCoord: %d" % (chrom, maxCoord)
@@ -366,6 +353,19 @@ class ReadDataset():
         return maxCoord
 
 
         return maxCoord
 
 
+    def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
+        maxCoord = 0
+        sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
+        sql = self.getSqlCursor()
+        try:
+            sql.execute(sqlStatement)
+            maxCoord = int(sql.fetchall()[0][0])
+        except:
+            print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+        return maxCoord
+
+
     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
@@ -378,67 +378,14 @@ class ReadDataset():
         
         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
         """
         
         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
         """
-        whereClause = []
-        resultsDict = {}
-
-        if chrom != "" and chrom != self.memChrom:
-            whereClause.append("chrom = '%s'" % chrom)
-
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
-            else:
-                whereClause.append("flag = '%s'" % flag)
-
-        if start > -1:
-            whereClause.append("start > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stop < %d" % stop)
-
-        if len(readLike) > 0:
-            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
-            whereClause.append(readIDClause)
-
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
-
-        if strand in ["+", "-"]:
-            whereClause.append("sense = '%s'" % strand)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
 
 
-        groupBy = []
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
         if findallOptimize:
         if findallOptimize:
-            selectClause = ["select start, sense, sum(weight)"]
-            groupBy = ["GROUP BY start, sense"]
+            selectQuery = "select start, sense, sum(weight)"
         else:
         else:
-            selectClause = ["select ID, chrom, start, readID"]
-            if bothEnds:
-                selectClause.append("stop")
-
-            if not noSense:
-                selectClause.append("sense")
-
-            if withWeight:
-                selectClause.append("weight")
-
-            if withFlag:
-                selectClause.append("flag")
+            selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 
 
-            if withMismatch:
-                selectClause.append("mismatch")
-
-        if limit > 0 and not combine5p:
-            groupBy.append("LIMIT %d" % limit)
-
-        selectQuery = string.join(selectClause, ",")
-        groupQuery = string.join(groupBy)
+        groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
         if doUniqs:
             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
             if doMulti:
         if doUniqs:
             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
             if doMulti:
@@ -498,6 +445,7 @@ class ReadDataset():
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
+        resultsDict = {}
         if findallOptimize:
             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
             if self.memBacked:
         if findallOptimize:
             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
             if self.memBacked:
@@ -562,20 +510,17 @@ class ReadDataset():
         return resultsDict
 
 
         return resultsDict
 
 
-    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
-                       flag="", withWeight=False, withFlag=False, withMismatch=False,
-                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
-                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
-                       stop=-1, strand=""):
-        """ returns a dictionary of spliced reads in a variety of
-        formats and which can be restricted by chromosome or custom-flag.
-        Returns unique spliced reads for now.
-        """
-        whereClause = []
-        resultsDict = {}
+    def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
+        if splice:
+            startText = "startL"
+            stopText = "stopR"
+        else:
+            startText = "start"
+            stopText = "stop"
 
 
+        whereClause = []
         if chrom != "" and chrom != self.memChrom:
         if chrom != "" and chrom != self.memChrom:
-            whereClause = ["chrom = '%s'" % chrom]
+            whereClause.append("chrom = '%s'" % chrom)
 
         if flag != "":
             if flagLike:
 
         if flag != "":
             if flagLike:
@@ -584,25 +529,37 @@ class ReadDataset():
             else:
                 whereClause.append("flag = '%s'" % flag)
 
             else:
                 whereClause.append("flag = '%s'" % flag)
 
+        if start > -1:
+            whereClause.append("%s > %d" % (startText, start))
+
+        if stop > -1:
+            whereClause.append("%s < %d" % (stopText, stop))
+
+        if len(readLike) > 0:
+            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
+            whereClause.append(readIDClause)
+
         if hasMismatch:
             whereClause.append("mismatch != ''")
 
         if hasMismatch:
             whereClause.append("mismatch != ''")
 
-        if strand != "":
+        if strand in ["+", "-"]:
             whereClause.append("sense = '%s'" % strand)
 
             whereClause.append("sense = '%s'" % strand)
 
-        if start > -1:
-            whereClause.append("startL > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stopR < %d" % stop)
-
         if len(whereClause) > 0:
             whereStatement = string.join(whereClause, " and ")
             whereQuery = "where %s" % whereStatement
         else:
             whereQuery = ""
 
         if len(whereClause) > 0:
             whereStatement = string.join(whereClause, " and ")
             whereQuery = "where %s" % whereStatement
         else:
             whereQuery = ""
 
-        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+        return whereQuery
+
+
+    def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+
+        selectClause = [baseSelect]
+        if bothEnds:
+            selectClause.append("stop")
+
         if not noSense:
             selectClause.append("sense")
 
         if not noSense:
             selectClause.append("sense")
 
@@ -615,7 +572,36 @@ class ReadDataset():
         if withMismatch:
             selectClause.append("mismatch")
 
         if withMismatch:
             selectClause.append("mismatch")
 
-        selectQuery = string.join(selectClause, " ,")
+        selectQuery = string.join(selectClause, ",")
+
+        return selectQuery
+
+
+    def getReadGroupQuery(self, findallOptimize, limit, combine5p):
+        groupBy = []
+        if findallOptimize:
+            groupBy = ["GROUP BY start, sense"]
+
+        if limit > 0 and not combine5p:
+            groupBy.append("LIMIT %d" % limit)
+
+        groupQuery = string.join(groupBy)
+
+        return groupQuery
+
+
+    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+                       flag="", withWeight=False, withFlag=False, withMismatch=False,
+                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
+                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+                       stop=-1, strand=""):
+        """ returns a dictionary of spliced reads in a variety of
+        formats and which can be restricted by chromosome or custom-flag.
+        Returns unique spliced reads for now.
+        """
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
+        selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
+        selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
         if self.memBacked:
             sql = self.memcon.cursor()
         else:
         if self.memBacked:
             sql = self.memcon.cursor()
         else:
@@ -625,6 +611,7 @@ class ReadDataset():
         sql.execute(stmt)
         currentReadID = ""
         currentChrom = ""
         sql.execute(stmt)
         currentReadID = ""
         currentChrom = ""
+        resultsDict = {}
         for row in sql:
             pairID = 0
             readID = row["readID"]
         for row in sql:
             pairID = 0
             readID = row["readID"]
@@ -796,10 +783,6 @@ class ReadDataset():
         """ get readID's.
         """
         stmt = []
         """ get readID's.
         """
         stmt = []
-        limitPart = ""
-        if limit > 0:
-            limitPart = "LIMIT %d" % limit
-
         if uniqs:
             stmt.append("select readID from uniqs")
 
         if uniqs:
             stmt.append("select readID from uniqs")
 
@@ -814,6 +797,10 @@ class ReadDataset():
         else:
             selectPart = ""
 
         else:
             selectPart = ""
 
+        limitPart = ""
+        if limit > 0:
+            limitPart = "LIMIT %d" % limit
+
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
         if self.memBacked:
             sql = self.memcon.cursor()
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
         if self.memBacked:
             sql = self.memcon.cursor()
@@ -845,12 +832,11 @@ class ReadDataset():
                 print "getting mismatches from chromosome %s" % (achrom)
 
             snpDict[achrom] = []
                 print "getting mismatches from chromosome %s" % (achrom)
 
             snpDict[achrom] = []
-            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if useSplices and self.dataType == "RNA":
                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                 spliceIDList = spliceDict.keys()
             if useSplices and self.dataType == "RNA":
                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                 spliceIDList = spliceDict.keys()
-                for k in spliceIDList:
-                    spliceEntry = spliceDict[k][0]
+                for spliceID in spliceIDList:
+                    spliceEntry = spliceDict[spliceID][0]
                     startpos = spliceEntry["startL"]
                     lefthalf = spliceEntry["stopL"]
                     rightstart = spliceEntry["startR"]
                     startpos = spliceEntry["startL"]
                     lefthalf = spliceEntry["stopL"]
                     rightstart = spliceEntry["startR"]
@@ -881,6 +867,7 @@ class ReadDataset():
 
                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 
 
                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 
+            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if achrom not in hitDict.keys():
                 continue
 
             if achrom not in hitDict.keys():
                 continue
 
@@ -910,7 +897,7 @@ class ReadDataset():
 
 
     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 
 
     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
-                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+                        useSplices=False, normalizationFactor=1.0, trackStrand=False,
                         keepStrand="both", shiftValue=0):
         """return a profile of the chromosome as an array of per-base read coverage....
             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
                         keepStrand="both", shiftValue=0):
         """return a profile of the chromosome as an array of per-base read coverage....
             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
@@ -925,8 +912,8 @@ class ReadDataset():
         dataType = metadata["dataType"]
         scale = 1. / normalizationFactor
         shift = {}
         dataType = metadata["dataType"]
         scale = 1. / normalizationFactor
         shift = {}
-        shift['+'] = int(shiftValue)
-        shift['-'] = -1 * int(shiftValue)
+        shift["+"] = int(shiftValue)
+        shift["-"] = -1 * int(shiftValue)
 
         if cstop > 0:
             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 
         if cstop > 0:
             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
index 12077c1ec7d9d7e3ba499c67c99734a8a9a54157..c59dc9249b6c82ded0466f872e98a9900e702c46 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 print "altSpliceCounts: version 3.7"
 
 
 print "altSpliceCounts: version 3.7"
 
@@ -56,13 +56,13 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
     stopDict = {}
     resultDict = {}
 
     stopDict = {}
     resultDict = {}
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     readlen = hitRDS.getReadSize()
     hitDict = hitRDS.getSplicesDict(noSense=True)
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     readlen = hitRDS.getReadSize()
     hitDict = hitRDS.getSplicesDict(noSense=True)
-    outfile = open(outfilename,'w')
+    outfile = open(outfilename, "w")
 
     for chrom in hitDict:
         startDict[chrom] = []
 
     for chrom in hitDict:
         startDict[chrom] = []
@@ -155,7 +155,7 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
 
         resultDict[chrom].sort()
         for line in resultDict[chrom]:
 
         resultDict[chrom].sort()
         for line in resultDict[chrom]:
-            outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n'  % line)
+            outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n"  % line)
             alternative += 1
 
         print chrom, maxIndex, spliceEvent, altSpliceEvent
             alternative += 1
 
         print chrom, maxIndex, spliceEvent, altSpliceEvent
index 3bcd554e5406a09f31ecc988e8c60733616d0dd5..a6a344d549b907b98343d731b3a348dd9802d0ec 100755 (executable)
@@ -21,14 +21,16 @@ def main(argv=None):
 def bedToRegion(factor, infilename, outfilename):
     index = 1
     infile = open(infilename)
 def bedToRegion(factor, infilename, outfilename):
     index = 1
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
     for line in infile:
     for line in infile:
-        if 'track' in line:
+        if "track" in line:
             continue
             continue
+
         fields = line.split()
         fields = line.split()
-        line = string.join(fields, '\t')
-        outfile.write('%s%d\t%s\n' % (factor, index, line))
+        line = string.join(fields, "\t")
+        outfile.write("%s%d\t%s\n" % (factor, index, line))
         index += 1
         index += 1
+
     infile.close()
     outfile.close()
 
     infile.close()
     outfile.close()
 
index 63aa9558b8848183d4528bb16149cd0ced38aadd..27b48e22e8ecde5b4fd9c817ff17d4ab75dcbd13 100755 (executable)
@@ -1,4 +1,5 @@
 import sys
 import sys
+import string
 
 print "binstocdf: version 1.1"
 
 
 print "binstocdf: version 1.1"
 
@@ -6,19 +7,19 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s infile outfile' % sys.argv[0]
+    if len(argv) < 3:
+        print "usage: python %s infile outfile" % sys.argv[0]
         sys.exit(1)
 
         sys.exit(1)
 
-    infilename = argv[0]
-    outfilename = argv[1]
+    infilename = argv[1]
+    outfilename = argv[2]
 
     binToCDF(infilename, outfilename)
 
 
 def binToCDF(infilename, outfilename):
     infile = open(infilename)
 
     binToCDF(infilename, outfilename)
 
 
 def binToCDF(infilename, outfilename):
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
 
     for line in infile:
         fields = line.strip().split()
 
     for line in infile:
         fields = line.strip().split()
@@ -30,14 +31,15 @@ def binToCDF(infilename, outfilename):
             outfile.write(line)
             continue
 
             outfile.write(line)
             continue
 
-        outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3]))
-        cum = 0
+        outputFields = fields[:4]
+        runningTotal = 0
         for bin in fields[4:]:
         for bin in fields[4:]:
-            cum += int(bin)
-            percent = 100 * cum / total
-            outfile.write('\t%d' % percent)
+            runningTotal += int(bin)
+            percent = 100 * runningTotal / total
+            outputFields.append("%d" % percent)
 
 
-        outfile.write('\n')
+        outputLine = string.join(outputFields, "\t")
+        outfile.write("%s\n" % outputLine)
 
     infile.close()
     outfile.close()
 
     infile.close()
     outfile.close()
index c7b6dd0c801025552c365003cf9067b2550c6b0c..a49120cf574f9c0213bd569edfe8507f923435a2 100755 (executable)
@@ -78,7 +78,7 @@ def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
     if header.strip() == "":
         header = "#\t"
 
     if header.strip() == "":
         header = "#\t"
 
-    outfile.write( "%s\t%s\n" % (header, colID))
+    outfile.write("%s\t%s\n" % (header, colID))
 
     values = []
     min = 20000000000.
 
     values = []
     min = 20000000000.
index 71662442afff0bdeb68fb5604307f72b8cedda91..52963b7445a1e28c8f251c94a6220f5fe1dc92f6 100755 (executable)
@@ -8,11 +8,15 @@ def main(argv=None):
         print "usage: python %s bins percent infile" % sys.argv[0]
         sys.exit(1)
 
         print "usage: python %s bins percent infile" % sys.argv[0]
         sys.exit(1)
 
-    bins = int(argv[0])
-    percent = int(argv[1])
-    infilename = argv[2]
+    bins = int(argv[1])
+    percent = int(argv[2])
+    infilename = argv[3]
 
 
-    cdfDist(bins, percent, infilename)
+    printCdfDist(bins, percent, infilename)
+
+
+def printCdfDist(bins, percent, infilename):
+    print cdfDist(bins, percent, infilename)
 
 
 def cdfDist(bins, percent, infilename):
 
 
 def cdfDist(bins, percent, infilename):
@@ -30,7 +34,7 @@ def cdfDist(bins, percent, infilename):
             index += 1
 
     infile.close()
             index += 1
 
     infile.close()
-    print binsList
+    return binsList
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
index f6d1ff968c5260e35e876fb79a82ac201b088585..643aaeaa82b8bc0c444e33ae909d4f5859e2b1d7 100755 (executable)
--- a/colsum.py
+++ b/colsum.py
@@ -28,7 +28,7 @@ def colsum(fieldID, filename):
                 count += float(fields[fieldID])
             else:
                 count += int(fields[fieldID])
                 count += float(fields[fieldID])
             else:
                 count += int(fields[fieldID])
-        except ValueError:
+        except (ValueError, IndexError):
             pass
 
     infile.close()
             pass
 
     infile.close()
index ead4e1b81d0b37335fbf00908f1aea615cec33dd..77f30d6fc2acb2a9424634a138a03ee30e314f91 100755 (executable)
@@ -51,23 +51,8 @@ def makeParser(usage=""):
 
 def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
 
 
 def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
 
-    firstDict = {}
-    firstfile = open(firstfileName)
-    for line in firstfile:
-        fields = line.strip().split()
-        firstDict[fields[1]] = fields[-1]
-
-    firstfile.close()
-
-    expandedDict = {}
-    gidDict = {}
-    expandedfile = open(expandedfileName)
-    for line in expandedfile:
-        fields = line.strip().split()
-        expandedDict[fields[1]] = fields[-1]
-        gidDict[fields[1]] = fields[0]
-
-    expandedfile.close()
+    firstDict = getRPKMDict(firstfileName)
+    gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
@@ -97,5 +82,23 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.close()
 
 
     outfile.close()
 
 
+def getRPKMDict(rpkmFileName, getGIDDict=False):
+    gidDict = {}
+    rpkmDict = {}
+    rpkmFile = open(rpkmFileName)
+    for line in rpkmFile:
+        fields = line.strip().split()
+        rpkmDict[fields[1]] = fields[-1]
+        if getGIDDict:
+            gidDict[fields[1]] = fields[0]
+
+    rpkmFile.close()
+
+    if getGIDDict:
+        return gidDict, rpkmDict
+    else:
+        return rpkmDict
+
+
 if __name__ == "__main__":
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4c826b04ed6c668a3d5b2a66ee4b842789a483a0..2878423a9716a0e4f31d0a1738a0770339c8ed6c 100755 (executable)
@@ -10,7 +10,9 @@ except:
     pass
 
 import sys
     pass
 
 import sys
+import optparse
 import ReadDataset
 import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
 
 print "combinerds: version 1.2"
 
 
 print "combinerds: version 1.2"
 
@@ -19,97 +21,107 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0]
-        #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n'
+    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+    parser = makeParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
         sys.exit(1)
 
         sys.exit(1)
 
-    doCache = False
-    cachePages = -1
-    if '--cache' in argv:
-        doCache = True
-        try:
-            cachePages =  int(argv[sys.argv.index('-cache') + 1])
-        except: 
-            pass
-
-    datafile = argv[1]
-    infileList = []
-    for index in range(2, len(argv)):
-        if argv[index][0] == '-':
-            break
-        infileList.append(sys.argv[index])
+    datafile = args[0]
+    infileList = args[1:]
 
 
-    print "destination RDS: %s" % datafile
+    combinerds(datafile, infileList, options.tableList, options.withFlag, options.doIndex, options.cachePages, options.doInit, options.initRNA)
 
 
-    if '--initrna' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA')
-    elif '--init' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True)
 
 
-    withFlag = ''
-    if '--flag' in argv:
-        withFlag = argv[sys.argv.index('-flag') + 1]
-        print "restrict to flag = %s" % withFlag
+def makeParser():
+    usage = __doc__
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--table", action="append", dest="tablelist")
+    parser.add_option("--init", action="store_true", dest="doInit")
+    parser.add_option("--initrna", action="store_true", dest="initRNA")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--flag", dest="withFlag")
+
+    configParser = getConfigParser()
+    section = "combinerds"
+    doInit = getConfigBoolOption(configParser, section, "doInit", False)
+    initRNA = getConfigBoolOption(configParser, section, "initRNA", False)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+
+    parser.set_defaults(tableList=[], doInit=doInit, initRNA=initRNA, doIndex=doIndex, cachePages=cachePages,
+                        withFlag=withFlag)
+
+    return parser
+
 
 
-    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache)
+def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, cachePages=None, doInit=False, initRNA=False):
 
 
+    print "destination RDS: %s" % datafile
+    datasetType="DNA"
+    if initRNA:
+        doInit = True
+        datasetType="RNA"
+
+    doCache = False
+    if cachePages is not None:
+        doCache = True
+    else:
+        cachePages = -1
+
+    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache, initialize=doInit, datasetType=datasetType)
     if cachePages > rds.getDefaultCacheSize():
         rds.setDBcache(cachePages)
     if cachePages > rds.getDefaultCacheSize():
         rds.setDBcache(cachePages)
-        cacheVal = cachePages
     else:
     else:
-        cacheVal = rds.getDefaultCacheSize()
-
-    doIndex = False
-    if '--index' in argv:
-        doIndex = True
+        cachePages = rds.getDefaultCacheSize()
 
 
-    tableList = []
-    if '--table' in argv:
-        tableList.append(argv[argv.index('-table') + 1])
-    else:
+    if tableList == []:
         tableList = rds.getTables()
 
         tableList = rds.getTables()
 
-    combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache)
-
+    if withFlag != "":
+        print "restrict to flag = %s" % withFlag
 
 
-def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False):
     metaDict = rds.getMetadata()
     if "numberImports" not in metaDict:
         origIndex = 0
     metaDict = rds.getMetadata()
     if "numberImports" not in metaDict:
         origIndex = 0
-        rds.insertMetadata([("numberImports", str(0))])
+        rds.insertMetadata([("numberImports", "0")])
     else:
         origIndex = int(metaDict["numberImports"])
 
     index = origIndex
     for inputfile in infileList:
     else:
         origIndex = int(metaDict["numberImports"])
 
     index = origIndex
     for inputfile in infileList:
-        asname = "input" + str(index)
-        rds.attachDB(inputfile,asname)
+        dbName = "input%s" % str(index)
+        rds.attachDB(inputfile, dbName)
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
-            ascols = "*"
+            dbColumns = "*"
             if table == "uniqs":
             if table == "uniqs":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "multi":
             elif table == "multi":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "splices":
             elif table == "splices":
-                ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname
-            elif table == "metadata":
-                ascols = "name, value || ' (import_%d)'" % index
-                rds.importFromDB(asname, table, ascols)
+                dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
 
 
-            if table != "metadata":
-                rds.importFromDB(asname, table, ascols, withFlag)
+            if table == "metadata":
+                dbColumns = "name, value || ' (import_%d)'" % index
+                rds.importFromDB(dbName, table, dbColumns)
+            else:
+                rds.importFromDB(dbName, table, dbColumns, withFlag)
 
 
-        rds.detachDB(asname)
-        rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))])
+        rds.detachDB(dbName)
+        rds.insertMetadata([("import_%s" % str(index), "%s %s" % (inputfile, str(tableList)))])
         index += 1
 
     rds.updateMetadata("numberImports", index, origIndex)
     if doIndex:
         print "building index...."
         index += 1
 
     rds.updateMetadata("numberImports", index, origIndex)
     if doIndex:
         print "building index...."
-        if cacheVal > 0:
-            rds.buildIndex(cacheVal)
+        if cachePages > 0:
+            rds.buildIndex(cachePages)
         else:
             rds.buildIndex()
 
         else:
             rds.buildIndex()
 
index 821936a30bc2524f0aa1664de1a66ecb8ae47fd6..f526daacca0dc91245e71205363216bc17eba2e9 100755 (executable)
@@ -17,6 +17,9 @@ import Region
 commoncodeVersion = 5.6
 currentRDSversion = 2.0
 
 commoncodeVersion = 5.6
 currentRDSversion = 2.0
 
+class ErangeError(Exception):
+    pass
+
 
 def getReverseComplement(base):
     revComp = {"A": "T",
 
 def getReverseComplement(base):
     revComp = {"A": "T",
@@ -65,12 +68,12 @@ def getGeneAnnotDict(genome, inRAM=False):
     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 
 
     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 
 
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
-    hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+    genome = Genome(genomeName, inRAM=inRAM)
     if extendGenome != "":
     if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
+        genome.extendFeatures(extendGenome, replace=replaceModels)
 
 
-    geneannotDict = hg.allAnnotInfo()
+    geneannotDict = genome.allAnnotInfo()
 
     return geneannotDict
 
 
     return geneannotDict
 
@@ -189,11 +192,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     mergeCount = 0
     chromField = int(chromField)
     count = 0
     mergeCount = 0
     chromField = int(chromField)
     count = 0
-    #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
-    #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
-    #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
-    #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
-    #      be merged with A but that will exit the loop and the merge with C will be missed.
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -336,7 +334,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     """
 
     if shift == "auto":
     """
 
     if shift == "auto":
-        shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+        shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 
     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 
 
     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 
@@ -346,7 +344,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
     topPos = getPeakPositionList(smoothArray, length)
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
     topPos = getPeakPositionList(smoothArray, length)
-    peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+    peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 
     if leftPlus:
         numLeftPlus = 0
 
     if leftPlus:
         numLeftPlus = 0
@@ -366,12 +364,12 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     return peak
 
 
     return peak
 
 
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
     bestShift = 0
     lowestScore = 20000000000
     for testShift in xrange(maxShift + 1):
         shiftArray = array("f", [0.] * length)
     bestShift = 0
     lowestScore = 20000000000
     for testShift in xrange(maxShift + 1):
         shiftArray = array("f", [0.] * length)
-        for read in hitList:
+        for read in readList:
             currentpos = read["start"] - start
             if read["sense"] == "+":
                 currentpos += testShift
             currentpos = read["start"] - start
             if read["sense"] == "+":
                 currentpos += testShift
@@ -381,7 +379,7 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
-            if doWeight:
+            if useWeight:
                 weight = read["weight"]
             else:
                 weight = 1.0
                 weight = read["weight"]
             else:
                 weight = 1.0
@@ -432,7 +430,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left
             hitIndex += 1
             currentpos += 1
 
             hitIndex += 1
             currentpos += 1
 
-        while hitIndex < readlen and  currentpos < length:
+        while hitIndex < readlen and currentpos < length:
             seqArray[currentpos] += weight
             hitIndex += 1
             currentpos += 1
             seqArray[currentpos] += weight
             hitIndex += 1
             currentpos += 1
@@ -450,7 +448,7 @@ def getPeakPositionList(smoothArray, length):
         if topNucleotide < smoothArray[currentpos]:
             topNucleotide = smoothArray[currentpos]
             peakList = [currentpos]
         if topNucleotide < smoothArray[currentpos]:
             topNucleotide = smoothArray[currentpos]
             peakList = [currentpos]
-        elif topNucleotide  == smoothArray[currentpos]:
+        elif topNucleotide == smoothArray[currentpos]:
             peakList.append(currentpos)
 
     return peakList
             peakList.append(currentpos)
 
     return peakList
@@ -562,7 +560,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         return featuresByChromDict
 
 
         return featuresByChromDict
 
 
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
     """ return a dictionary of gene loci. Can be used to retrieve additional
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
     """ return a dictionary of gene loci. Can be used to retrieve additional
@@ -594,27 +592,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
         return locusByChromDict
 
         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
         return locusByChromDict
 
-    genome = genomeObject.genome
-    featuresDict = genomeObject.getallGeneFeatures()
-    if len(additionalRegionsDict) > 0:
-        sortList = []
-        for chrom in additionalRegionsDict:
-            for region in additionalRegionsDict[chrom]:
-                label = region.label
-                if label not in sortList:
-                    sortList.append(label)
-
-                if label not in featuresDict:
-                    featuresDict[label] = []
-                    sense = "+"
-                else:
-                    sense = featuresDict[label][0][-1]
-
-                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
-        for gid in sortList:
-            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
-
+    genomeName = genome.genome
+    featuresDict = getGeneFeatures(genome, additionalRegionsDict)
     for gid in featuresDict:
         featureList = featuresDict[gid]
         newFeatureList = []
     for gid in featuresDict:
         featureList = featuresDict[gid]
         newFeatureList = []
@@ -647,34 +626,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
                 gstart -= distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
                 gstop += distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstop = newFeatureList[0][0] + lengthCDS
+            if 0 < lengthCDS < glen:
+                gstop = newFeatureList[0][0] + lengthCDS
 
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstart = newFeatureList[-1][1] + lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstart = newFeatureList[-1][1] + lengthCDS
         else:
             if not useCDS and upstream > 0:
                 if upstreamSpanTSS:
         else:
             if not useCDS and upstream > 0:
                 if upstreamSpanTSS:
@@ -692,36 +665,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
                 gstop += distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
                 gstart -= distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstart = newFeatureList[-1][-1] - lengthCDS
+            if 0 < lengthCDS < glen:
+                gstart = newFeatureList[-1][-1] - lengthCDS
 
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstop = newFeatureList[0][0] - lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstop = newFeatureList[0][0] - lengthCDS
 
 
-        glen = abs(gstop - gstart)
         if chrom not in locusByChromDict:
             locusByChromDict[chrom] = []
 
         if chrom not in locusByChromDict:
             locusByChromDict[chrom] = []
 
@@ -736,6 +702,30 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
     return locusByChromDict
 
 
     return locusByChromDict
 
 
+def getGeneFeatures(genome, additionalRegionsDict):
+    featuresDict = genome.getallGeneFeatures()
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    return featuresDict
+
+
 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                       binLength=-1):
 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                       binLength=-1):
@@ -798,7 +788,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                 rlen = regionTuple[lengthField]
                 try:
                     rsense = regionTuple[senseField]
                 rlen = regionTuple[lengthField]
                 try:
                     rsense = regionTuple[senseField]
-                except:
+                except IndexError:
                     rsense = "F"
 
                 if tagStart > stop:
                     rsense = "F"
 
                 if tagStart > stop:
index 5bc25321b25b8a30e9b09511cabbee4355010b71..be403109110c3fb40f7bba831f6c2ea48e496536 100755 (executable)
@@ -85,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
 
     print time.ctime()
 
 
     print time.ctime()
 
-    if doSplices:
-        print "getting splices"
-        splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
-        print "got splices"
-
     print "getting uniq reads"    
     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
     print "got uniqs"
 
     if doSplices:
     print "getting uniq reads"    
     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
     print "got uniqs"
 
     if doSplices:
-        for readID in splicesDict:
-            theRead = splicesDict[readID]
-            read0 = theRead[0]
-            del read0[1]
-            try:
-                uniqDict[readID].append(read0)
-            except:
-                if len(theRead) == 4:
-                    read2 = theRead[2]
-                    del read2[1]
-                    uniqDict[readID] = [read0,read2]
+        addSplicesToUniqReads(RDS, uniqDict)
 
     if doVerbose:
         print len(uniqDict), time.ctime()
 
     outfile = open(outfilename,"w")
 
     if doVerbose:
         print len(uniqDict), time.ctime()
 
     outfile = open(outfilename,"w")
-
     diffChrom = 0
     distal = 0
     total = 0
     diffChrom = 0
     distal = 0
     total = 0
@@ -132,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
                     continue
                 else:
                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
                     continue
                 else:
                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print diffChrom, outline
             else:
                 dist = abs(start1 - start2)
                     if doVerbose:
                         print diffChrom, outline
             else:
                 dist = abs(start1 - start2)
-
                 if minDist < dist < maxDist:
                     distal += 1
                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
                 if minDist < dist < maxDist:
                     distal += 1
                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print distal, outline
 
                     if doVerbose:
                         print distal, outline
 
@@ -157,5 +140,22 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
     print time.ctime()
 
 
     print time.ctime()
 
 
+def addSplicesToUniqReads(RDS, uniqDict):
+    print "getting splices"
+    splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+    print "got splices"
+    for readID in splicesDict:
+        theRead = splicesDict[readID]
+        read0 = theRead[0]
+        del read0[1]
+        try:
+            uniqDict[readID].append(read0)
+        except:
+            if len(theRead) == 4:
+                read2 = theRead[2]
+                del read2[1]
+                uniqDict[readID] = [read0,read2]
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 00cc91844b23084d712573261e996524739c9870..712e60eff9815f9728b6a03983385f83d96bc6f5 100644 (file)
@@ -14,8 +14,9 @@ except:
 import sys
 import time
 import optparse
 import sys
 import time
 import optparse
+import string
 import ReadDataset
 import ReadDataset
-from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
 
 print "farPairs: version 1.4"
 
 
 print "farPairs: version 1.4"
 
@@ -38,14 +39,13 @@ def main(argv=None):
     outfilename = args[1]
     outbedname = args[2]
 
     outfilename = args[1]
     outbedname = args[2]
 
-    farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+    farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
              options.cachePages, options.minDist, options.maxDist, options.minCount,
              options.label)
 
 
 def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
              options.cachePages, options.minDist, options.maxDist, options.minCount,
              options.label)
 
 
 def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--verbose", action="store_true", dest="doVerbose")
     parser.add_option("--minDist", type="int", dest="minDist")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--verbose", action="store_true", dest="doVerbose")
     parser.add_option("--minDist", type="int", dest="minDist")
@@ -55,7 +55,6 @@ def getParser(usage):
 
     configParser = getConfigParser
     section = "farPairs"
 
     configParser = getConfigParser
     section = "farPairs"
-    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
@@ -63,15 +62,33 @@ def getParser(usage):
     minCount = getConfigIntOption(configParser, section, "minCount", 2)
     label = getConfigOption(configParser, section, "label", None)
 
     minCount = getConfigIntOption(configParser, section, "minCount", 2)
     label = getConfigOption(configParser, section, "label", None)
 
-    parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+    parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
 
     return parser
 
 
                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
 
     return parser
 
 
-def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
 
              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
 
+    flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
+
+    outfile = open(outfilename, "w")
+    for region in flagDict:
+        regionConnections = countDuplicatesInList(flagDict[region])
+        for (connectedRegion, count) in regionConnections:
+            if count >= minCount:
+                outline = "%s\t%s\t%d" % (region, connectedRegion, count)
+                print >> outfile, outline
+                if doVerbose:
+                    print outline
+
+    outfile.close()
+    if doVerbose:
+        print "finished: ", time.ctime()
+
+
+def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
     doCache = False
     if cachePages is not None:
         doCache = True
     doCache = False
     if cachePages is not None:
         doCache = True
@@ -86,11 +103,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
 
     if doVerbose:
         print time.ctime()
 
     if doVerbose:
         print time.ctime()
-
-    total = 0
-    outfile = open(outfilename, "w")
+    
     outbed = open(outbedname, "w")
     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
     outbed = open(outbedname, "w")
     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+    outbed.close()
 
     readlen = RDS.getReadSize()
     flagDict = {}
 
     readlen = RDS.getReadSize()
     flagDict = {}
@@ -98,84 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
         if doNotProcessChromosome(chromosome):
             continue
 
         if doNotProcessChromosome(chromosome):
             continue
 
-        print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
-        if doVerbose:
-            print len(uniqDict), time.ctime()
-
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                start1 = readList[0]["start"]
-                flag1 = readList[0]["flag"]
-                start2 = readList[1]["start"]
-                flag2 = readList[1]["flag"]
-
-                if flag1 != flag2:
-                    dist = abs(start1 - start2)
-                    startList = [start1, start2]
-                    stopList = [start1 + readlen, start2 + readlen]
-                    startList.sort()
-                    stopList.sort()
-                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
-                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
-                        outbed.write(outputLine)
-                        if doVerbose:
-                            print flag1, flag2, dist
-
-                        try:
-                            flagDict[flag1].append((flag2, start1, start2))
-                        except KeyError:
-                            flagDict[flag1] = [(flag2, start1, start2)]
-
-                        try:
-                            flagDict[flag2].append((flag1, start1, start2))
-                        except KeyError:
-                            flagDict[flag2] = [(flag2, start1, start2)]
+        writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
 
     print "%d connected regions" % len(flagDict)
 
 
     print "%d connected regions" % len(flagDict)
 
-    for region in flagDict:
-        flagDict[region].sort()
-        regionConnections = {}
-        for (region2, start1, start2) in flagDict[region]:
+    return flagDict
+
+
+def doNotProcessChromosome(chrom):
+    return chrom == "chrM"
+
+
+def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
+    outbed = open(outbedname, "a")
+    print chromosome
+    uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+    if doVerbose:
+        print len(uniqDict), time.ctime()
+
+    for readID in uniqDict:
+        readList = uniqDict[readID]
+        if readsAreFarPair(readList, minDist, maxDist):
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            startList = [start1, start2]
+            startList.sort()
+            outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
+            outbed.write(outputLine)
+            flag1 = readList[0]["flag"]
+            flag2 = readList[1]["flag"]
+            if doVerbose:
+                print flag1, flag2, abs(start1 - start2)
+
             try:
             try:
-                regionConnections[region2] += 1
+                flagDict[flag1].append(flag2)
             except KeyError:
             except KeyError:
-                regionConnections[region2] = 1
+                flagDict[flag1] = [flag2]
 
 
-        for region2 in regionConnections:
-            if regionConnections[region2] >= minCount:
-                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
-                if doVerbose:
-                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+            try:
+                flagDict[flag2].append(flag1)
+            except KeyError:
+                flagDict[flag2] = [flag1]
 
 
-    outfile.close()
     outbed.close()
     outbed.close()
-    if doVerbose:
-        print "finished: ", time.ctime()
 
 
 
 
-def doNotProcessChromosome(chrom):
-    return chrom == "chrM"
+def readsAreFarPair(readList, minDist, maxDist):
+    isFarPair = False
+    if len(readList) == 2:
+        flag1 = readList[0]["flag"]
+        flag2 = readList[1]["flag"]
+        if flag1 != flag2 and flag1 != "" and flag2 != "":
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            dist = abs(start1 - start2)
+            if minDist < dist < maxDist:
+                isFarPair = True
 
 
+    return isFarPair
 
 
-def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
-    readSizes = "%d" % (stopList[0] - startList[0])
-    readCoords = "0"
+
+def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
+    sizes = ["%d" % readlen]
+    coords = ["0"]
     leftStart = startList[0] - 1
     leftStart = startList[0] - 1
-    rightStop = stopList[-1]
+    rightStop = startList[-1]
     for index in range(1, numPieces):
     for index in range(1, numPieces):
-        readSizes += ",%d" % (stopList[index] - startList[index] + 1)
-        readCoords += ",%d" % (startList[index] - startList[0])
+        sizes.append("%d" % (readlen + 1))
+        coords.append("%d" % (startList[index] - startList[0]))
 
     if rsense == "+":
 
     if rsense == "+":
-        senseCode = plusSense
+        senseCode = plusSenseColor
     else:
     else:
-        senseCode = minusSense
+        senseCode = minusSenseColor
 
 
+    readSizes = string.join(sizes, ",")
+    readCoords = string.join(coords, ",")
     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+
     return outline
 
 
     return outline
 
 
index cd357cc072c5cc7aa4460371439ef59c428873f5..c24b79edb312fae6c0b19f0d7e3d3ffb45d12984 100755 (executable)
@@ -38,7 +38,7 @@ def main(argv=None):
 
 def makeParser(usage=""):
     parser = optparse.OptionParser(usage=usage)
 
 def makeParser(usage=""):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cistype", action="store_false", dest="cistype")
+    parser.add_option("--cistype", dest="cistype")
     parser.add_option("--radius", type="int", dest="radius")
 
     configParser = getConfigParser()
     parser.add_option("--radius", type="int", dest="radius")
 
     configParser = getConfigParser()
@@ -52,10 +52,19 @@ def makeParser(usage=""):
 
 
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
 
 
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
-    tabfile = open(tabFileName)
-    previous = ""
+    
+    posList = getPositionList(tabFileName)
+    feats = featuresIntersecting("human", posList, radius, cistype)
+    featkeys = feats.keys()
+    featkeys.sort()
+    for (chrom, pos) in featkeys:
+        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+
 
 
+def getPositionList(tabFileName):
+    previous = ""
     posList = []
     posList = []
+    tabfile = open(tabFileName)
     for line in tabfile:
         fields = line.split("\t")
         current = fields[0]
     for line in tabfile:
         fields = line.split("\t")
         current = fields[0]
@@ -66,11 +75,7 @@ def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
         chrom = fields[1][3:]
         posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
 
         chrom = fields[1][3:]
         posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
 
-    feats = featuresIntersecting("human", posList, radius, cistype)
-    featkeys = feats.keys()
-    featkeys.sort()
-    for (chrom, pos) in featkeys:
-        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+    return posList
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
index d20608f1997e2e6a0fd84d9169c78025485e4b57..ec75efb54f3680c892578a38811c1bd241c37241 100755 (executable)
@@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
-
-    if not doTrim:
-        trimString = "none"
+    doControl = False
+    if mockfile is not None:
+        doControl = True
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
-    doControl = False
-    if mockfile is not None:
-        doControl = True
-
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
@@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     else:
         pValueType = "self"
 
     else:
         pValueType = "self"
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        doCache = False
-        cachePages = -1
-
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
@@ -242,6 +225,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -258,13 +247,22 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
+    if trimValue is not None:
+        trimValue = float(trimValue) / 100.
+        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+    else:
+        trimValue = 0.1
+        trimString = "10%"
+
+    if not doTrim:
+        trimString = "none"
+
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
-
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
@@ -464,7 +462,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
@@ -535,7 +533,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
@@ -558,7 +556,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
-            if normalize:
+            if normalizeToRPM:
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
@@ -567,7 +565,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
                 #first pass uses getFoldRatio on mockRDS as there may not be control
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
                 #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -577,8 +575,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
-                    numLeft = peak.numLeft
-                    if normalize:
+                    numLeft = peak.numLeftPlus
+                    if normalizeToRPM:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
@@ -605,11 +603,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeft
-                        if normalize:
+                        numLeft = trimPeak.numLeftPlus
+                        if normalizeToRPM:
                             sumAll /= rdsSampleSize
 
                             sumAll /= rdsSampleSize
 
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
@@ -619,16 +617,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         except:
                             continue
 
                         except:
                             continue
 
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             sumMulti /= rdsSampleSize
 
                         try:
                             sumMulti /= rdsSampleSize
 
                         try:
@@ -759,4 +755,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift,
     return footer
 
 if __name__ == "__main__":
     return footer
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4ce97e4fc1380517447153e59fba80e5531e3029..91db4a1a44870602383a072d7a4585568026f165 100755 (executable)
@@ -14,7 +14,7 @@ import sys
 import optparse
 import ReadDataset
 from cistematic.genomes import Genome
 import optparse
 import ReadDataset
 from cistematic.genomes import Genome
-from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
 
 print "geneDownstreamBins: version 2.1"
 
 
 print "geneDownstreamBins: version 2.1"
 
@@ -53,8 +53,6 @@ def makeParser(usage=""):
 
 
 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
 
 
 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
-    bins = 10
-    standardMinThresh = standardMinDist / bins
 
     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     normalizationFactor = 1.0
 
     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     normalizationFactor = 1.0
@@ -66,95 +64,114 @@ def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCac
     geneinfoDict = getGeneInfoDict(genome, cache=True)
     hg = Genome(genome)
     featuresDict = hg.getallGeneFeatures()
     geneinfoDict = getGeneInfoDict(genome, cache=True)
     hg = Genome(genome)
     featuresDict = hg.getallGeneFeatures()
-
     outfile = open(outfilename, "w")
     outfile = open(outfilename, "w")
-
     gidList = hg.allGIDs()
     gidList.sort()
     for gid in gidList:
     gidList = hg.allGIDs()
     gidList.sort()
     for gid in gidList:
-        symbol = "LOC" + gid
-        geneinfo = ""
-        featureList = []
         try:
         try:
-            geneinfo = geneinfoDict[gid]
-            featureList = featuresDict[gid]
-            symbol = geneinfo[0][0]
-        except:
+            featuresList = featuresDict[gid]
+        except KeyError:
             print gid
 
             print gid
 
-        if len(featureList) == 0:
+        try:
+            binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
+        except ErangeError:
             continue
 
             continue
 
-        newfeatureList = []
-        for (ftype, chrom, start, stop, fsense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
+        tagCount *= normalizationFactor
+        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
+        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
+        for binAmount in binList:
+            outfile.write("\t%.2f" % binAmount)
 
 
-        if chrom not in hitDict:
-            continue
+        outfile.write("\n")
 
 
-        newfeatureList.sort()
-        if len(newfeatureList) < 1:
-            continue
+    outfile.close()
 
 
-        glen = standardMinDist
-        if fsense == "F":
-            nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
 
 
-            if glen < 1:
-                glen = 1
+def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
 
 
-            gstart = newfeatureList[-1][1]
-        else:
-            nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
+    symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
+    geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
+    if geneLength < standardMinDist:
+        raise ErangeError("gene length less than minimum")
 
 
-            if glen < 1:
-                glen = 1
+    binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
+    if tagCount < 2:
+        raise ErangeError("tag count less than minimum")
 
 
-            gstart = newfeatureList[0][0] - glen
-            if gstart < 0:
-                gstart = 0
+    return binList, symbol, geneLength, tagCount
 
 
-        tagCount = 0
-        if glen < standardMinDist:
-            continue
 
 
-        binList = [0.] * bins
-        for read in hitDict[chrom]:
-            tagStart = read["start"]
-            weight = read["weight"]
-            tagStart -= gstart
-            if tagStart >= glen:
-                break
-
-            if tagStart > 0:
-                tagCount += weight
-                if fsense == "F":
-                    # we are relying on python's integer division quirk
-                    binID = tagStart / standardMinThresh 
-                    binList[binID] += weight
-                else:
-                    rdist = glen - tagStart
-                    binID = rdist / standardMinThresh 
-                    binList[binID] += weight
-
-        if tagCount < 2:
-            continue
+def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
+    if len(featureList) == 0:
+        raise ErangeError("no features found")
 
 
-        tagCount *= normalizationFactor
-        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
-        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
-        for binAmount in binList:
-            outfile.write("\t%.2f" % binAmount)
+    symbol = "LOC%s" % gid
+    geneinfo = ""
+    try:
+        geneinfo = geneinfoDict[gid]
+        symbol = geneinfo[0][0]
+    except KeyError:
+        print gid
 
 
-        outfile.write("\n")
+    newfeatureList = []
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+
+    if len(newfeatureList) < 1:
+        raise ErangeError("no features found")
+
+    if chrom not in chromosomeList:
+        raise ErangeError("chromosome not found in reads")
+
+    newfeatureList.sort()
+
+    return symbol, newfeatureList, sense, chrom
 
 
-    outfile.close()
 
 
+def getGeneStats(genome, gid, minDistance, featureList, sense):
+    geneLength = minDistance
+    if sense == "F":
+        nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = featureList[-1][1]
+    else:
+        nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = max(featureList[0][0] - geneLength, 0)
+
+    if nextGene < geneLength * 2:
+        geneLength = nextGene / 2
+
+    geneLength = max(geneLength, 1)
+
+    return geneStart, geneLength
+
+
+def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
+    tagCount = 0
+    bins = 10
+    standardMinThresh = standardMinDist / bins
+    binList = [0.] * bins
+    for read in readList:
+        tagStart = read["start"]
+        if tagStart >= geneLength:
+            break
+
+        tagStart -= geneStart
+        weight = read["weight"]
+        if tagStart > 0:
+            tagCount += weight
+            if sense == "F":
+                # we are relying on python's integer division quirk
+                binID = tagStart / standardMinThresh 
+            else:
+                rdist = geneLength - tagStart
+                binID = rdist / standardMinThresh
+
+            binList[binID] += weight
+
+    return binList, tagCount
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 6a1efc48e270d92a8453e8fcb6907db00185ed92..49745dc0c1361c2a8dbe4736c9c8630396fd93e4 100755 (executable)
@@ -12,6 +12,7 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
+import string
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
@@ -92,68 +93,78 @@ def getParser(usage):
 
     return parser
 
 
     return parser
 
+
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
 
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
 
 
-    outfile = open(outfilename,'w')
 
 
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
     for gid in gidList:
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except KeyError:
                 pass
         else:
             symbol = gid
                 pass
         else:
             symbol = gid
+
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
         for binAmount in gidBins[gid]:
             if normalizeBins:
         for binAmount in gidBins[gid]:
             if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
     outfile.close()
 
 
 if __name__ == "__main__":
     outfile.close()
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 38e853aec6172be26b448066692193f1537b6926..5299d27d26c23f8729063eb31f0da390730b5ce5 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 import sys
 import optparse
 
 import sys
 import optparse
@@ -68,12 +68,6 @@ def makeParser(usage=""):
     return parser
 
 
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
@@ -206,7 +200,6 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
-        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -271,4 +264,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4091281c8fe0f5410090b23db7276496fe49d6ed..e95d72693c76df13871da4cf809c99e43c41d36a 100755 (executable)
@@ -68,7 +68,6 @@ def doNotProcessLine(line):
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
-    #TODO: refactor naming. is fields[8] rpkm?
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
@@ -93,4 +92,4 @@ def getNovelSNPInfo(genome, snpEntry, hg):
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 37576ca867fb2283f9a5d497c0bbca7d06191f7e..17520401c52b5d153404aa4312a800017e9af3bb 100755 (executable)
@@ -20,6 +20,9 @@ import ReadDataset
 verstring = "makerdsfromblat: version 3.10"
 print verstring
 
 verstring = "makerdsfromblat: version 3.10"
 print verstring
 
+NUM_HEADER_LINES = 5
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -98,24 +101,15 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
                     verbose=False, cachePages=100000, geneDataFileName="",
                     propertyList=[]):
 
                     verbose=False, cachePages=100000, geneDataFileName="",
                     propertyList=[]):
 
-    delimiter = "|"
-    minIntron = 10
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
+    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
     if forceRNA:
         print "forcing datatype to RNA"
         dataType = "RNA"
 
     if forceRNA:
         print "forcing datatype to RNA"
         dataType = "RNA"
 
-    if dataType == "RNA":
-        genedatafile = open(geneDataFileName)
-
-    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
-
     geneDict = {}
     mapDict = {}
     if dataType == "RNA" and not forceRNA:
     geneDict = {}
     mapDict = {}
     if dataType == "RNA" and not forceRNA:
+        genedatafile = open(geneDataFileName)
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -164,9 +158,10 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
 
     # make some assumptions based on first read
     infile = open(filename, "r")
 
     # make some assumptions based on first read
     infile = open(filename, "r")
-    for arg in range(6):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
         line = infile.readline()
 
+    line = infile.readline()
     fields = line.split()
     readsize = int(fields[10])
     pairedTest = fields[9][-2:]
     fields = line.split()
     readsize = int(fields[10])
     pairedTest = fields[9][-2:]
@@ -186,8 +181,9 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
         rds.insertMetadata([("blat_mapped", "True")])
 
     minReadScore = readsize - readsize/25 - 1
         rds.insertMetadata([("blat_mapped", "True")])
 
     minReadScore = readsize - readsize/25 - 1
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -199,9 +195,12 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
     index = uIndex = mIndex = sIndex = lIndex = 0
     bestScore = 0
     # skip headers
     index = uIndex = mIndex = sIndex = lIndex = 0
     bestScore = 0
     # skip headers
-    for arg in range(5):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
         line = infile.readline()
 
+    insertSize = 100000
+    delimiter = "|"
+    minIntron = 10
     for line in infile:
         lIndex += 1
         fields = line.strip().split()
     for line in infile:
         lIndex += 1
         fields = line.strip().split()
index 14be260cdf0721b0514098a22d3a07423756c41b..eadb66b83d59e140b0402a9ea76dd351219abad0 100755 (executable)
@@ -87,29 +87,13 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
                       propertyList=[]):
 
                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
                       propertyList=[]):
 
-    delimiter = "|"
+    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
 
 
+    geneDict = {}
     dataType = "DNA"
     if genedatafilename is not None:
         dataType = "RNA"
         genedatafile = open(genedatafilename)
     dataType = "DNA"
     if genedatafilename is not None:
         dataType = "RNA"
         genedatafile = open(genedatafilename)
-
-
-    forcePair = False
-    if forceID is not None:
-        forcePair = True
-    else:
-        forceID = 0
-
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
-    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
-
-    geneDict = {}
-    mapDict = {}
-    if dataType == "RNA":
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -130,7 +114,6 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                 totalLength += exonLengths[index]
 
             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
                 totalLength += exonLengths[index]
 
             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
 
         genedatafile.close()
 
 
         genedatafile.close()
 
@@ -164,12 +147,17 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     fields = line.split()
     readsize = len(fields[5])
     pairedTest = fields[0][-2:]
     fields = line.split()
     readsize = len(fields[5])
     pairedTest = fields[0][-2:]
+    forcePair = False
+    if forceID is not None:
+        forcePair = True
+    else:
+        forceID = 0
+
     paired = False
     if pairedTest in ["/1", "/2"] or forcePair:
         print "assuming reads are paired"
         paired = True
 
     paired = False
     if pairedTest in ["/1", "/2"] or forcePair:
         print "assuming reads are paired"
         paired = True
 
-
     print "read size: %d bp" % readsize
     if init:
         rds.insertMetadata([("readsize", readsize)])
     print "read size: %d bp" % readsize
     if init:
         rds.insertMetadata([("readsize", readsize)])
@@ -184,8 +172,9 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
 
     infile.close()
 
 
     infile.close()
 
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -195,6 +184,8 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     mInsertList = []
     sInsertList = []
     index = uIndex = mIndex = sIndex = lIndex = 0
     mInsertList = []
     sInsertList = []
     index = uIndex = mIndex = sIndex = lIndex = 0
+    delimiter = "|"
+    insertSize = 100000
     for line in infile:
         lIndex += 1
         if stripSpace:
     for line in infile:
         lIndex += 1
         if stripSpace:
index 66209eef973f36072ba964aac73f8579685aed99..b11848d7a93e3968d7b76b286fa415d04ffa078d 100755 (executable)
@@ -106,8 +106,6 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     insertSize = 100000
 
     geneDict = {}
     insertSize = 100000
 
     geneDict = {}
-    mapDict = {}
-    seenSpliceList = []
     if dataType == 'RNA':
         genedatafile = open(geneDataFileName)
         for line in genedatafile:
     if dataType == 'RNA':
         genedatafile = open(geneDataFileName)
         for line in genedatafile:
@@ -120,17 +118,11 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
             chrom = fields[1]
             sense = fields[2]
             chromstarts = fields[8][:-1].split(',')
             chrom = fields[1]
             sense = fields[2]
             chromstarts = fields[8][:-1].split(',')
-            chromstops = fields[9][:-1].split(',')
-            exonLengths = []
-            totalLength = 0
             for index in range(blockCount):
                 chromstarts[index] = int(chromstarts[index])
             for index in range(blockCount):
                 chromstarts[index] = int(chromstarts[index])
-                chromstops[index] = int(chromstops[index])
-                exonLengths.append(chromstops[index] - chromstarts[index])
-                totalLength += exonLengths[index]
 
 
-            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
+            geneDict[uname] = (sense, blockCount, chrom, chromstarts)
+
         genedatafile.close()
 
     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
         genedatafile.close()
 
     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
@@ -322,6 +314,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     print '%d unique reads' % index
     infile.close()
 
     print '%d unique reads' % index
     infile.close()
 
+    seenSpliceList = []
     if dataType == 'RNA':
         print 'mapping splices...'
         index = 0
     if dataType == 'RNA':
         print 'mapping splices...'
         index = 0
@@ -441,7 +434,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
                     print fields
                     continue
 
                     print fields
                     continue
 
-                (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+                (sense, blockCount, chrom, chromstarts) = geneDict[model]
                 if extended:
                     if 'F' in thepos:
                         rsense = '+'
                 if extended:
                     if 'F' in thepos:
                         rsense = '+'
index bfd6e1c48b3d4a449ea03d8f0191f0abd63e9d07..2df39a9412ab6794a1c437bead4b476c482e3918 100755 (executable)
@@ -8,6 +8,14 @@ import sys
 from cistematic.genomes import Genome
 
 
 from cistematic.genomes import Genome
 
 
+class GeneSymbolAndCount():
+
+    def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
+        self.symbol = symbol
+        self.uniqueCount = uniqueCount
+        self.spliceCount = spliceCount
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -31,53 +39,65 @@ def main(argv=None):
 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
     hg = Genome(genome)
 
 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
     hg = Genome(genome)
 
-    gidDict = {}
-    gidList = []
-    uniqueCountDict = {}
-    spliceCountDict = {}
+    gidDict = getGeneData(uniquefilecount, splicefilecount)
+    gidList = gidDict.keys()
+    gidList.sort()
+
+    outfile = open(outfilename, "w")
+    for gid in gidList:
+        featureList = hg.getGeneFeatures((genome, gid))
+        featuresizesum, splicearea = getStats(featureList, splicelead)
+        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+        geneData = gidDict[gid]
+        uniqueCount = geneData.uniqueCount
+        expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
+
+        # this p-value is based on the observed unique count, not the expected total count
+        # nor the multi-read adjusted count
+        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
+        symbol = geneData.symbol
+        spliceCount = geneData.spliceCount
+        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
+        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
 
 
+    outfile.close()
+
+
+def getGeneData(uniquefilecount, splicefilecount):
+    gidDict = {}
     uniquefile = open(uniquefilecount)
     for line in uniquefile:
         fields = line.strip().split()
     uniquefile = open(uniquefilecount)
     for line in uniquefile:
         fields = line.strip().split()
-        gidDict[fields[0]] = fields[1]
-        gidList.append(fields[0])
-        uniqueCountDict[fields[0]] = int(fields[2])
+        geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
+        gidDict[fields[0]] = geneData
 
 
+    uniquefile.close()
     splicefile = open(splicefilecount)
     for line in splicefile:
         fields = line.strip().split()
     splicefile = open(splicefilecount)
     for line in splicefile:
         fields = line.strip().split()
-        spliceCountDict[fields[0]] = int(fields[2])
+        gidDict[fields[0]].spliceCount = int(fields[2])
 
 
-    outfile = open(outfilename,'w')
+    splicefile.close()
 
 
-    gidList.sort()
-    for gid in gidList:
-        symbol = gidDict[gid]
-        featureList = hg.getGeneFeatures((genome, gid))
-        newfeatureList = []
-        featuresizesum = 0
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
-                featuresizesum += stop - start + 1
+    return gidDict
 
 
-        if featuresizesum < 1:
-            featuresizesum = 1
 
 
-        splicearea = (len(newfeatureList) - 1) * splicelead
-        if splicearea < splicelead:
-            splicearea = 0
+def getStats(featureList, splicelead):
+    newfeatureList = []
+    featuresizesum = 0
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+            featuresizesum += stop - start + 1
 
 
-        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
-        expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+    if featuresizesum < 1:
+        featuresizesum = 1
 
 
-        # this p-value is based on the observed unique count, not the expected total count
-        # nor the multi-read adjusted count
-        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
-        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
-        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+    splicearea = (len(newfeatureList) - 1) * splicelead
+    if splicearea < splicelead:
+        splicearea = 0
 
 
-    outfile.close()
+    return featuresizesum, splicearea
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
index 4826a13bbb31824ffea9b187d383f42b35bbf682..e012bb49436c92f50a1d8722182d0cd72a754897 100755 (executable)
@@ -4,10 +4,29 @@
 #
 
 import sys
 #
 
 import sys
+from commoncode import regionsOverlap
 
 print "siteintersects: version 2.1"
 
 
 
 print "siteintersects: version 2.1"
 
 
+class Site():
+    def __init__(self, line, doExpanded=False):
+        fields = line.strip().split()
+        if doExpanded:
+            self.chrom = fields[1][3:]
+            self.start = int(fields[2])
+            self.stop = int(fields[3])
+            self.rest = fields[4:]
+        else:
+            (chromosome, pos) = fields[0].split(":")
+            self.chrom = chromosome[3:]
+            (start, stop) = pos.split("-")
+            self.start = int(start)
+            self.stop = int(stop)
+            self.rest = fields[1:]
+
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -33,115 +52,114 @@ def main(argv=None):
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
 
 
-    siteDict = {}
-    file1Dict = {}
+    siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+    commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
 
 
-    infile1count = 0
-    infile = open(sitefilename1)
+    if doReject and siteRejectFilename is not None:
+        writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+    print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
+    siteDict = {}
+    rejectSiteDict = {}
+    processedLineCount = 0
+    infile = open(sitefilename)
     infile.readline()
     for line in infile:
     infile.readline()
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
             continue
 
-        infile1count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            start = int(start)
-            stop = int(stop)
-            rest = fields[1:]
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
+        rest = site.fieldList
 
         try:
             siteDict[chrom].append((start, stop, rest))
 
         try:
             siteDict[chrom].append((start, stop, rest))
-        except:
+        except KeyError:
             siteDict[chrom] = [(start, stop, rest)]
 
         if doReject:
             siteDict[chrom] = [(start, stop, rest)]
 
         if doReject:
-            file1Dict[str((chrom, start, stop, rest))] = line
+            rejectSiteDict[str((chrom, start, stop, rest))] = line
 
     infile.close()
 
     infile.close()
+    print "file1: %d" % processedLineCount
 
 
-    print "file1: %d" % infile1count
+    return siteDict, rejectSiteDict
 
 
-    infile2count = 0
-    infile = open(sitefilename2)
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
+    processedLineCount = 0
+    infile = open(siteFilename)
     infile.readline()
 
     commonSites = 0
     infile.readline()
 
     commonSites = 0
-    unique2List = []
+    if doReject and rejectFilename is not None:
+        rejectfile = open(rejectFilename, "w")
+    else:
+        doReject=False
+
     outfile = open(outfilename, "w")
     for line in infile:
     outfile = open(outfilename, "w")
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
             continue
 
-        infile2count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            rest = str(fields[1:])
-
-        start = int(start)
-        stop = int(stop)
-        mid = start + abs(stop - start)/2
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
         if chrom not in siteDict:
             if doReject:
         if chrom not in siteDict:
             if doReject:
-                unique2List.append(line)
-                continue
+                rejectfile.write(line)
+
+            continue
 
 
-        twoNotCommon = True
+        siteNotCommon = True
         for (rstart, rstop, rline) in siteDict[chrom]:
         for (rstart, rstop, rline) in siteDict[chrom]:
-            rsize = abs(rstart - rstop) /2
-            rmid = rstart + abs(rstop - rstart)/2
-            if abs(mid - rmid) < rsize:
+            if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
                 commonSites += 1
-                if twoNotCommon:
-                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
-                    twoNotCommon = False
+                if siteNotCommon:
+                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
+                    siteNotCommon = False
 
                 try:
                     if doReject:
 
                 try:
                     if doReject:
-                        del file1Dict[str((chrom, rstart, rstop, rline))]
-                except:
+                        del rejectSiteDict[str((chrom, rstart, rstop, rline))]
+                except KeyError:
                     pass
 
                     pass
 
-        if doReject and twoNotCommon:
-            unique2List.append(line)
+        if doReject and siteNotCommon:
+            rejectfile.write(line)
 
 
-    outfile.close()
+    if doReject:
+        rejectfile.close()
 
 
-    print "file2: %d" % infile2count
+    outfile.close()
 
 
-    if doReject:
-        reject1file = open(reject1filename, "w")
-        reject2file = open(reject2filename, "w")
+    print "file2: %d" % processedLineCount
 
 
-        for key in file1Dict:
-            reject1file.write(file1Dict[key])
+    return commonSites
 
 
-        for line in unique2List:
-            reject2file.write(line)
 
 
-        reject1file.close()
-        reject2file.close()
+def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
+    rejectfile = open(siteRejectFilename, "w")
 
 
-    print commonSites
+    for key in rejectSiteDict:
+        rejectfile.write(rejectSiteDict[key])
 
 
+    rejectfile.close()
+    
 
 if __name__ == "__main__":
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
diff --git a/test/testBedToRegion.py b/test/testBedToRegion.py
new file mode 100644 (file)
index 0000000..b5b4c03
--- /dev/null
@@ -0,0 +1,65 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import bedtoregion
+
+
+class TestBedToRegion(unittest.TestCase):
+
+    testBedFile = "erangeTestBedFile"
+    testRegionFile = "erangeTestRegionFile"
+
+
+    def setUp(self):
+        bedfile = open(self.testBedFile, "w")
+        bedfile.write("tab\tdelimited\tfields\n")
+        bedfile.write("space delimited fields\n")
+        bedfile.write("track\n")
+        bedfile.write("line after track will not be processed\n")
+        bedfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testBedFile)
+        except OSError:
+            print "bed file does not exist"
+
+        try:
+            os.remove(self.testRegionFile)
+        except OSError:
+            print "region file does not exist"
+
+
+    def testBedToRegion(self):
+        bedtoregion.bedToRegion("regionLabel", self.testBedFile, self.testRegionFile)
+        resultFile = open(self.testRegionFile)
+        regionList = resultFile.readlines()
+        self.assertEquals(2, len(regionList))
+        self.assertEquals("regionLabel1\ttab\tdelimited\tfields\n", regionList[0])
+        self.assertEquals("regionLabel2\tspace\tdelimited\tfields\n", regionList[1])
+
+
+    def testMain(self):
+        argv = ["bedtoregion"]
+        self.assertRaises(SystemExit, bedtoregion.main, argv)
+        argv = ["bedtoregion", "regionLabel", self.testBedFile, self.testRegionFile]
+        bedtoregion.main(argv)
+        resultFile = open(self.testRegionFile)
+        self.assertEquals(2, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBedToRegion))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testBinsToCdf.py b/test/testBinsToCdf.py
new file mode 100644 (file)
index 0000000..688e688
--- /dev/null
@@ -0,0 +1,66 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import binstocdf
+
+
+class TestBinsToCdf(unittest.TestCase):
+
+    testInputFile = "erangeTestBinFile"
+    testOutputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        binfile = open(self.testInputFile, "w")
+        binfile.write("field1\tfield2\t10\tfield4\t2\t3\n")
+        binfile.write("field1\tfield2\t20\tfield4\t1\t4\n")
+        binfile.write("field1\tfield2\t0\tfield4\n")
+        binfile.write("too short\n")
+        binfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "bin file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        binstocdf.binToCDF(self.testInputFile, self.testOutputFile)
+        resultFile = open(self.testOutputFile)
+        resultList = resultFile.readlines()
+        self.assertEquals(3, len(resultList))
+        self.assertEquals("field1\tfield2\t10\tfield4\t20\t50\n", resultList[0])
+        self.assertEquals("field1\tfield2\t20\tfield4\t5\t25\n", resultList[1])
+        self.assertEquals("field1\tfield2\t0\tfield4\n", resultList[2])
+
+
+    def testMain(self):
+        argv = ["binstocdf"]
+        self.assertRaises(SystemExit, binstocdf.main, argv)
+        argv = ["binstocdf", self.testInputFile, self.testOutputFile]
+        binstocdf.main(argv)
+        resultFile = open(self.testOutputFile)
+        self.assertEquals(3, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBinsToCdf))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testCdfDist.py b/test/testCdfDist.py
new file mode 100644 (file)
index 0000000..9048749
--- /dev/null
@@ -0,0 +1,58 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import cdfdist
+
+
+class TestCdfDist(unittest.TestCase):
+
+    testInputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1 30 60\n")
+        cdffile.write("line2 90 99\n")
+        cdffile.write("line3 5 80\n")
+        cdffile.write("line4 10 14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        bins = 2
+        self.assertEquals([2, 2], cdfdist.cdfDist(bins, 10, self.testInputFile))
+        self.assertEquals([1, 2], cdfdist.cdfDist(bins, 50, self.testInputFile))
+        self.assertEquals([1, 0], cdfdist.cdfDist(bins, 89, self.testInputFile))
+        self.assertEquals([0, 1], cdfdist.cdfDist(bins, 91, self.testInputFile))
+
+
+    def testMain(self):
+        bins = 2
+        percent = 50
+        argv = ["cdfdist"]
+        self.assertRaises(SystemExit, cdfdist.main, argv)
+        argv = ["cdfdist", bins, percent, self.testInputFile]
+        cdfdist.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestCdfDist))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index b41fe659421377de86e576a7644abf9e8c5971b6..0fbbb7b2835ad2e7b8c52f38fdb4f116268444bc 100644 (file)
@@ -23,6 +23,8 @@ class TestChksnp(unittest.TestCase):
 
     snpDB = "%s/dbSNP128.db" % dbPath
     altSnpDB = "%s/snp129cDNA.db" % dbPath
 
     snpDB = "%s/dbSNP128.db" % dbPath
     altSnpDB = "%s/snp129cDNA.db" % dbPath
+    inputFileName = "testChkSNP_input.txt"
+    outputFileName = "testChkSNP_output.txt"
 
     def setUp(self):
         pass
 
     def setUp(self):
         pass
@@ -33,8 +35,7 @@ class TestChksnp(unittest.TestCase):
 
 
     def testChkSNPFile(self):
 
 
     def testChkSNPFile(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -42,10 +43,8 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName)
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName)
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -53,10 +52,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+        os.remove(self.outputFileName)
 
 
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB])
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName, snpDBList=[self.altSnpDB])
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -65,13 +64,12 @@ class TestChksnp(unittest.TestCase):
         self.assertEquals(result, line)
         outfile.close()
 
         self.assertEquals(result, line)
         outfile.close()
 
-        os.remove(inputFileName)
-        os.remove(outputFileName)
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
 
 
     def testMain(self):
 
 
     def testMain(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -79,11 +77,9 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        argv = ["chksnp", self.snpDB, inputFileName, outputFileName]
+        argv = ["chksnp", self.snpDB, self.inputFileName, self.outputFileName]
         chksnp.main(argv)
         chksnp.main(argv)
-        outfile = open(outputFileName, "r")
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -91,7 +87,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
+
 
     def testChkSNP(self):
         snpPropertiesList = []
 
     def testChkSNP(self):
         snpPropertiesList = []
diff --git a/test/testColsum.py b/test/testColsum.py
new file mode 100644 (file)
index 0000000..5c149e7
--- /dev/null
@@ -0,0 +1,56 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import colsum
+
+
+class TestColsum(unittest.TestCase):
+
+    testInputFile = "erangeTestColsumFile"
+
+
+    def setUp(self):
+        colfile = open(self.testInputFile, "w")
+        colfile.write("line1 30 60.5\n")
+        colfile.write("line2 90 99\n")
+        colfile.write("line3 5 80\n")
+        colfile.write("line4 10 1\n")
+        colfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        self.assertEquals(0, colsum.colsum(0, self.testInputFile))
+        self.assertEquals(135, colsum.colsum(1, self.testInputFile))
+        self.assertEquals(240.5, colsum.colsum(2, self.testInputFile))
+        self.assertEquals(0, colsum.colsum(3, self.testInputFile))
+
+
+    def testMain(self):
+        field = 1
+        argv = ["colsum"]
+        self.assertRaises(SystemExit, colsum.main, argv)
+        argv = ["colsum", field, self.testInputFile]
+        colsum.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestColsum))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index 43eb96bc2a3153e7cbf0bb5719fa13320195512c..ce8be55e8c9d9aed892a8eba023af0e79b812a8d 100644 (file)
@@ -8,6 +8,7 @@ import os
 import string
 from array import array
 from erange import commoncode
 import string
 from array import array
 from erange import commoncode
+from erange import Region
 from cistematic.genomes import Genome
 
 
 from cistematic.genomes import Genome
 
 
@@ -122,8 +123,10 @@ class TestCommoncode(unittest.TestCase):
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         testfile.write(regionEntry)
         testfile.close()
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         testfile.write(regionEntry)
         testfile.close()
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegions("regionTestFile"))
+        result = commoncode.getMergedRegions("regionTestFile")
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
         os.remove("regionTestFile")
 
 
         os.remove("regionTestFile")
 
 
@@ -132,108 +135,277 @@ class TestCommoncode(unittest.TestCase):
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(5, 25, 20)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5))
-        result = {"1": [(12, 18, 6)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2))
-        result = {"chr1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=5)
+        self.assertEquals(5, result["1"][0].start)
+        self.assertEquals(25, result["1"][0].stop)
+        self.assertEquals(20, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=-2)
+        self.assertEquals(12, result["1"][0].start)
+        self.assertEquals(18, result["1"][0].stop)
+        self.assertEquals(6, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, fullChrom=True)
+        self.assertEquals(10, result["chr1"][0].start)
+        self.assertEquals(20, result["chr1"][0].stop)
+        self.assertEquals(10, result["chr1"][0].length)
 
         regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
         regionList = [regionEntry]
 
         regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 40, 30)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10), (15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False))
-        result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2", result["1"][1].label)
 
         regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1\tspacer", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2\tspacer2", result["1"][1].label)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 2040, 2030)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000))
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5))
-        result = {"1": [(2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, maxDist=3000)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(2030, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=5)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"2": [(15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+        result = commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
         self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList = [regionEntry]
         self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+
         regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
         regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
         regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
         regionList.append(regionEntry)
         regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)],
-                  "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)]
-        }
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(88, result["2"][0].stop)
+        self.assertEquals(73, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("4", result["2"][0].label)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals(32, result["1"][1].peakPos)
+        self.assertEquals(17, result["1"][1].peakHeight)
+        self.assertEquals("3", result["1"][1].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+        self.assertEquals(65, result["2"][1].start)
+        self.assertEquals(88, result["2"][1].stop)
+        self.assertEquals(23, result["2"][1].length)
+        self.assertEquals(72, result["2"][1].peakPos)
+        self.assertEquals(7, result["2"][1].peakHeight)
+        self.assertEquals("4", result["2"][1].label)
 
         regionList = ["# comment"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList.append(regionEntry)
 
         regionList = ["# comment"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
         regionList = ["# pvalue"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
         regionList = ["# readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
         regionList.append(regionEntry)
         regionList = ["# pvalue readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         #Test fails - the header line is required if there are fields after the peak which isn't so good
         #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
 
         #Test fails - the header line is required if there are fields after the peak which isn't so good
         #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
 
@@ -274,28 +446,61 @@ class TestCommoncode(unittest.TestCase):
     #TODO: write test
     def testFindPeak(self):
         hitList = []
     #TODO: write test
     def testFindPeak(self):
         hitList = []
-        result = ([], 0.0, array("f"), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
+        result = commoncode.findPeak(hitList, 0, 0)
+        self.assertEquals([], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f"), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
-        result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto"))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, doWeight=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(0.5, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.5, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+        self.assertEquals(6, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(3, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+        self.assertEquals(3, result.shift)
 
 
     #TODO: write test
 
 
     #TODO: write test
@@ -354,10 +559,13 @@ class TestCommoncode(unittest.TestCase):
         }
         self.assertEquals(result, complementDict)
 
         }
         self.assertEquals(result, complementDict)
 
-        regionDict = {"I": [("new feature", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new feature"
+        regionDict = {"I": [additionalRegion]}
         featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
         result = (100, 150, "new feature", "+", "custom")
         self.assertEquals(result, featureDict["I"][0])
         featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
         result = (100, 150, "new feature", "+", "custom")
         self.assertEquals(result, featureDict["I"][0])
+        
 
 
     def testGetLocusByChromDict(self):
 
 
     def testGetLocusByChromDict(self):
@@ -382,7 +590,9 @@ class TestCommoncode(unittest.TestCase):
             self.assertTrue(chrom in self.celegansChroms)
             self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
 
             self.assertTrue(chrom in self.celegansChroms)
             self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
 
-        regionDict = {"I": [("new region", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new region"
+        regionDict = {"I": [additionalRegion]}
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
         self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
         self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
index 4455fff7211ff686832b81cb80ed6cc7adcfba03..d993fad8ba7e7f1dd208d6760d465842538ddba7 100644 (file)
@@ -11,8 +11,13 @@ Created on Sep 8, 2010
 import sys
 import unittest
 import testAnalyzeGO
 import sys
 import unittest
 import testAnalyzeGO
+import testBedToRegion
+import testBinsToCdf
+import testCdfDist
 import testChksnp
 import testChksnp
+import testColsum
 import testCommoncode
 import testCommoncode
+import testFeatureIntersects
 import testGeneMrnaCounts
 import testGeneMrnaCountsWeighted
 #import testGetFasta
 import testGeneMrnaCounts
 import testGeneMrnaCountsWeighted
 #import testGetFasta
@@ -23,6 +28,7 @@ import testMakeBamFromRds
 import testmakebedfromrds
 #import testMakeGraphs
 import testMakeRdsFromBam
 import testmakebedfromrds
 #import testMakeGraphs
 import testMakeRdsFromBam
+import testMakeSiteTrack
 import testMakeSNPTrack
 import testMarkLinkers
 import testPeak
 import testMakeSNPTrack
 import testMarkLinkers
 import testPeak
@@ -41,8 +47,13 @@ def main(argv=None):
 
     suite = unittest.TestSuite()
     suite.addTest(testAnalyzeGO.suite())
 
     suite = unittest.TestSuite()
     suite.addTest(testAnalyzeGO.suite())
+    suite.addTest(testBedToRegion.suite())
+    suite.addTest(testBinsToCdf.suite())
+    suite.addTest(testCdfDist.suite())
     suite.addTest(testChksnp.suite())
     suite.addTest(testChksnp.suite())
+    suite.addTest(testColsum.suite())
     suite.addTest(testCommoncode.suite())
     suite.addTest(testCommoncode.suite())
+    suite.addTest(testFeatureIntersects.suite())
     suite.addTest(testGeneMrnaCounts.suite())
     suite.addTest(testGeneMrnaCountsWeighted.suite())
     #suite.addTest(testGetFasta.suite())
     suite.addTest(testGeneMrnaCounts.suite())
     suite.addTest(testGeneMrnaCountsWeighted.suite())
     #suite.addTest(testGetFasta.suite())
@@ -53,6 +64,7 @@ def main(argv=None):
     suite.addTest(testmakebedfromrds.suite())
     #suite.addTest(testMakeGraphs.suite())
     suite.addTest(testMakeRdsFromBam.suite())
     suite.addTest(testmakebedfromrds.suite())
     #suite.addTest(testMakeGraphs.suite())
     suite.addTest(testMakeRdsFromBam.suite())
+    suite.addTest(testMakeSiteTrack.suite())
     suite.addTest(testMakeSNPTrack.suite())
     suite.addTest(testMarkLinkers.suite())
     suite.addTest(testPeak.suite())
     suite.addTest(testMakeSNPTrack.suite())
     suite.addTest(testMarkLinkers.suite())
     suite.addTest(testPeak.suite())
diff --git a/test/testFeatureIntersects.py b/test/testFeatureIntersects.py
new file mode 100644 (file)
index 0000000..9633f8c
--- /dev/null
@@ -0,0 +1,73 @@
+
+import unittest
+import os
+from erange import featureIntersects
+
+
+class TestFeatureIntersects(unittest.TestCase):
+
+    testInputFile = "erangeTestFeatureIntersects"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1\tchr1\t30\t60\n")
+        cdffile.write("line2\tchr1\t90\t99\n")
+        cdffile.write("line3\tchr1\t5\t80\n")
+        cdffile.write("line4\tchr2\t10\t14\n")
+        cdffile.write("line4\tnot to be processed\n")
+        cdffile.write("line5\tchr2\t10\t14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "position file does not exist"
+
+
+    #TODO: write test
+    def testFeatureIntersects(self):
+        pass
+
+
+    #TODO: write test
+    def testGetPositionList(self):
+        pass
+
+
+    #TODO: write test
+    def testMain(self):
+        argv = ["cdfdist"]
+        self.assertRaises(SystemExit, featureIntersects.main, argv)
+        argv = ["cdfdist", self.testInputFile]
+        featureIntersects.main(argv)
+
+
+    def testMakeParser(self):
+        parser = featureIntersects.makeParser("")
+        argv = []
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual([], args)
+        argv = [self.testInputFile]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual(self.testInputFile, args[0])
+        self.assertEqual("TFBSCONSSITES", options.cistype)
+        self.assertEqual(100, options.radius)
+        argv = [self.testInputFile, "--cistype", "test", "--radius", "50"]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual("test", options.cistype)
+        self.assertEqual(50, options.radius)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestFeatureIntersects))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeSiteTrack.py b/test/testMakeSiteTrack.py
new file mode 100644 (file)
index 0000000..7e609e7
--- /dev/null
@@ -0,0 +1,94 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import makesitetrack
+
+
+class TestMakeSiteTrack(unittest.TestCase):
+
+    testInputFile = "erangeTestSiteFile"
+    testOutputFile = "erangeTestBedFile"
+
+
+    def setUp(self):
+        self.createCompactInputFile()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "bed file does not exist"
+
+
+    def createStandardInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("name\tchr1\t100\t175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def createCompactInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("chr1:100-175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def testMakeSiteTrack(self):
+        self.createCompactInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile)
+        bedfile = open(self.testOutputFile)
+        header = 'track name="" visibility=4 itemRgb="On"\n'
+        self.assertEquals(header, bedfile.readline())
+        result = "chr1\t100\t176\t-1\t11\t+\t-\t-\t0,0,0\n"
+        self.assertEquals(result, bedfile.readline())
+
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, append=True, noHeader=True)
+        bedfile = open(self.testOutputFile)
+        header = bedfile.readline()
+        firstEntry = bedfile.readline()
+        self.assertEquals(firstEntry, bedfile.readline())
+
+
+    def testNonCompactMakeSiteTrack(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        self.createStandardInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, compact=False, color="255,255,255")
+        bedfile = open(self.testOutputFile)
+        bedfile.readline()
+        result = "chr1\t100\t176\t-1\t1.0\t+\t-\t-\t255,255,255\n"
+        self.assertEquals(result, bedfile.readline())
+
+
+    def testMain(self):
+        self.createCompactInputFile()
+        argv = ["makesitetrack"]
+        self.assertRaises(SystemExit, makesitetrack.main, argv)
+        argv = ["makesitetrack", self.testInputFile, self.testOutputFile]
+        makesitetrack.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeSiteTrack))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index f4b1691a07a08b15988539f75f1fbb2a62e148b2..9fd041d4462abb552eef87ebd82430177106231f 100755 (executable)
@@ -168,7 +168,6 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
-            #TODO: Is this right? If match index is 0 are we doing nothing?
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1
@@ -330,4 +329,4 @@ def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)