Release version for Erange 4.0a
author: Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
committer: Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
34 files changed:
MakeBamFromRds.py
ReadDataset.py
altSpliceCounts.py
bedtoregion.py
binstocdf.py
buildMatrix.py
cdfdist.py
colsum.py
combineRPKMs.py
combinerds.py
commoncode.py
distalPairs.py
farPairs.py
featureIntersects.py
findall.py
geneDownstreamBins.py
geneLocusBins.py
geneMrnaCountsWeighted.py
getNovelSNPs.py
makerdsfromblat.py
makerdsfrombowtie.py
makerdsfromeland2.py
predictSpliceCount.py
siteintersects.py
test/testBedToRegion.py [new file with mode: 0644]
test/testBinsToCdf.py [new file with mode: 0644]
test/testCdfDist.py [new file with mode: 0644]
test/testChksnp.py
test/testColsum.py [new file with mode: 0644]
test/testCommoncode.py
test/testErange.py
test/testFeatureIntersects.py [new file with mode: 0644]
test/testMakeSiteTrack.py [new file with mode: 0644]
weighMultireads.py

index bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59..730c20be6fb147f84e58b7f80485199e0975ad3a 100644 (file)
@@ -22,13 +22,13 @@ import pysam
 import ReadDataset
 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
 
 import ReadDataset
 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
 
+VERSION = "1.0"
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    verstring = "makeBamFromRds: version 1.0"
-    print verstring
+    print "makeBamFromRds: version %s" % VERSION
 
     doPairs = False
     
 
     doPairs = False
     
@@ -134,7 +134,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
     for chrom in chromRemoveList:
         chromList.remove(chrom)
 
     for chrom in chromRemoveList:
         chromList.remove(chrom)
 
-    header = {"HD": {"VN": "1.0"}}
+    header = {"HD": {"VN": VERSION}}
     if referenceSequenceList:
         header["SQ"] = referenceSequenceList
 
     if referenceSequenceList:
         header["SQ"] = referenceSequenceList
 
index 5ff60e2e6954c862888c5807495ef49142e14fa3..56e28a8995b734735ace06d1d990ac9b46753580 100644 (file)
@@ -153,42 +153,42 @@ class ReadDataset():
             self.cachedDB = ""
 
 
             self.cachedDB = ""
 
 
-    def attachDB(self, filename, asname):
+    def attachDB(self, filename, dbName):
         """ attach another database file to the readDataset.
         """
         """ attach another database file to the readDataset.
         """
-        stmt = "attach '%s' as %s" % (filename, asname)
+        stmt = "attach '%s' as %s" % (filename, dbName)
         self.execute(stmt)
 
 
         self.execute(stmt)
 
 
-    def detachDB(self, asname):
+    def detachDB(self, dbName):
         """ detach a database file to the readDataset.
         """
         """ detach a database file to the readDataset.
         """
-        stmt = "detach %s" % (asname)
+        stmt = "detach %s" % (dbName)
         self.execute(stmt)
 
 
         self.execute(stmt)
 
 
-    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+    def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
         """ import into current RDS the table (with columns destcolumns,
             with default all columns) from the database file asname,
             using the column specification of ascolumns (default all).
         """
         """ import into current RDS the table (with columns destcolumns,
             with default all columns) from the database file asname,
             using the column specification of ascolumns (default all).
         """
-        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
         if flagged != "":
             stmt += " where flag = '%s' " % flagged
 
         self.executeCommit(stmt)
 
 
         if flagged != "":
             stmt += " where flag = '%s' " % flagged
 
         self.executeCommit(stmt)
 
 
-    def getTables(self, asname=""):
+    def getTables(self, dbName=""):
         """ get a list of table names in a particular database file.
         """
         resultList = []
         sql = self.getSqlCursor()
 
         """ get a list of table names in a particular database file.
         """
         resultList = []
         sql = self.getSqlCursor()
 
-        if asname != "":
-            asname += "."
+        if dbName != "":
+            dbName = "%s." % dbName
 
 
-        stmt = "select name from %ssqlite_master where type='table'" % asname
+        stmt = "select name from %ssqlite_master where type='table'" % dbName
         sql.execute(stmt)
         results = sql.fetchall()
 
         sql.execute(stmt)
         results = sql.fetchall()
 
@@ -291,11 +291,15 @@ class ReadDataset():
         if "readsize" not in metadata:
             raise ReadDatasetError("no readsize parameter defined")
         else:
         if "readsize" not in metadata:
             raise ReadDatasetError("no readsize parameter defined")
         else:
-            mysize = metadata["readsize"]
-            if "import" in mysize:
-                mysize = mysize.split()[0]
+            readSize = metadata["readsize"]
+            if "import" in readSize:
+                readSize = readSize.split()[0]
 
 
-            return int(mysize)
+            readSize = int(readSize)
+            if readSize < 0:
+                raise ReadDatasetError("readsize is negative")
+
+            return readSize
 
 
     def getDefaultCacheSize(self):
 
 
     def getDefaultCacheSize(self):
@@ -317,11 +321,9 @@ class ReadDataset():
                 if row["chrom"] not in results:
                     results.append(row["chrom"])
             else:
                 if row["chrom"] not in results:
                     results.append(row["chrom"])
             else:
-                if  len(row["chrom"][3:].strip()) < 1:
-                    continue
-
-                if row["chrom"][3:] not in results:
-                    results.append(row["chrom"][3:])
+                shortName = row["chrom"][3:]
+                if  len(shortName.strip()) > 0 and shortName not in results:
+                    results.append(shortName)
 
         results.sort()
 
 
         results.sort()
 
@@ -333,32 +335,17 @@ class ReadDataset():
         """ returns the maximum coordinate for reads on a given chromosome.
         """
         maxCoord = 0
         """ returns the maximum coordinate for reads on a given chromosome.
         """
         maxCoord = 0
-        sql = self.getSqlCursor()
 
         if doUniqs:
 
         if doUniqs:
-            try:
-                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
-                maxCoord = int(sql.fetchall()[0][0])
-            except:
-                print "couldn't retrieve coordMax for chromosome %s" % chrom
+            maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
 
         if doSplices:
 
         if doSplices:
-            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
-            try:
-                spliceMax = int(sql.fetchall()[0][0])
-                if spliceMax > maxCoord:
-                    maxCoord = spliceMax
-            except:
-                pass
+            spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
+            maxCoord = max(spliceMax, maxCoord)
 
         if doMulti:
 
         if doMulti:
-            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
-            try:
-                multiMax = int(sql.fetchall()[0][0])
-                if multiMax > maxCoord:
-                    maxCoord = multiMax
-            except:
-                pass
+            multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
+            maxCoord = max(multiMax, maxCoord)
 
         if verbose:
             print "%s maxCoord: %d" % (chrom, maxCoord)
 
         if verbose:
             print "%s maxCoord: %d" % (chrom, maxCoord)
@@ -366,6 +353,19 @@ class ReadDataset():
         return maxCoord
 
 
         return maxCoord
 
 
+    def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
+        maxCoord = 0
+        sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
+        sql = self.getSqlCursor()
+        try:
+            sql.execute(sqlStatement)
+            maxCoord = int(sql.fetchall()[0][0])
+        except:
+            print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+        return maxCoord
+
+
     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
@@ -378,67 +378,14 @@ class ReadDataset():
         
         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
         """
         
         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
         """
-        whereClause = []
-        resultsDict = {}
-
-        if chrom != "" and chrom != self.memChrom:
-            whereClause.append("chrom = '%s'" % chrom)
-
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
-            else:
-                whereClause.append("flag = '%s'" % flag)
-
-        if start > -1:
-            whereClause.append("start > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stop < %d" % stop)
-
-        if len(readLike) > 0:
-            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
-            whereClause.append(readIDClause)
-
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
-
-        if strand in ["+", "-"]:
-            whereClause.append("sense = '%s'" % strand)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
 
 
-        groupBy = []
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
         if findallOptimize:
         if findallOptimize:
-            selectClause = ["select start, sense, sum(weight)"]
-            groupBy = ["GROUP BY start, sense"]
+            selectQuery = "select start, sense, sum(weight)"
         else:
         else:
-            selectClause = ["select ID, chrom, start, readID"]
-            if bothEnds:
-                selectClause.append("stop")
-
-            if not noSense:
-                selectClause.append("sense")
-
-            if withWeight:
-                selectClause.append("weight")
-
-            if withFlag:
-                selectClause.append("flag")
+            selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 
 
-            if withMismatch:
-                selectClause.append("mismatch")
-
-        if limit > 0 and not combine5p:
-            groupBy.append("LIMIT %d" % limit)
-
-        selectQuery = string.join(selectClause, ",")
-        groupQuery = string.join(groupBy)
+        groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
         if doUniqs:
             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
             if doMulti:
         if doUniqs:
             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
             if doMulti:
@@ -498,6 +445,7 @@ class ReadDataset():
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
+        resultsDict = {}
         if findallOptimize:
             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
             if self.memBacked:
         if findallOptimize:
             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
             if self.memBacked:
@@ -562,20 +510,17 @@ class ReadDataset():
         return resultsDict
 
 
         return resultsDict
 
 
-    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
-                       flag="", withWeight=False, withFlag=False, withMismatch=False,
-                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
-                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
-                       stop=-1, strand=""):
-        """ returns a dictionary of spliced reads in a variety of
-        formats and which can be restricted by chromosome or custom-flag.
-        Returns unique spliced reads for now.
-        """
-        whereClause = []
-        resultsDict = {}
+    def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
+        if splice:
+            startText = "startL"
+            stopText = "stopR"
+        else:
+            startText = "start"
+            stopText = "stop"
 
 
+        whereClause = []
         if chrom != "" and chrom != self.memChrom:
         if chrom != "" and chrom != self.memChrom:
-            whereClause = ["chrom = '%s'" % chrom]
+            whereClause.append("chrom = '%s'" % chrom)
 
         if flag != "":
             if flagLike:
 
         if flag != "":
             if flagLike:
@@ -584,25 +529,37 @@ class ReadDataset():
             else:
                 whereClause.append("flag = '%s'" % flag)
 
             else:
                 whereClause.append("flag = '%s'" % flag)
 
+        if start > -1:
+            whereClause.append("%s > %d" % (startText, start))
+
+        if stop > -1:
+            whereClause.append("%s < %d" % (stopText, stop))
+
+        if len(readLike) > 0:
+            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
+            whereClause.append(readIDClause)
+
         if hasMismatch:
             whereClause.append("mismatch != ''")
 
         if hasMismatch:
             whereClause.append("mismatch != ''")
 
-        if strand != "":
+        if strand in ["+", "-"]:
             whereClause.append("sense = '%s'" % strand)
 
             whereClause.append("sense = '%s'" % strand)
 
-        if start > -1:
-            whereClause.append("startL > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stopR < %d" % stop)
-
         if len(whereClause) > 0:
             whereStatement = string.join(whereClause, " and ")
             whereQuery = "where %s" % whereStatement
         else:
             whereQuery = ""
 
         if len(whereClause) > 0:
             whereStatement = string.join(whereClause, " and ")
             whereQuery = "where %s" % whereStatement
         else:
             whereQuery = ""
 
-        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+        return whereQuery
+
+
+    def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+
+        selectClause = [baseSelect]
+        if bothEnds:
+            selectClause.append("stop")
+
         if not noSense:
             selectClause.append("sense")
 
         if not noSense:
             selectClause.append("sense")
 
@@ -615,7 +572,36 @@ class ReadDataset():
         if withMismatch:
             selectClause.append("mismatch")
 
         if withMismatch:
             selectClause.append("mismatch")
 
-        selectQuery = string.join(selectClause, " ,")
+        selectQuery = string.join(selectClause, ",")
+
+        return selectQuery
+
+
+    def getReadGroupQuery(self, findallOptimize, limit, combine5p):
+        groupBy = []
+        if findallOptimize:
+            groupBy = ["GROUP BY start, sense"]
+
+        if limit > 0 and not combine5p:
+            groupBy.append("LIMIT %d" % limit)
+
+        groupQuery = string.join(groupBy)
+
+        return groupQuery
+
+
+    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+                       flag="", withWeight=False, withFlag=False, withMismatch=False,
+                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
+                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+                       stop=-1, strand=""):
+        """ returns a dictionary of spliced reads in a variety of
+        formats and which can be restricted by chromosome or custom-flag.
+        Returns unique spliced reads for now.
+        """
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
+        selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
+        selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
         if self.memBacked:
             sql = self.memcon.cursor()
         else:
         if self.memBacked:
             sql = self.memcon.cursor()
         else:
@@ -625,6 +611,7 @@ class ReadDataset():
         sql.execute(stmt)
         currentReadID = ""
         currentChrom = ""
         sql.execute(stmt)
         currentReadID = ""
         currentChrom = ""
+        resultsDict = {}
         for row in sql:
             pairID = 0
             readID = row["readID"]
         for row in sql:
             pairID = 0
             readID = row["readID"]
@@ -796,10 +783,6 @@ class ReadDataset():
         """ get readID's.
         """
         stmt = []
         """ get readID's.
         """
         stmt = []
-        limitPart = ""
-        if limit > 0:
-            limitPart = "LIMIT %d" % limit
-
         if uniqs:
             stmt.append("select readID from uniqs")
 
         if uniqs:
             stmt.append("select readID from uniqs")
 
@@ -814,6 +797,10 @@ class ReadDataset():
         else:
             selectPart = ""
 
         else:
             selectPart = ""
 
+        limitPart = ""
+        if limit > 0:
+            limitPart = "LIMIT %d" % limit
+
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
         if self.memBacked:
             sql = self.memcon.cursor()
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
         if self.memBacked:
             sql = self.memcon.cursor()
@@ -845,12 +832,11 @@ class ReadDataset():
                 print "getting mismatches from chromosome %s" % (achrom)
 
             snpDict[achrom] = []
                 print "getting mismatches from chromosome %s" % (achrom)
 
             snpDict[achrom] = []
-            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if useSplices and self.dataType == "RNA":
                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                 spliceIDList = spliceDict.keys()
             if useSplices and self.dataType == "RNA":
                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                 spliceIDList = spliceDict.keys()
-                for k in spliceIDList:
-                    spliceEntry = spliceDict[k][0]
+                for spliceID in spliceIDList:
+                    spliceEntry = spliceDict[spliceID][0]
                     startpos = spliceEntry["startL"]
                     lefthalf = spliceEntry["stopL"]
                     rightstart = spliceEntry["startR"]
                     startpos = spliceEntry["startL"]
                     lefthalf = spliceEntry["stopL"]
                     rightstart = spliceEntry["startR"]
@@ -881,6 +867,7 @@ class ReadDataset():
 
                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 
 
                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 
+            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if achrom not in hitDict.keys():
                 continue
 
             if achrom not in hitDict.keys():
                 continue
 
@@ -910,7 +897,7 @@ class ReadDataset():
 
 
     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 
 
     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
-                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+                        useSplices=False, normalizationFactor=1.0, trackStrand=False,
                         keepStrand="both", shiftValue=0):
         """return a profile of the chromosome as an array of per-base read coverage....
             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
                         keepStrand="both", shiftValue=0):
         """return a profile of the chromosome as an array of per-base read coverage....
             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
@@ -925,8 +912,8 @@ class ReadDataset():
         dataType = metadata["dataType"]
         scale = 1. / normalizationFactor
         shift = {}
         dataType = metadata["dataType"]
         scale = 1. / normalizationFactor
         shift = {}
-        shift['+'] = int(shiftValue)
-        shift['-'] = -1 * int(shiftValue)
+        shift["+"] = int(shiftValue)
+        shift["-"] = -1 * int(shiftValue)
 
         if cstop > 0:
             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 
         if cstop > 0:
             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
index 12077c1ec7d9d7e3ba499c67c99734a8a9a54157..c59dc9249b6c82ded0466f872e98a9900e702c46 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 print "altSpliceCounts: version 3.7"
 
 
 print "altSpliceCounts: version 3.7"
 
@@ -56,13 +56,13 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
     stopDict = {}
     resultDict = {}
 
     stopDict = {}
     resultDict = {}
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     readlen = hitRDS.getReadSize()
     hitDict = hitRDS.getSplicesDict(noSense=True)
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     readlen = hitRDS.getReadSize()
     hitDict = hitRDS.getSplicesDict(noSense=True)
-    outfile = open(outfilename,'w')
+    outfile = open(outfilename, "w")
 
     for chrom in hitDict:
         startDict[chrom] = []
 
     for chrom in hitDict:
         startDict[chrom] = []
@@ -155,7 +155,7 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
 
         resultDict[chrom].sort()
         for line in resultDict[chrom]:
 
         resultDict[chrom].sort()
         for line in resultDict[chrom]:
-            outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n'  % line)
+            outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n"  % line)
             alternative += 1
 
         print chrom, maxIndex, spliceEvent, altSpliceEvent
             alternative += 1
 
         print chrom, maxIndex, spliceEvent, altSpliceEvent
index 3bcd554e5406a09f31ecc988e8c60733616d0dd5..a6a344d549b907b98343d731b3a348dd9802d0ec 100755 (executable)
@@ -21,14 +21,16 @@ def main(argv=None):
 def bedToRegion(factor, infilename, outfilename):
     index = 1
     infile = open(infilename)
 def bedToRegion(factor, infilename, outfilename):
     index = 1
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
     for line in infile:
     for line in infile:
-        if 'track' in line:
+        if "track" in line:
             continue
             continue
+
         fields = line.split()
         fields = line.split()
-        line = string.join(fields, '\t')
-        outfile.write('%s%d\t%s\n' % (factor, index, line))
+        line = string.join(fields, "\t")
+        outfile.write("%s%d\t%s\n" % (factor, index, line))
         index += 1
         index += 1
+
     infile.close()
     outfile.close()
 
     infile.close()
     outfile.close()
 
index 63aa9558b8848183d4528bb16149cd0ced38aadd..27b48e22e8ecde5b4fd9c817ff17d4ab75dcbd13 100755 (executable)
@@ -1,4 +1,5 @@
 import sys
 import sys
+import string
 
 print "binstocdf: version 1.1"
 
 
 print "binstocdf: version 1.1"
 
@@ -6,19 +7,19 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s infile outfile' % sys.argv[0]
+    if len(argv) < 3:
+        print "usage: python %s infile outfile" % sys.argv[0]
         sys.exit(1)
 
         sys.exit(1)
 
-    infilename = argv[0]
-    outfilename = argv[1]
+    infilename = argv[1]
+    outfilename = argv[2]
 
     binToCDF(infilename, outfilename)
 
 
 def binToCDF(infilename, outfilename):
     infile = open(infilename)
 
     binToCDF(infilename, outfilename)
 
 
 def binToCDF(infilename, outfilename):
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
 
     for line in infile:
         fields = line.strip().split()
 
     for line in infile:
         fields = line.strip().split()
@@ -30,14 +31,15 @@ def binToCDF(infilename, outfilename):
             outfile.write(line)
             continue
 
             outfile.write(line)
             continue
 
-        outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3]))
-        cum = 0
+        outputFields = fields[:4]
+        runningTotal = 0
         for bin in fields[4:]:
         for bin in fields[4:]:
-            cum += int(bin)
-            percent = 100 * cum / total
-            outfile.write('\t%d' % percent)
+            runningTotal += int(bin)
+            percent = 100 * runningTotal / total
+            outputFields.append("%d" % percent)
 
 
-        outfile.write('\n')
+        outputLine = string.join(outputFields, "\t")
+        outfile.write("%s\n" % outputLine)
 
     infile.close()
     outfile.close()
 
     infile.close()
     outfile.close()
index c7b6dd0c801025552c365003cf9067b2550c6b0c..a49120cf574f9c0213bd569edfe8507f923435a2 100755 (executable)
@@ -78,7 +78,7 @@ def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
     if header.strip() == "":
         header = "#\t"
 
     if header.strip() == "":
         header = "#\t"
 
-    outfile.write( "%s\t%s\n" % (header, colID))
+    outfile.write("%s\t%s\n" % (header, colID))
 
     values = []
     min = 20000000000.
 
     values = []
     min = 20000000000.
index 71662442afff0bdeb68fb5604307f72b8cedda91..52963b7445a1e28c8f251c94a6220f5fe1dc92f6 100755 (executable)
@@ -8,11 +8,15 @@ def main(argv=None):
         print "usage: python %s bins percent infile" % sys.argv[0]
         sys.exit(1)
 
         print "usage: python %s bins percent infile" % sys.argv[0]
         sys.exit(1)
 
-    bins = int(argv[0])
-    percent = int(argv[1])
-    infilename = argv[2]
+    bins = int(argv[1])
+    percent = int(argv[2])
+    infilename = argv[3]
 
 
-    cdfDist(bins, percent, infilename)
+    printCdfDist(bins, percent, infilename)
+
+
+def printCdfDist(bins, percent, infilename):
+    print cdfDist(bins, percent, infilename)
 
 
 def cdfDist(bins, percent, infilename):
 
 
 def cdfDist(bins, percent, infilename):
@@ -30,7 +34,7 @@ def cdfDist(bins, percent, infilename):
             index += 1
 
     infile.close()
             index += 1
 
     infile.close()
-    print binsList
+    return binsList
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
index f6d1ff968c5260e35e876fb79a82ac201b088585..643aaeaa82b8bc0c444e33ae909d4f5859e2b1d7 100755 (executable)
--- a/colsum.py
+++ b/colsum.py
@@ -28,7 +28,7 @@ def colsum(fieldID, filename):
                 count += float(fields[fieldID])
             else:
                 count += int(fields[fieldID])
                 count += float(fields[fieldID])
             else:
                 count += int(fields[fieldID])
-        except ValueError:
+        except (ValueError, IndexError):
             pass
 
     infile.close()
             pass
 
     infile.close()
index ead4e1b81d0b37335fbf00908f1aea615cec33dd..77f30d6fc2acb2a9424634a138a03ee30e314f91 100755 (executable)
@@ -51,23 +51,8 @@ def makeParser(usage=""):
 
 def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
 
 
 def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
 
-    firstDict = {}
-    firstfile = open(firstfileName)
-    for line in firstfile:
-        fields = line.strip().split()
-        firstDict[fields[1]] = fields[-1]
-
-    firstfile.close()
-
-    expandedDict = {}
-    gidDict = {}
-    expandedfile = open(expandedfileName)
-    for line in expandedfile:
-        fields = line.strip().split()
-        expandedDict[fields[1]] = fields[-1]
-        gidDict[fields[1]] = fields[0]
-
-    expandedfile.close()
+    firstDict = getRPKMDict(firstfileName)
+    gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
@@ -97,5 +82,23 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.close()
 
 
     outfile.close()
 
 
+def getRPKMDict(rpkmFileName, getGIDDict=False):
+    gidDict = {}
+    rpkmDict = {}
+    rpkmFile = open(rpkmFileName)
+    for line in rpkmFile:
+        fields = line.strip().split()
+        rpkmDict[fields[1]] = fields[-1]
+        if getGIDDict:
+            gidDict[fields[1]] = fields[0]
+
+    rpkmFile.close()
+
+    if getGIDDict:
+        return gidDict, rpkmDict
+    else:
+        return rpkmDict
+
+
 if __name__ == "__main__":
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4c826b04ed6c668a3d5b2a66ee4b842789a483a0..2878423a9716a0e4f31d0a1738a0770339c8ed6c 100755 (executable)
@@ -10,7 +10,9 @@ except:
     pass
 
 import sys
     pass
 
 import sys
+import optparse
 import ReadDataset
 import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
 
 print "combinerds: version 1.2"
 
 
 print "combinerds: version 1.2"
 
@@ -19,97 +21,107 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0]
-        #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n'
+    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+    parser = makeParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
         sys.exit(1)
 
         sys.exit(1)
 
-    doCache = False
-    cachePages = -1
-    if '--cache' in argv:
-        doCache = True
-        try:
-            cachePages =  int(argv[sys.argv.index('-cache') + 1])
-        except: 
-            pass
-
-    datafile = argv[1]
-    infileList = []
-    for index in range(2, len(argv)):
-        if argv[index][0] == '-':
-            break
-        infileList.append(sys.argv[index])
+    datafile = args[0]
+    infileList = args[1:]
 
 
-    print "destination RDS: %s" % datafile
+    combinerds(datafile, infileList, options.tableList, options.withFlag, options.doIndex, options.cachePages, options.doInit, options.initRNA)
 
 
-    if '--initrna' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA')
-    elif '--init' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True)
 
 
-    withFlag = ''
-    if '--flag' in argv:
-        withFlag = argv[sys.argv.index('-flag') + 1]
-        print "restrict to flag = %s" % withFlag
+def makeParser():
+    usage = __doc__
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--table", action="append", dest="tablelist")
+    parser.add_option("--init", action="store_true", dest="doInit")
+    parser.add_option("--initrna", action="store_true", dest="initRNA")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--flag", dest="withFlag")
+
+    configParser = getConfigParser()
+    section = "combinerds"
+    doInit = getConfigBoolOption(configParser, section, "doInit", False)
+    initRNA = getConfigBoolOption(configParser, section, "initRNA", False)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+
+    parser.set_defaults(tableList=[], doInit=doInit, initRNA=initRNA, doIndex=doIndex, cachePages=cachePages,
+                        withFlag=withFlag)
+
+    return parser
+
 
 
-    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache)
+def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, cachePages=None, doInit=False, initRNA=False):
 
 
+    print "destination RDS: %s" % datafile
+    datasetType="DNA"
+    if initRNA:
+        doInit = True
+        datasetType="RNA"
+
+    doCache = False
+    if cachePages is not None:
+        doCache = True
+    else:
+        cachePages = -1
+
+    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache, initialize=doInit, datasetType=datasetType)
     if cachePages > rds.getDefaultCacheSize():
         rds.setDBcache(cachePages)
     if cachePages > rds.getDefaultCacheSize():
         rds.setDBcache(cachePages)
-        cacheVal = cachePages
     else:
     else:
-        cacheVal = rds.getDefaultCacheSize()
-
-    doIndex = False
-    if '--index' in argv:
-        doIndex = True
+        cachePages = rds.getDefaultCacheSize()
 
 
-    tableList = []
-    if '--table' in argv:
-        tableList.append(argv[argv.index('-table') + 1])
-    else:
+    if tableList == []:
         tableList = rds.getTables()
 
         tableList = rds.getTables()
 
-    combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache)
-
+    if withFlag != "":
+        print "restrict to flag = %s" % withFlag
 
 
-def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False):
     metaDict = rds.getMetadata()
     if "numberImports" not in metaDict:
         origIndex = 0
     metaDict = rds.getMetadata()
     if "numberImports" not in metaDict:
         origIndex = 0
-        rds.insertMetadata([("numberImports", str(0))])
+        rds.insertMetadata([("numberImports", "0")])
     else:
         origIndex = int(metaDict["numberImports"])
 
     index = origIndex
     for inputfile in infileList:
     else:
         origIndex = int(metaDict["numberImports"])
 
     index = origIndex
     for inputfile in infileList:
-        asname = "input" + str(index)
-        rds.attachDB(inputfile,asname)
+        dbName = "input%s" % str(index)
+        rds.attachDB(inputfile, dbName)
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
-            ascols = "*"
+            dbColumns = "*"
             if table == "uniqs":
             if table == "uniqs":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "multi":
             elif table == "multi":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "splices":
             elif table == "splices":
-                ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname
-            elif table == "metadata":
-                ascols = "name, value || ' (import_%d)'" % index
-                rds.importFromDB(asname, table, ascols)
+                dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
 
 
-            if table != "metadata":
-                rds.importFromDB(asname, table, ascols, withFlag)
+            if table == "metadata":
+                dbColumns = "name, value || ' (import_%d)'" % index
+                rds.importFromDB(dbName, table, dbColumns)
+            else:
+                rds.importFromDB(dbName, table, dbColumns, withFlag)
 
 
-        rds.detachDB(asname)
-        rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))])
+        rds.detachDB(dbName)
+        rds.insertMetadata([("import_%s" % str(index), "%s %s" % (inputfile, str(tableList)))])
         index += 1
 
     rds.updateMetadata("numberImports", index, origIndex)
     if doIndex:
         print "building index...."
         index += 1
 
     rds.updateMetadata("numberImports", index, origIndex)
     if doIndex:
         print "building index...."
-        if cacheVal > 0:
-            rds.buildIndex(cacheVal)
+        if cachePages > 0:
+            rds.buildIndex(cachePages)
         else:
             rds.buildIndex()
 
         else:
             rds.buildIndex()
 
index 821936a30bc2524f0aa1664de1a66ecb8ae47fd6..f526daacca0dc91245e71205363216bc17eba2e9 100755 (executable)
@@ -17,6 +17,9 @@ import Region
 commoncodeVersion = 5.6
 currentRDSversion = 2.0
 
 commoncodeVersion = 5.6
 currentRDSversion = 2.0
 
+class ErangeError(Exception):
+    pass
+
 
 def getReverseComplement(base):
     revComp = {"A": "T",
 
 def getReverseComplement(base):
     revComp = {"A": "T",
@@ -65,12 +68,12 @@ def getGeneAnnotDict(genome, inRAM=False):
     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 
 
     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 
 
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
-    hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+    genome = Genome(genomeName, inRAM=inRAM)
     if extendGenome != "":
     if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
+        genome.extendFeatures(extendGenome, replace=replaceModels)
 
 
-    geneannotDict = hg.allAnnotInfo()
+    geneannotDict = genome.allAnnotInfo()
 
     return geneannotDict
 
 
     return geneannotDict
 
@@ -189,11 +192,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     mergeCount = 0
     chromField = int(chromField)
     count = 0
     mergeCount = 0
     chromField = int(chromField)
     count = 0
-    #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
-    #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
-    #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
-    #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
-    #      be merged with A but that will exit the loop and the merge with C will be missed.
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -336,7 +334,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     """
 
     if shift == "auto":
     """
 
     if shift == "auto":
-        shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+        shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 
     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 
 
     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 
@@ -346,7 +344,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
     topPos = getPeakPositionList(smoothArray, length)
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
     topPos = getPeakPositionList(smoothArray, length)
-    peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+    peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 
     if leftPlus:
         numLeftPlus = 0
 
     if leftPlus:
         numLeftPlus = 0
@@ -366,12 +364,12 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     return peak
 
 
     return peak
 
 
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
     bestShift = 0
     lowestScore = 20000000000
     for testShift in xrange(maxShift + 1):
         shiftArray = array("f", [0.] * length)
     bestShift = 0
     lowestScore = 20000000000
     for testShift in xrange(maxShift + 1):
         shiftArray = array("f", [0.] * length)
-        for read in hitList:
+        for read in readList:
             currentpos = read["start"] - start
             if read["sense"] == "+":
                 currentpos += testShift
             currentpos = read["start"] - start
             if read["sense"] == "+":
                 currentpos += testShift
@@ -381,7 +379,7 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
-            if doWeight:
+            if useWeight:
                 weight = read["weight"]
             else:
                 weight = 1.0
                 weight = read["weight"]
             else:
                 weight = 1.0
@@ -432,7 +430,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left
             hitIndex += 1
             currentpos += 1
 
             hitIndex += 1
             currentpos += 1
 
-        while hitIndex < readlen and  currentpos < length:
+        while hitIndex < readlen and currentpos < length:
             seqArray[currentpos] += weight
             hitIndex += 1
             currentpos += 1
             seqArray[currentpos] += weight
             hitIndex += 1
             currentpos += 1
@@ -450,7 +448,7 @@ def getPeakPositionList(smoothArray, length):
         if topNucleotide < smoothArray[currentpos]:
             topNucleotide = smoothArray[currentpos]
             peakList = [currentpos]
         if topNucleotide < smoothArray[currentpos]:
             topNucleotide = smoothArray[currentpos]
             peakList = [currentpos]
-        elif topNucleotide  == smoothArray[currentpos]:
+        elif topNucleotide == smoothArray[currentpos]:
             peakList.append(currentpos)
 
     return peakList
             peakList.append(currentpos)
 
     return peakList
@@ -562,7 +560,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         return featuresByChromDict
 
 
         return featuresByChromDict
 
 
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
     """ return a dictionary of gene loci. Can be used to retrieve additional
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
     """ return a dictionary of gene loci. Can be used to retrieve additional
@@ -594,27 +592,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
         return locusByChromDict
 
         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
         return locusByChromDict
 
-    genome = genomeObject.genome
-    featuresDict = genomeObject.getallGeneFeatures()
-    if len(additionalRegionsDict) > 0:
-        sortList = []
-        for chrom in additionalRegionsDict:
-            for region in additionalRegionsDict[chrom]:
-                label = region.label
-                if label not in sortList:
-                    sortList.append(label)
-
-                if label not in featuresDict:
-                    featuresDict[label] = []
-                    sense = "+"
-                else:
-                    sense = featuresDict[label][0][-1]
-
-                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
-        for gid in sortList:
-            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
-
+    genomeName = genome.genome
+    featuresDict = getGeneFeatures(genome, additionalRegionsDict)
     for gid in featuresDict:
         featureList = featuresDict[gid]
         newFeatureList = []
     for gid in featuresDict:
         featureList = featuresDict[gid]
         newFeatureList = []
@@ -647,34 +626,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
                 gstart -= distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
                 gstop += distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstop = newFeatureList[0][0] + lengthCDS
+            if 0 < lengthCDS < glen:
+                gstop = newFeatureList[0][0] + lengthCDS
 
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstart = newFeatureList[-1][1] + lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstart = newFeatureList[-1][1] + lengthCDS
         else:
             if not useCDS and upstream > 0:
                 if upstreamSpanTSS:
         else:
             if not useCDS and upstream > 0:
                 if upstreamSpanTSS:
@@ -692,36 +665,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
                 gstop += distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
                 gstart -= distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstart = newFeatureList[-1][-1] - lengthCDS
+            if 0 < lengthCDS < glen:
+                gstart = newFeatureList[-1][-1] - lengthCDS
 
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstop = newFeatureList[0][0] - lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstop = newFeatureList[0][0] - lengthCDS
 
 
-        glen = abs(gstop - gstart)
         if chrom not in locusByChromDict:
             locusByChromDict[chrom] = []
 
         if chrom not in locusByChromDict:
             locusByChromDict[chrom] = []
 
@@ -736,6 +702,30 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
     return locusByChromDict
 
 
     return locusByChromDict
 
 
+def getGeneFeatures(genome, additionalRegionsDict):
+    featuresDict = genome.getallGeneFeatures()
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    return featuresDict
+
+
 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                       binLength=-1):
 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                       binLength=-1):
@@ -798,7 +788,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                 rlen = regionTuple[lengthField]
                 try:
                     rsense = regionTuple[senseField]
                 rlen = regionTuple[lengthField]
                 try:
                     rsense = regionTuple[senseField]
-                except:
+                except IndexError:
                     rsense = "F"
 
                 if tagStart > stop:
                     rsense = "F"
 
                 if tagStart > stop:
index 5bc25321b25b8a30e9b09511cabbee4355010b71..be403109110c3fb40f7bba831f6c2ea48e496536 100755 (executable)
@@ -85,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
 
     print time.ctime()
 
 
     print time.ctime()
 
-    if doSplices:
-        print "getting splices"
-        splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
-        print "got splices"
-
     print "getting uniq reads"    
     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
     print "got uniqs"
 
     if doSplices:
     print "getting uniq reads"    
     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
     print "got uniqs"
 
     if doSplices:
-        for readID in splicesDict:
-            theRead = splicesDict[readID]
-            read0 = theRead[0]
-            del read0[1]
-            try:
-                uniqDict[readID].append(read0)
-            except:
-                if len(theRead) == 4:
-                    read2 = theRead[2]
-                    del read2[1]
-                    uniqDict[readID] = [read0,read2]
+        addSplicesToUniqReads(RDS, uniqDict)
 
     if doVerbose:
         print len(uniqDict), time.ctime()
 
     outfile = open(outfilename,"w")
 
     if doVerbose:
         print len(uniqDict), time.ctime()
 
     outfile = open(outfilename,"w")
-
     diffChrom = 0
     distal = 0
     total = 0
     diffChrom = 0
     distal = 0
     total = 0
@@ -132,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
                     continue
                 else:
                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
                     continue
                 else:
                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print diffChrom, outline
             else:
                 dist = abs(start1 - start2)
                     if doVerbose:
                         print diffChrom, outline
             else:
                 dist = abs(start1 - start2)
-
                 if minDist < dist < maxDist:
                     distal += 1
                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
                 if minDist < dist < maxDist:
                     distal += 1
                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print distal, outline
 
                     if doVerbose:
                         print distal, outline
 
@@ -157,5 +140,22 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
     print time.ctime()
 
 
     print time.ctime()
 
 
+def addSplicesToUniqReads(RDS, uniqDict):
+    print "getting splices"
+    splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+    print "got splices"
+    for readID in splicesDict:
+        theRead = splicesDict[readID]
+        read0 = theRead[0]
+        del read0[1]
+        try:
+            uniqDict[readID].append(read0)
+        except:
+            if len(theRead) == 4:
+                read2 = theRead[2]
+                del read2[1]
+                uniqDict[readID] = [read0,read2]
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 00cc91844b23084d712573261e996524739c9870..712e60eff9815f9728b6a03983385f83d96bc6f5 100644 (file)
@@ -14,8 +14,9 @@ except:
 import sys
 import time
 import optparse
 import sys
 import time
 import optparse
+import string
 import ReadDataset
 import ReadDataset
-from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
 
 print "farPairs: version 1.4"
 
 
 print "farPairs: version 1.4"
 
@@ -38,14 +39,13 @@ def main(argv=None):
     outfilename = args[1]
     outbedname = args[2]
 
     outfilename = args[1]
     outbedname = args[2]
 
-    farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+    farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
              options.cachePages, options.minDist, options.maxDist, options.minCount,
              options.label)
 
 
 def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
              options.cachePages, options.minDist, options.maxDist, options.minCount,
              options.label)
 
 
 def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--verbose", action="store_true", dest="doVerbose")
     parser.add_option("--minDist", type="int", dest="minDist")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--verbose", action="store_true", dest="doVerbose")
     parser.add_option("--minDist", type="int", dest="minDist")
@@ -55,7 +55,6 @@ def getParser(usage):
 
     configParser = getConfigParser
     section = "farPairs"
 
     configParser = getConfigParser
     section = "farPairs"
-    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
@@ -63,15 +62,33 @@ def getParser(usage):
     minCount = getConfigIntOption(configParser, section, "minCount", 2)
     label = getConfigOption(configParser, section, "label", None)
 
     minCount = getConfigIntOption(configParser, section, "minCount", 2)
     label = getConfigOption(configParser, section, "label", None)
 
-    parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+    parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
 
     return parser
 
 
                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
 
     return parser
 
 
-def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
 
              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
 
+    flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
+
+    outfile = open(outfilename, "w")
+    for region in flagDict:
+        regionConnections = countDuplicatesInList(flagDict[region])
+        for (connectedRegion, count) in regionConnections:
+            if count >= minCount:
+                outline = "%s\t%s\t%d" % (region, connectedRegion, count)
+                print >> outfile, outline
+                if doVerbose:
+                    print outline
+
+    outfile.close()
+    if doVerbose:
+        print "finished: ", time.ctime()
+
+
+def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
     doCache = False
     if cachePages is not None:
         doCache = True
     doCache = False
     if cachePages is not None:
         doCache = True
@@ -86,11 +103,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
 
     if doVerbose:
         print time.ctime()
 
     if doVerbose:
         print time.ctime()
-
-    total = 0
-    outfile = open(outfilename, "w")
+    
     outbed = open(outbedname, "w")
     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
     outbed = open(outbedname, "w")
     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+    outbed.close()
 
     readlen = RDS.getReadSize()
     flagDict = {}
 
     readlen = RDS.getReadSize()
     flagDict = {}
@@ -98,84 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
         if doNotProcessChromosome(chromosome):
             continue
 
         if doNotProcessChromosome(chromosome):
             continue
 
-        print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
-        if doVerbose:
-            print len(uniqDict), time.ctime()
-
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                start1 = readList[0]["start"]
-                flag1 = readList[0]["flag"]
-                start2 = readList[1]["start"]
-                flag2 = readList[1]["flag"]
-
-                if flag1 != flag2:
-                    dist = abs(start1 - start2)
-                    startList = [start1, start2]
-                    stopList = [start1 + readlen, start2 + readlen]
-                    startList.sort()
-                    stopList.sort()
-                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
-                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
-                        outbed.write(outputLine)
-                        if doVerbose:
-                            print flag1, flag2, dist
-
-                        try:
-                            flagDict[flag1].append((flag2, start1, start2))
-                        except KeyError:
-                            flagDict[flag1] = [(flag2, start1, start2)]
-
-                        try:
-                            flagDict[flag2].append((flag1, start1, start2))
-                        except KeyError:
-                            flagDict[flag2] = [(flag2, start1, start2)]
+        writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
 
     print "%d connected regions" % len(flagDict)
 
 
     print "%d connected regions" % len(flagDict)
 
-    for region in flagDict:
-        flagDict[region].sort()
-        regionConnections = {}
-        for (region2, start1, start2) in flagDict[region]:
+    return flagDict
+
+
+def doNotProcessChromosome(chrom):
+    return chrom == "chrM"
+
+
+def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
+    outbed = open(outbedname, "a")
+    print chromosome
+    uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+    if doVerbose:
+        print len(uniqDict), time.ctime()
+
+    for readID in uniqDict:
+        readList = uniqDict[readID]
+        if readsAreFarPair(readList, minDist, maxDist):
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            startList = [start1, start2]
+            startList.sort()
+            outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
+            outbed.write(outputLine)
+            flag1 = readList[0]["flag"]
+            flag2 = readList[1]["flag"]
+            if doVerbose:
+                print flag1, flag2, abs(start1 - start2)
+
             try:
             try:
-                regionConnections[region2] += 1
+                flagDict[flag1].append(flag2)
             except KeyError:
             except KeyError:
-                regionConnections[region2] = 1
+                flagDict[flag1] = [flag2]
 
 
-        for region2 in regionConnections:
-            if regionConnections[region2] >= minCount:
-                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
-                if doVerbose:
-                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+            try:
+                flagDict[flag2].append(flag1)
+            except KeyError:
+                flagDict[flag2] = [flag1]
 
 
-    outfile.close()
     outbed.close()
     outbed.close()
-    if doVerbose:
-        print "finished: ", time.ctime()
 
 
 
 
-def doNotProcessChromosome(chrom):
-    return chrom == "chrM"
+def readsAreFarPair(readList, minDist, maxDist):
+    isFarPair = False
+    if len(readList) == 2:
+        flag1 = readList[0]["flag"]
+        flag2 = readList[1]["flag"]
+        if flag1 != flag2 and flag1 != "" and flag2 != "":
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            dist = abs(start1 - start2)
+            if minDist < dist < maxDist:
+                isFarPair = True
 
 
+    return isFarPair
 
 
-def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
-    readSizes = "%d" % (stopList[0] - startList[0])
-    readCoords = "0"
+
+def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
+    sizes = ["%d" % readlen]
+    coords = ["0"]
     leftStart = startList[0] - 1
     leftStart = startList[0] - 1
-    rightStop = stopList[-1]
+    rightStop = startList[-1]
     for index in range(1, numPieces):
     for index in range(1, numPieces):
-        readSizes += ",%d" % (stopList[index] - startList[index] + 1)
-        readCoords += ",%d" % (startList[index] - startList[0])
+        sizes.append("%d" % (readlen + 1))
+        coords.append("%d" % (startList[index] - startList[0]))
 
     if rsense == "+":
 
     if rsense == "+":
-        senseCode = plusSense
+        senseCode = plusSenseColor
     else:
     else:
-        senseCode = minusSense
+        senseCode = minusSenseColor
 
 
+    readSizes = string.join(sizes, ",")
+    readCoords = string.join(coords, ",")
     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+
     return outline
 
 
     return outline
 
 
index cd357cc072c5cc7aa4460371439ef59c428873f5..c24b79edb312fae6c0b19f0d7e3d3ffb45d12984 100755 (executable)
@@ -38,7 +38,7 @@ def main(argv=None):
 
 def makeParser(usage=""):
     parser = optparse.OptionParser(usage=usage)
 
 def makeParser(usage=""):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cistype", action="store_false", dest="cistype")
+    parser.add_option("--cistype", dest="cistype")
     parser.add_option("--radius", type="int", dest="radius")
 
     configParser = getConfigParser()
     parser.add_option("--radius", type="int", dest="radius")
 
     configParser = getConfigParser()
@@ -52,10 +52,19 @@ def makeParser(usage=""):
 
 
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
 
 
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
-    tabfile = open(tabFileName)
-    previous = ""
+    
+    posList = getPositionList(tabFileName)
+    feats = featuresIntersecting("human", posList, radius, cistype)
+    featkeys = feats.keys()
+    featkeys.sort()
+    for (chrom, pos) in featkeys:
+        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+
 
 
+def getPositionList(tabFileName):
+    previous = ""
     posList = []
     posList = []
+    tabfile = open(tabFileName)
     for line in tabfile:
         fields = line.split("\t")
         current = fields[0]
     for line in tabfile:
         fields = line.split("\t")
         current = fields[0]
@@ -66,11 +75,7 @@ def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
         chrom = fields[1][3:]
         posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
 
         chrom = fields[1][3:]
         posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
 
-    feats = featuresIntersecting("human", posList, radius, cistype)
-    featkeys = feats.keys()
-    featkeys.sort()
-    for (chrom, pos) in featkeys:
-        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+    return posList
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
index d20608f1997e2e6a0fd84d9169c78025485e4b57..ec75efb54f3680c892578a38811c1bd241c37241 100755 (executable)
@@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
-
-    if not doTrim:
-        trimString = "none"
+    doControl = False
+    if mockfile is not None:
+        doControl = True
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
-    doControl = False
-    if mockfile is not None:
-        doControl = True
-
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
@@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     else:
         pValueType = "self"
 
     else:
         pValueType = "self"
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        doCache = False
-        cachePages = -1
-
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
@@ -242,6 +225,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -258,13 +247,22 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
+    if trimValue is not None:
+        trimValue = float(trimValue) / 100.
+        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+    else:
+        trimValue = 0.1
+        trimString = "10%"
+
+    if not doTrim:
+        trimString = "none"
+
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
-
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
@@ -464,7 +462,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
@@ -535,7 +533,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
@@ -558,7 +556,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
-            if normalize:
+            if normalizeToRPM:
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
@@ -567,7 +565,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
                 #first pass uses getFoldRatio on mockRDS as there may not be control
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
                 #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -577,8 +575,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
-                    numLeft = peak.numLeft
-                    if normalize:
+                    numLeft = peak.numLeftPlus
+                    if normalizeToRPM:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
@@ -605,11 +603,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeft
-                        if normalize:
+                        numLeft = trimPeak.numLeftPlus
+                        if normalizeToRPM:
                             sumAll /= rdsSampleSize
 
                             sumAll /= rdsSampleSize
 
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
@@ -619,16 +617,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         except:
                             continue
 
                         except:
                             continue
 
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             sumMulti /= rdsSampleSize
 
                         try:
                             sumMulti /= rdsSampleSize
 
                         try:
@@ -759,4 +755,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift,
     return footer
 
 if __name__ == "__main__":
     return footer
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4ce97e4fc1380517447153e59fba80e5531e3029..91db4a1a44870602383a072d7a4585568026f165 100755 (executable)
@@ -14,7 +14,7 @@ import sys
 import optparse
 import ReadDataset
 from cistematic.genomes import Genome
 import optparse
 import ReadDataset
 from cistematic.genomes import Genome
-from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
 
 print "geneDownstreamBins: version 2.1"
 
 
 print "geneDownstreamBins: version 2.1"
 
@@ -53,8 +53,6 @@ def makeParser(usage=""):
 
 
 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
 
 
 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
-    bins = 10
-    standardMinThresh = standardMinDist / bins
 
     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     normalizationFactor = 1.0
 
     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     normalizationFactor = 1.0
@@ -66,95 +64,114 @@ def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCac
     geneinfoDict = getGeneInfoDict(genome, cache=True)
     hg = Genome(genome)
     featuresDict = hg.getallGeneFeatures()
     geneinfoDict = getGeneInfoDict(genome, cache=True)
     hg = Genome(genome)
     featuresDict = hg.getallGeneFeatures()
-
     outfile = open(outfilename, "w")
     outfile = open(outfilename, "w")
-
     gidList = hg.allGIDs()
     gidList.sort()
     for gid in gidList:
     gidList = hg.allGIDs()
     gidList.sort()
     for gid in gidList:
-        symbol = "LOC" + gid
-        geneinfo = ""
-        featureList = []
         try:
         try:
-            geneinfo = geneinfoDict[gid]
-            featureList = featuresDict[gid]
-            symbol = geneinfo[0][0]
-        except:
+            featuresList = featuresDict[gid]
+        except KeyError:
             print gid
 
             print gid
 
-        if len(featureList) == 0:
+        try:
+            binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
+        except ErangeError:
             continue
 
             continue
 
-        newfeatureList = []
-        for (ftype, chrom, start, stop, fsense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
+        tagCount *= normalizationFactor
+        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
+        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
+        for binAmount in binList:
+            outfile.write("\t%.2f" % binAmount)
 
 
-        if chrom not in hitDict:
-            continue
+        outfile.write("\n")
 
 
-        newfeatureList.sort()
-        if len(newfeatureList) < 1:
-            continue
+    outfile.close()
 
 
-        glen = standardMinDist
-        if fsense == "F":
-            nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
 
 
-            if glen < 1:
-                glen = 1
+def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
 
 
-            gstart = newfeatureList[-1][1]
-        else:
-            nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
+    symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
+    geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
+    if geneLength < standardMinDist:
+        raise ErangeError("gene length less than minimum")
 
 
-            if glen < 1:
-                glen = 1
+    binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
+    if tagCount < 2:
+        raise ErangeError("tag count less than minimum")
 
 
-            gstart = newfeatureList[0][0] - glen
-            if gstart < 0:
-                gstart = 0
+    return binList, symbol, geneLength, tagCount
 
 
-        tagCount = 0
-        if glen < standardMinDist:
-            continue
 
 
-        binList = [0.] * bins
-        for read in hitDict[chrom]:
-            tagStart = read["start"]
-            weight = read["weight"]
-            tagStart -= gstart
-            if tagStart >= glen:
-                break
-
-            if tagStart > 0:
-                tagCount += weight
-                if fsense == "F":
-                    # we are relying on python's integer division quirk
-                    binID = tagStart / standardMinThresh 
-                    binList[binID] += weight
-                else:
-                    rdist = glen - tagStart
-                    binID = rdist / standardMinThresh 
-                    binList[binID] += weight
-
-        if tagCount < 2:
-            continue
+def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
+    if len(featureList) == 0:
+        raise ErangeError("no features found")
 
 
-        tagCount *= normalizationFactor
-        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
-        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
-        for binAmount in binList:
-            outfile.write("\t%.2f" % binAmount)
+    symbol = "LOC%s" % gid
+    geneinfo = ""
+    try:
+        geneinfo = geneinfoDict[gid]
+        symbol = geneinfo[0][0]
+    except KeyError:
+        print gid
 
 
-        outfile.write("\n")
+    newfeatureList = []
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+
+    if len(newfeatureList) < 1:
+        raise ErangeError("no features found")
+
+    if chrom not in chromosomeList:
+        raise ErangeError("chromosome not found in reads")
+
+    newfeatureList.sort()
+
+    return symbol, newfeatureList, sense, chrom
 
 
-    outfile.close()
 
 
+def getGeneStats(genome, gid, minDistance, featureList, sense):
+    geneLength = minDistance
+    if sense == "F":
+        nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = featureList[-1][1]
+    else:
+        nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = max(featureList[0][0] - geneLength, 0)
+
+    if nextGene < geneLength * 2:
+        geneLength = nextGene / 2
+
+    geneLength = max(geneLength, 1)
+
+    return geneStart, geneLength
+
+
+def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
+    tagCount = 0
+    bins = 10
+    standardMinThresh = standardMinDist / bins
+    binList = [0.] * bins
+    for read in readList:
+        tagStart = read["start"]
+        if tagStart >= geneLength:
+            break
+
+        tagStart -= geneStart
+        weight = read["weight"]
+        if tagStart > 0:
+            tagCount += weight
+            if sense == "F":
+                # we are relying on python's integer division quirk
+                binID = tagStart / standardMinThresh 
+            else:
+                rdist = geneLength - tagStart
+                binID = rdist / standardMinThresh
+
+            binList[binID] += weight
+
+    return binList, tagCount
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 6a1efc48e270d92a8453e8fcb6907db00185ed92..49745dc0c1361c2a8dbe4736c9c8630396fd93e4 100755 (executable)
@@ -12,6 +12,7 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
+import string
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
@@ -92,68 +93,78 @@ def getParser(usage):
 
     return parser
 
 
     return parser
 
+
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
 
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
 
 
-    outfile = open(outfilename,'w')
 
 
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
     for gid in gidList:
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except KeyError:
                 pass
         else:
             symbol = gid
                 pass
         else:
             symbol = gid
+
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
         for binAmount in gidBins[gid]:
             if normalizeBins:
         for binAmount in gidBins[gid]:
             if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
     outfile.close()
 
 
 if __name__ == "__main__":
     outfile.close()
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 38e853aec6172be26b448066692193f1537b6926..5299d27d26c23f8729063eb31f0da390730b5ce5 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 import sys
 import optparse
 
 import sys
 import optparse
@@ -68,12 +68,6 @@ def makeParser(usage=""):
     return parser
 
 
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
@@ -206,7 +200,6 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
-        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -271,4 +264,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4091281c8fe0f5410090b23db7276496fe49d6ed..e95d72693c76df13871da4cf809c99e43c41d36a 100755 (executable)
@@ -68,7 +68,6 @@ def doNotProcessLine(line):
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
-    #TODO: refactor naming. is fields[8] rpkm?
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
@@ -93,4 +92,4 @@ def getNovelSNPInfo(genome, snpEntry, hg):
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 37576ca867fb2283f9a5d497c0bbca7d06191f7e..17520401c52b5d153404aa4312a800017e9af3bb 100755 (executable)
@@ -20,6 +20,9 @@ import ReadDataset
 verstring = "makerdsfromblat: version 3.10"
 print verstring
 
 verstring = "makerdsfromblat: version 3.10"
 print verstring
 
+NUM_HEADER_LINES = 5
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -98,24 +101,15 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
                     verbose=False, cachePages=100000, geneDataFileName="",
                     propertyList=[]):
 
                     verbose=False, cachePages=100000, geneDataFileName="",
                     propertyList=[]):
 
-    delimiter = "|"
-    minIntron = 10
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
+    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
     if forceRNA:
         print "forcing datatype to RNA"
         dataType = "RNA"
 
     if forceRNA:
         print "forcing datatype to RNA"
         dataType = "RNA"
 
-    if dataType == "RNA":
-        genedatafile = open(geneDataFileName)
-
-    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
-
     geneDict = {}
     mapDict = {}
     if dataType == "RNA" and not forceRNA:
     geneDict = {}
     mapDict = {}
     if dataType == "RNA" and not forceRNA:
+        genedatafile = open(geneDataFileName)
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -164,9 +158,10 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
 
     # make some assumptions based on first read
     infile = open(filename, "r")
 
     # make some assumptions based on first read
     infile = open(filename, "r")
-    for arg in range(6):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
         line = infile.readline()
 
+    line = infile.readline()
     fields = line.split()
     readsize = int(fields[10])
     pairedTest = fields[9][-2:]
     fields = line.split()
     readsize = int(fields[10])
     pairedTest = fields[9][-2:]
@@ -186,8 +181,9 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
         rds.insertMetadata([("blat_mapped", "True")])
 
     minReadScore = readsize - readsize/25 - 1
         rds.insertMetadata([("blat_mapped", "True")])
 
     minReadScore = readsize - readsize/25 - 1
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -199,9 +195,12 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
     index = uIndex = mIndex = sIndex = lIndex = 0
     bestScore = 0
     # skip headers
     index = uIndex = mIndex = sIndex = lIndex = 0
     bestScore = 0
     # skip headers
-    for arg in range(5):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
         line = infile.readline()
 
+    insertSize = 100000
+    delimiter = "|"
+    minIntron = 10
     for line in infile:
         lIndex += 1
         fields = line.strip().split()
     for line in infile:
         lIndex += 1
         fields = line.strip().split()
index 14be260cdf0721b0514098a22d3a07423756c41b..eadb66b83d59e140b0402a9ea76dd351219abad0 100755 (executable)
@@ -87,29 +87,13 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
                       propertyList=[]):
 
                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
                       propertyList=[]):
 
-    delimiter = "|"
+    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
 
 
+    geneDict = {}
     dataType = "DNA"
     if genedatafilename is not None:
         dataType = "RNA"
         genedatafile = open(genedatafilename)
     dataType = "DNA"
     if genedatafilename is not None:
         dataType = "RNA"
         genedatafile = open(genedatafilename)
-
-
-    forcePair = False
-    if forceID is not None:
-        forcePair = True
-    else:
-        forceID = 0
-
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
-    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
-
-    geneDict = {}
-    mapDict = {}
-    if dataType == "RNA":
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -130,7 +114,6 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                 totalLength += exonLengths[index]
 
             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
                 totalLength += exonLengths[index]
 
             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
 
         genedatafile.close()
 
 
         genedatafile.close()
 
@@ -164,12 +147,17 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     fields = line.split()
     readsize = len(fields[5])
     pairedTest = fields[0][-2:]
     fields = line.split()
     readsize = len(fields[5])
     pairedTest = fields[0][-2:]
+    forcePair = False
+    if forceID is not None:
+        forcePair = True
+    else:
+        forceID = 0
+
     paired = False
     if pairedTest in ["/1", "/2"] or forcePair:
         print "assuming reads are paired"
         paired = True
 
     paired = False
     if pairedTest in ["/1", "/2"] or forcePair:
         print "assuming reads are paired"
         paired = True
 
-
     print "read size: %d bp" % readsize
     if init:
         rds.insertMetadata([("readsize", readsize)])
     print "read size: %d bp" % readsize
     if init:
         rds.insertMetadata([("readsize", readsize)])
@@ -184,8 +172,9 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
 
     infile.close()
 
 
     infile.close()
 
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -195,6 +184,8 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     mInsertList = []
     sInsertList = []
     index = uIndex = mIndex = sIndex = lIndex = 0
     mInsertList = []
     sInsertList = []
     index = uIndex = mIndex = sIndex = lIndex = 0
+    delimiter = "|"
+    insertSize = 100000
     for line in infile:
         lIndex += 1
         if stripSpace:
     for line in infile:
         lIndex += 1
         if stripSpace:
index 66209eef973f36072ba964aac73f8579685aed99..b11848d7a93e3968d7b76b286fa415d04ffa078d 100755 (executable)
@@ -106,8 +106,6 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     insertSize = 100000
 
     geneDict = {}
     insertSize = 100000
 
     geneDict = {}
-    mapDict = {}
-    seenSpliceList = []
     if dataType == 'RNA':
         genedatafile = open(geneDataFileName)
         for line in genedatafile:
     if dataType == 'RNA':
         genedatafile = open(geneDataFileName)
         for line in genedatafile:
@@ -120,17 +118,11 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
             chrom = fields[1]
             sense = fields[2]
             chromstarts = fields[8][:-1].split(',')
             chrom = fields[1]
             sense = fields[2]
             chromstarts = fields[8][:-1].split(',')
-            chromstops = fields[9][:-1].split(',')
-            exonLengths = []
-            totalLength = 0
             for index in range(blockCount):
                 chromstarts[index] = int(chromstarts[index])
             for index in range(blockCount):
                 chromstarts[index] = int(chromstarts[index])
-                chromstops[index] = int(chromstops[index])
-                exonLengths.append(chromstops[index] - chromstarts[index])
-                totalLength += exonLengths[index]
 
 
-            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
+            geneDict[uname] = (sense, blockCount, chrom, chromstarts)
+
         genedatafile.close()
 
     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
         genedatafile.close()
 
     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
@@ -322,6 +314,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     print '%d unique reads' % index
     infile.close()
 
     print '%d unique reads' % index
     infile.close()
 
+    seenSpliceList = []
     if dataType == 'RNA':
         print 'mapping splices...'
         index = 0
     if dataType == 'RNA':
         print 'mapping splices...'
         index = 0
@@ -441,7 +434,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
                     print fields
                     continue
 
                     print fields
                     continue
 
-                (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+                (sense, blockCount, chrom, chromstarts) = geneDict[model]
                 if extended:
                     if 'F' in thepos:
                         rsense = '+'
                 if extended:
                     if 'F' in thepos:
                         rsense = '+'
index bfd6e1c48b3d4a449ea03d8f0191f0abd63e9d07..2df39a9412ab6794a1c437bead4b476c482e3918 100755 (executable)
@@ -8,6 +8,14 @@ import sys
 from cistematic.genomes import Genome
 
 
 from cistematic.genomes import Genome
 
 
+class GeneSymbolAndCount():
+
+    def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
+        self.symbol = symbol
+        self.uniqueCount = uniqueCount
+        self.spliceCount = spliceCount
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -31,53 +39,65 @@ def main(argv=None):
 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
     hg = Genome(genome)
 
 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
     hg = Genome(genome)
 
-    gidDict = {}
-    gidList = []
-    uniqueCountDict = {}
-    spliceCountDict = {}
+    gidDict = getGeneData(uniquefilecount, splicefilecount)
+    gidList = gidDict.keys()
+    gidList.sort()
+
+    outfile = open(outfilename, "w")
+    for gid in gidList:
+        featureList = hg.getGeneFeatures((genome, gid))
+        featuresizesum, splicearea = getStats(featureList, splicelead)
+        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+        geneData = gidDict[gid]
+        uniqueCount = geneData.uniqueCount
+        expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
+
+        # this p-value is based on the observed unique count, not the expected total count
+        # nor the multi-read adjusted count
+        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
+        symbol = geneData.symbol
+        spliceCount = geneData.spliceCount
+        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
+        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
 
 
+    outfile.close()
+
+
+def getGeneData(uniquefilecount, splicefilecount):
+    gidDict = {}
     uniquefile = open(uniquefilecount)
     for line in uniquefile:
         fields = line.strip().split()
     uniquefile = open(uniquefilecount)
     for line in uniquefile:
         fields = line.strip().split()
-        gidDict[fields[0]] = fields[1]
-        gidList.append(fields[0])
-        uniqueCountDict[fields[0]] = int(fields[2])
+        geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
+        gidDict[fields[0]] = geneData
 
 
+    uniquefile.close()
     splicefile = open(splicefilecount)
     for line in splicefile:
         fields = line.strip().split()
     splicefile = open(splicefilecount)
     for line in splicefile:
         fields = line.strip().split()
-        spliceCountDict[fields[0]] = int(fields[2])
+        gidDict[fields[0]].spliceCount = int(fields[2])
 
 
-    outfile = open(outfilename,'w')
+    splicefile.close()
 
 
-    gidList.sort()
-    for gid in gidList:
-        symbol = gidDict[gid]
-        featureList = hg.getGeneFeatures((genome, gid))
-        newfeatureList = []
-        featuresizesum = 0
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
-                featuresizesum += stop - start + 1
+    return gidDict
 
 
-        if featuresizesum < 1:
-            featuresizesum = 1
 
 
-        splicearea = (len(newfeatureList) - 1) * splicelead
-        if splicearea < splicelead:
-            splicearea = 0
+def getStats(featureList, splicelead):
+    newfeatureList = []
+    featuresizesum = 0
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+            featuresizesum += stop - start + 1
 
 
-        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
-        expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+    if featuresizesum < 1:
+        featuresizesum = 1
 
 
-        # this p-value is based on the observed unique count, not the expected total count
-        # nor the multi-read adjusted count
-        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
-        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
-        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+    splicearea = (len(newfeatureList) - 1) * splicelead
+    if splicearea < splicelead:
+        splicearea = 0
 
 
-    outfile.close()
+    return featuresizesum, splicearea
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
index 4826a13bbb31824ffea9b187d383f42b35bbf682..e012bb49436c92f50a1d8722182d0cd72a754897 100755 (executable)
@@ -4,10 +4,29 @@
 #
 
 import sys
 #
 
 import sys
+from commoncode import regionsOverlap
 
 print "siteintersects: version 2.1"
 
 
 
 print "siteintersects: version 2.1"
 
 
+class Site():
+    def __init__(self, line, doExpanded=False):
+        fields = line.strip().split()
+        if doExpanded:
+            self.chrom = fields[1][3:]
+            self.start = int(fields[2])
+            self.stop = int(fields[3])
+            self.rest = fields[4:]
+        else:
+            (chromosome, pos) = fields[0].split(":")
+            self.chrom = chromosome[3:]
+            (start, stop) = pos.split("-")
+            self.start = int(start)
+            self.stop = int(stop)
+            self.rest = fields[1:]
+
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -33,115 +52,114 @@ def main(argv=None):
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
 
 
-    siteDict = {}
-    file1Dict = {}
+    siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+    commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
 
 
-    infile1count = 0
-    infile = open(sitefilename1)
+    if doReject and siteRejectFilename is not None:
+        writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+    print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
+    siteDict = {}
+    rejectSiteDict = {}
+    processedLineCount = 0
+    infile = open(sitefilename)
     infile.readline()
     for line in infile:
     infile.readline()
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
             continue
 
-        infile1count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            start = int(start)
-            stop = int(stop)
-            rest = fields[1:]
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
+        rest = site.fieldList
 
         try:
             siteDict[chrom].append((start, stop, rest))
 
         try:
             siteDict[chrom].append((start, stop, rest))
-        except:
+        except KeyError:
             siteDict[chrom] = [(start, stop, rest)]
 
         if doReject:
             siteDict[chrom] = [(start, stop, rest)]
 
         if doReject:
-            file1Dict[str((chrom, start, stop, rest))] = line
+            rejectSiteDict[str((chrom, start, stop, rest))] = line
 
     infile.close()
 
     infile.close()
+    print "file1: %d" % processedLineCount
 
 
-    print "file1: %d" % infile1count
+    return siteDict, rejectSiteDict
 
 
-    infile2count = 0
-    infile = open(sitefilename2)
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
+    processedLineCount = 0
+    infile = open(siteFilename)
     infile.readline()
 
     commonSites = 0
     infile.readline()
 
     commonSites = 0
-    unique2List = []
+    if doReject and rejectFilename is not None:
+        rejectfile = open(rejectFilename, "w")
+    else:
+        doReject=False
+
     outfile = open(outfilename, "w")
     for line in infile:
     outfile = open(outfilename, "w")
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
             continue
 
-        infile2count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            rest = str(fields[1:])
-
-        start = int(start)
-        stop = int(stop)
-        mid = start + abs(stop - start)/2
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
         if chrom not in siteDict:
             if doReject:
         if chrom not in siteDict:
             if doReject:
-                unique2List.append(line)
-                continue
+                rejectfile.write(line)
+
+            continue
 
 
-        twoNotCommon = True
+        siteNotCommon = True
         for (rstart, rstop, rline) in siteDict[chrom]:
         for (rstart, rstop, rline) in siteDict[chrom]:
-            rsize = abs(rstart - rstop) /2
-            rmid = rstart + abs(rstop - rstart)/2
-            if abs(mid - rmid) < rsize:
+            if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
                 commonSites += 1
-                if twoNotCommon:
-                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
-                    twoNotCommon = False
+                if siteNotCommon:
+                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
+                    siteNotCommon = False
 
                 try:
                     if doReject:
 
                 try:
                     if doReject:
-                        del file1Dict[str((chrom, rstart, rstop, rline))]
-                except:
+                        del rejectSiteDict[str((chrom, rstart, rstop, rline))]
+                except KeyError:
                     pass
 
                     pass
 
-        if doReject and twoNotCommon:
-            unique2List.append(line)
+        if doReject and siteNotCommon:
+            rejectfile.write(line)
 
 
-    outfile.close()
+    if doReject:
+        rejectfile.close()
 
 
-    print "file2: %d" % infile2count
+    outfile.close()
 
 
-    if doReject:
-        reject1file = open(reject1filename, "w")
-        reject2file = open(reject2filename, "w")
+    print "file2: %d" % processedLineCount
 
 
-        for key in file1Dict:
-            reject1file.write(file1Dict[key])
+    return commonSites
 
 
-        for line in unique2List:
-            reject2file.write(line)
 
 
-        reject1file.close()
-        reject2file.close()
+def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
+    rejectfile = open(siteRejectFilename, "w")
 
 
-    print commonSites
+    for key in rejectSiteDict:
+        rejectfile.write(rejectSiteDict[key])
 
 
+    rejectfile.close()
+    
 
 if __name__ == "__main__":
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
diff --git a/test/testBedToRegion.py b/test/testBedToRegion.py
new file mode 100644 (file)
index 0000000..b5b4c03
--- /dev/null
@@ -0,0 +1,65 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import bedtoregion
+
+
+class TestBedToRegion(unittest.TestCase):
+
+    testBedFile = "erangeTestBedFile"
+    testRegionFile = "erangeTestRegionFile"
+
+
+    def setUp(self):
+        bedfile = open(self.testBedFile, "w")
+        bedfile.write("tab\tdelimited\tfields\n")
+        bedfile.write("space delimited fields\n")
+        bedfile.write("track\n")
+        bedfile.write("line after track will not be processed\n")
+        bedfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testBedFile)
+        except OSError:
+            print "bed file does not exist"
+
+        try:
+            os.remove(self.testRegionFile)
+        except OSError:
+            print "region file does not exist"
+
+
+    def testBedToRegion(self):
+        bedtoregion.bedToRegion("regionLabel", self.testBedFile, self.testRegionFile)
+        resultFile = open(self.testRegionFile)
+        regionList = resultFile.readlines()
+        self.assertEquals(2, len(regionList))
+        self.assertEquals("regionLabel1\ttab\tdelimited\tfields\n", regionList[0])
+        self.assertEquals("regionLabel2\tspace\tdelimited\tfields\n", regionList[1])
+
+
+    def testMain(self):
+        argv = ["bedtoregion"]
+        self.assertRaises(SystemExit, bedtoregion.main, argv)
+        argv = ["bedtoregion", "regionLabel", self.testBedFile, self.testRegionFile]
+        bedtoregion.main(argv)
+        resultFile = open(self.testRegionFile)
+        self.assertEquals(2, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBedToRegion))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testBinsToCdf.py b/test/testBinsToCdf.py
new file mode 100644 (file)
index 0000000..688e688
--- /dev/null
@@ -0,0 +1,66 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import binstocdf
+
+
+class TestBinsToCdf(unittest.TestCase):
+
+    testInputFile = "erangeTestBinFile"
+    testOutputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        binfile = open(self.testInputFile, "w")
+        binfile.write("field1\tfield2\t10\tfield4\t2\t3\n")
+        binfile.write("field1\tfield2\t20\tfield4\t1\t4\n")
+        binfile.write("field1\tfield2\t0\tfield4\n")
+        binfile.write("too short\n")
+        binfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "bin file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        binstocdf.binToCDF(self.testInputFile, self.testOutputFile)
+        resultFile = open(self.testOutputFile)
+        resultList = resultFile.readlines()
+        self.assertEquals(3, len(resultList))
+        self.assertEquals("field1\tfield2\t10\tfield4\t20\t50\n", resultList[0])
+        self.assertEquals("field1\tfield2\t20\tfield4\t5\t25\n", resultList[1])
+        self.assertEquals("field1\tfield2\t0\tfield4\n", resultList[2])
+
+
+    def testMain(self):
+        argv = ["binstocdf"]
+        self.assertRaises(SystemExit, binstocdf.main, argv)
+        argv = ["binstocdf", self.testInputFile, self.testOutputFile]
+        binstocdf.main(argv)
+        resultFile = open(self.testOutputFile)
+        self.assertEquals(3, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBinsToCdf))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testCdfDist.py b/test/testCdfDist.py
new file mode 100644 (file)
index 0000000..9048749
--- /dev/null
@@ -0,0 +1,58 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import cdfdist
+
+
+class TestCdfDist(unittest.TestCase):
+
+    testInputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1 30 60\n")
+        cdffile.write("line2 90 99\n")
+        cdffile.write("line3 5 80\n")
+        cdffile.write("line4 10 14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        bins = 2
+        self.assertEquals([2, 2], cdfdist.cdfDist(bins, 10, self.testInputFile))
+        self.assertEquals([1, 2], cdfdist.cdfDist(bins, 50, self.testInputFile))
+        self.assertEquals([1, 0], cdfdist.cdfDist(bins, 89, self.testInputFile))
+        self.assertEquals([0, 1], cdfdist.cdfDist(bins, 91, self.testInputFile))
+
+
+    def testMain(self):
+        bins = 2
+        percent = 50
+        argv = ["cdfdist"]
+        self.assertRaises(SystemExit, cdfdist.main, argv)
+        argv = ["cdfdist", bins, percent, self.testInputFile]
+        cdfdist.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestCdfDist))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index b41fe659421377de86e576a7644abf9e8c5971b6..0fbbb7b2835ad2e7b8c52f38fdb4f116268444bc 100644 (file)
@@ -23,6 +23,8 @@ class TestChksnp(unittest.TestCase):
 
     snpDB = "%s/dbSNP128.db" % dbPath
     altSnpDB = "%s/snp129cDNA.db" % dbPath
 
     snpDB = "%s/dbSNP128.db" % dbPath
     altSnpDB = "%s/snp129cDNA.db" % dbPath
+    inputFileName = "testChkSNP_input.txt"
+    outputFileName = "testChkSNP_output.txt"
 
     def setUp(self):
         pass
 
     def setUp(self):
         pass
@@ -33,8 +35,7 @@ class TestChksnp(unittest.TestCase):
 
 
     def testChkSNPFile(self):
 
 
     def testChkSNPFile(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -42,10 +43,8 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName)
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName)
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -53,10 +52,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+        os.remove(self.outputFileName)
 
 
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB])
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName, snpDBList=[self.altSnpDB])
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -65,13 +64,12 @@ class TestChksnp(unittest.TestCase):
         self.assertEquals(result, line)
         outfile.close()
 
         self.assertEquals(result, line)
         outfile.close()
 
-        os.remove(inputFileName)
-        os.remove(outputFileName)
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
 
 
     def testMain(self):
 
 
     def testMain(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -79,11 +77,9 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        argv = ["chksnp", self.snpDB, inputFileName, outputFileName]
+        argv = ["chksnp", self.snpDB, self.inputFileName, self.outputFileName]
         chksnp.main(argv)
         chksnp.main(argv)
-        outfile = open(outputFileName, "r")
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -91,7 +87,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
+
 
     def testChkSNP(self):
         snpPropertiesList = []
 
     def testChkSNP(self):
         snpPropertiesList = []
diff --git a/test/testColsum.py b/test/testColsum.py
new file mode 100644 (file)
index 0000000..5c149e7
--- /dev/null
@@ -0,0 +1,56 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import colsum
+
+
+class TestColsum(unittest.TestCase):
+
+    testInputFile = "erangeTestColsumFile"
+
+
+    def setUp(self):
+        colfile = open(self.testInputFile, "w")
+        colfile.write("line1 30 60.5\n")
+        colfile.write("line2 90 99\n")
+        colfile.write("line3 5 80\n")
+        colfile.write("line4 10 1\n")
+        colfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        self.assertEquals(0, colsum.colsum(0, self.testInputFile))
+        self.assertEquals(135, colsum.colsum(1, self.testInputFile))
+        self.assertEquals(240.5, colsum.colsum(2, self.testInputFile))
+        self.assertEquals(0, colsum.colsum(3, self.testInputFile))
+
+
+    def testMain(self):
+        field = 1
+        argv = ["colsum"]
+        self.assertRaises(SystemExit, colsum.main, argv)
+        argv = ["colsum", field, self.testInputFile]
+        colsum.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestColsum))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index 43eb96bc2a3153e7cbf0bb5719fa13320195512c..ce8be55e8c9d9aed892a8eba023af0e79b812a8d 100644 (file)
@@ -8,6 +8,7 @@ import os
 import string
 from array import array
 from erange import commoncode
 import string
 from array import array
 from erange import commoncode
+from erange import Region
 from cistematic.genomes import Genome
 
 
 from cistematic.genomes import Genome
 
 
@@ -122,8 +123,10 @@ class TestCommoncode(unittest.TestCase):
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         testfile.write(regionEntry)
         testfile.close()
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         testfile.write(regionEntry)
         testfile.close()
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegions("regionTestFile"))
+        result = commoncode.getMergedRegions("regionTestFile")
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
         os.remove("regionTestFile")
 
 
         os.remove("regionTestFile")
 
 
@@ -132,108 +135,277 @@ class TestCommoncode(unittest.TestCase):
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(5, 25, 20)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5))
-        result = {"1": [(12, 18, 6)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2))
-        result = {"chr1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=5)
+        self.assertEquals(5, result["1"][0].start)
+        self.assertEquals(25, result["1"][0].stop)
+        self.assertEquals(20, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=-2)
+        self.assertEquals(12, result["1"][0].start)
+        self.assertEquals(18, result["1"][0].stop)
+        self.assertEquals(6, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, fullChrom=True)
+        self.assertEquals(10, result["chr1"][0].start)
+        self.assertEquals(20, result["chr1"][0].stop)
+        self.assertEquals(10, result["chr1"][0].length)
 
         regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
         regionList = [regionEntry]
 
         regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 40, 30)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10), (15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False))
-        result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2", result["1"][1].label)
 
         regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1\tspacer", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2\tspacer2", result["1"][1].label)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 2040, 2030)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000))
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5))
-        result = {"1": [(2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, maxDist=3000)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(2030, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=5)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
         regionList.append(regionEntry)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"2": [(15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+        result = commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
         self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList = [regionEntry]
         self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+
         regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
         regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
         regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
         regionList.append(regionEntry)
         regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)],
-                  "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)]
-        }
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(88, result["2"][0].stop)
+        self.assertEquals(73, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("4", result["2"][0].label)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals(32, result["1"][1].peakPos)
+        self.assertEquals(17, result["1"][1].peakHeight)
+        self.assertEquals("3", result["1"][1].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+        self.assertEquals(65, result["2"][1].start)
+        self.assertEquals(88, result["2"][1].stop)
+        self.assertEquals(23, result["2"][1].length)
+        self.assertEquals(72, result["2"][1].peakPos)
+        self.assertEquals(7, result["2"][1].peakHeight)
+        self.assertEquals("4", result["2"][1].label)
 
         regionList = ["# comment"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList.append(regionEntry)
 
         regionList = ["# comment"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
         regionList = ["# pvalue"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
         regionList = ["# readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
         regionList.append(regionEntry)
         regionList = ["# pvalue readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         #Test fails - the header line is required if there are fields after the peak which isn't so good
         #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
 
         #Test fails - the header line is required if there are fields after the peak which isn't so good
         #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
 
@@ -274,28 +446,61 @@ class TestCommoncode(unittest.TestCase):
     #TODO: write test
     def testFindPeak(self):
         hitList = []
     #TODO: write test
     def testFindPeak(self):
         hitList = []
-        result = ([], 0.0, array("f"), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
+        result = commoncode.findPeak(hitList, 0, 0)
+        self.assertEquals([], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f"), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
-        result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto"))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, doWeight=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(0.5, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.5, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+        self.assertEquals(6, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(3, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+        self.assertEquals(3, result.shift)
 
 
     #TODO: write test
 
 
     #TODO: write test
@@ -354,10 +559,13 @@ class TestCommoncode(unittest.TestCase):
         }
         self.assertEquals(result, complementDict)
 
         }
         self.assertEquals(result, complementDict)
 
-        regionDict = {"I": [("new feature", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new feature"
+        regionDict = {"I": [additionalRegion]}
         featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
         result = (100, 150, "new feature", "+", "custom")
         self.assertEquals(result, featureDict["I"][0])
         featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
         result = (100, 150, "new feature", "+", "custom")
         self.assertEquals(result, featureDict["I"][0])
+        
 
 
     def testGetLocusByChromDict(self):
 
 
     def testGetLocusByChromDict(self):
@@ -382,7 +590,9 @@ class TestCommoncode(unittest.TestCase):
             self.assertTrue(chrom in self.celegansChroms)
             self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
 
             self.assertTrue(chrom in self.celegansChroms)
             self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
 
-        regionDict = {"I": [("new region", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new region"
+        regionDict = {"I": [additionalRegion]}
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
         self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
         self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
index 4455fff7211ff686832b81cb80ed6cc7adcfba03..d993fad8ba7e7f1dd208d6760d465842538ddba7 100644 (file)
@@ -11,8 +11,13 @@ Created on Sep 8, 2010
 import sys
 import unittest
 import testAnalyzeGO
 import sys
 import unittest
 import testAnalyzeGO
+import testBedToRegion
+import testBinsToCdf
+import testCdfDist
 import testChksnp
 import testChksnp
+import testColsum
 import testCommoncode
 import testCommoncode
+import testFeatureIntersects
 import testGeneMrnaCounts
 import testGeneMrnaCountsWeighted
 #import testGetFasta
 import testGeneMrnaCounts
 import testGeneMrnaCountsWeighted
 #import testGetFasta
@@ -23,6 +28,7 @@ import testMakeBamFromRds
 import testmakebedfromrds
 #import testMakeGraphs
 import testMakeRdsFromBam
 import testmakebedfromrds
 #import testMakeGraphs
 import testMakeRdsFromBam
+import testMakeSiteTrack
 import testMakeSNPTrack
 import testMarkLinkers
 import testPeak
 import testMakeSNPTrack
 import testMarkLinkers
 import testPeak
@@ -41,8 +47,13 @@ def main(argv=None):
 
     suite = unittest.TestSuite()
     suite.addTest(testAnalyzeGO.suite())
 
     suite = unittest.TestSuite()
     suite.addTest(testAnalyzeGO.suite())
+    suite.addTest(testBedToRegion.suite())
+    suite.addTest(testBinsToCdf.suite())
+    suite.addTest(testCdfDist.suite())
     suite.addTest(testChksnp.suite())
     suite.addTest(testChksnp.suite())
+    suite.addTest(testColsum.suite())
     suite.addTest(testCommoncode.suite())
     suite.addTest(testCommoncode.suite())
+    suite.addTest(testFeatureIntersects.suite())
     suite.addTest(testGeneMrnaCounts.suite())
     suite.addTest(testGeneMrnaCountsWeighted.suite())
     #suite.addTest(testGetFasta.suite())
     suite.addTest(testGeneMrnaCounts.suite())
     suite.addTest(testGeneMrnaCountsWeighted.suite())
     #suite.addTest(testGetFasta.suite())
@@ -53,6 +64,7 @@ def main(argv=None):
     suite.addTest(testmakebedfromrds.suite())
     #suite.addTest(testMakeGraphs.suite())
     suite.addTest(testMakeRdsFromBam.suite())
     suite.addTest(testmakebedfromrds.suite())
     #suite.addTest(testMakeGraphs.suite())
     suite.addTest(testMakeRdsFromBam.suite())
+    suite.addTest(testMakeSiteTrack.suite())
     suite.addTest(testMakeSNPTrack.suite())
     suite.addTest(testMarkLinkers.suite())
     suite.addTest(testPeak.suite())
     suite.addTest(testMakeSNPTrack.suite())
     suite.addTest(testMarkLinkers.suite())
     suite.addTest(testPeak.suite())
diff --git a/test/testFeatureIntersects.py b/test/testFeatureIntersects.py
new file mode 100644 (file)
index 0000000..9633f8c
--- /dev/null
@@ -0,0 +1,73 @@
+
+import unittest
+import os
+from erange import featureIntersects
+
+
+class TestFeatureIntersects(unittest.TestCase):
+
+    testInputFile = "erangeTestFeatureIntersects"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1\tchr1\t30\t60\n")
+        cdffile.write("line2\tchr1\t90\t99\n")
+        cdffile.write("line3\tchr1\t5\t80\n")
+        cdffile.write("line4\tchr2\t10\t14\n")
+        cdffile.write("line4\tnot to be processed\n")
+        cdffile.write("line5\tchr2\t10\t14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "position file does not exist"
+
+
+    #TODO: write test
+    def testFeatureIntersects(self):
+        pass
+
+
+    #TODO: write test
+    def testGetPositionList(self):
+        pass
+
+
+    #TODO: write test
+    def testMain(self):
+        argv = ["cdfdist"]
+        self.assertRaises(SystemExit, featureIntersects.main, argv)
+        argv = ["cdfdist", self.testInputFile]
+        featureIntersects.main(argv)
+
+
+    def testMakeParser(self):
+        parser = featureIntersects.makeParser("")
+        argv = []
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual([], args)
+        argv = [self.testInputFile]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual(self.testInputFile, args[0])
+        self.assertEqual("TFBSCONSSITES", options.cistype)
+        self.assertEqual(100, options.radius)
+        argv = [self.testInputFile, "--cistype", "test", "--radius", "50"]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual("test", options.cistype)
+        self.assertEqual(50, options.radius)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestFeatureIntersects))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeSiteTrack.py b/test/testMakeSiteTrack.py
new file mode 100644 (file)
index 0000000..7e609e7
--- /dev/null
@@ -0,0 +1,94 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import makesitetrack
+
+
+class TestMakeSiteTrack(unittest.TestCase):
+
+    testInputFile = "erangeTestSiteFile"
+    testOutputFile = "erangeTestBedFile"
+
+
+    def setUp(self):
+        self.createCompactInputFile()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "bed file does not exist"
+
+
+    def createStandardInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("name\tchr1\t100\t175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def createCompactInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("chr1:100-175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def testMakeSiteTrack(self):
+        self.createCompactInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile)
+        bedfile = open(self.testOutputFile)
+        header = 'track name="" visibility=4 itemRgb="On"\n'
+        self.assertEquals(header, bedfile.readline())
+        result = "chr1\t100\t176\t-1\t11\t+\t-\t-\t0,0,0\n"
+        self.assertEquals(result, bedfile.readline())
+
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, append=True, noHeader=True)
+        bedfile = open(self.testOutputFile)
+        header = bedfile.readline()
+        firstEntry = bedfile.readline()
+        self.assertEquals(firstEntry, bedfile.readline())
+
+
+    def testNonCompactMakeSiteTrack(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        self.createStandardInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, compact=False, color="255,255,255")
+        bedfile = open(self.testOutputFile)
+        bedfile.readline()
+        result = "chr1\t100\t176\t-1\t1.0\t+\t-\t-\t255,255,255\n"
+        self.assertEquals(result, bedfile.readline())
+
+
+    def testMain(self):
+        self.createCompactInputFile()
+        argv = ["makesitetrack"]
+        self.assertRaises(SystemExit, makesitetrack.main, argv)
+        argv = ["makesitetrack", self.testInputFile, self.testOutputFile]
+        makesitetrack.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeSiteTrack))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index f4b1691a07a08b15988539f75f1fbb2a62e148b2..9fd041d4462abb552eef87ebd82430177106231f 100755 (executable)
@@ -168,7 +168,6 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
-            #TODO: Is this right? If match index is 0 are we doing nothing?
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1
@@ -330,4 +329,4 @@ def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)