Release version for Erange 4.0a
author Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
committer Sean Upchurch <sau@caltech.edu>
Fri, 7 Jan 2011 18:05:34 +0000 (10:05 -0800)
34 files changed:
MakeBamFromRds.py
ReadDataset.py
altSpliceCounts.py
bedtoregion.py
binstocdf.py
buildMatrix.py
cdfdist.py
colsum.py
combineRPKMs.py
combinerds.py
commoncode.py
distalPairs.py
farPairs.py
featureIntersects.py
findall.py
geneDownstreamBins.py
geneLocusBins.py
geneMrnaCountsWeighted.py
getNovelSNPs.py
makerdsfromblat.py
makerdsfrombowtie.py
makerdsfromeland2.py
predictSpliceCount.py
siteintersects.py
test/testBedToRegion.py [new file with mode: 0644]
test/testBinsToCdf.py [new file with mode: 0644]
test/testCdfDist.py [new file with mode: 0644]
test/testChksnp.py
test/testColsum.py [new file with mode: 0644]
test/testCommoncode.py
test/testErange.py
test/testFeatureIntersects.py [new file with mode: 0644]
test/testMakeSiteTrack.py [new file with mode: 0644]
weighMultireads.py

index bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59..730c20be6fb147f84e58b7f80485199e0975ad3a 100644 (file)
@@ -22,13 +22,13 @@ import pysam
 import ReadDataset
 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
 
+VERSION = "1.0"
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    verstring = "makeBamFromRds: version 1.0"
-    print verstring
+    print "makeBamFromRds: version %s" % VERSION
 
     doPairs = False
     
@@ -134,7 +134,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
     for chrom in chromRemoveList:
         chromList.remove(chrom)
 
-    header = {"HD": {"VN": "1.0"}}
+    header = {"HD": {"VN": VERSION}}
     if referenceSequenceList:
         header["SQ"] = referenceSequenceList
 
index 5ff60e2e6954c862888c5807495ef49142e14fa3..56e28a8995b734735ace06d1d990ac9b46753580 100644 (file)
@@ -153,42 +153,42 @@ class ReadDataset():
             self.cachedDB = ""
 
 
-    def attachDB(self, filename, asname):
+    def attachDB(self, filename, dbName):
         """ attach another database file to the readDataset.
         """
-        stmt = "attach '%s' as %s" % (filename, asname)
+        stmt = "attach '%s' as %s" % (filename, dbName)
         self.execute(stmt)
 
 
-    def detachDB(self, asname):
+    def detachDB(self, dbName):
         """ detach a database file to the readDataset.
         """
-        stmt = "detach %s" % (asname)
+        stmt = "detach %s" % (dbName)
         self.execute(stmt)
 
 
-    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+    def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
         """ import into current RDS the table (with columns destcolumns,
             with default all columns) from the database file asname,
             using the column specification of ascolumns (default all).
         """
-        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
         if flagged != "":
             stmt += " where flag = '%s' " % flagged
 
         self.executeCommit(stmt)
 
 
-    def getTables(self, asname=""):
+    def getTables(self, dbName=""):
         """ get a list of table names in a particular database file.
         """
         resultList = []
         sql = self.getSqlCursor()
 
-        if asname != "":
-            asname += "."
+        if dbName != "":
+            dbName = "%s." % dbName
 
-        stmt = "select name from %ssqlite_master where type='table'" % asname
+        stmt = "select name from %ssqlite_master where type='table'" % dbName
         sql.execute(stmt)
         results = sql.fetchall()
 
@@ -291,11 +291,15 @@ class ReadDataset():
         if "readsize" not in metadata:
             raise ReadDatasetError("no readsize parameter defined")
         else:
-            mysize = metadata["readsize"]
-            if "import" in mysize:
-                mysize = mysize.split()[0]
+            readSize = metadata["readsize"]
+            if "import" in readSize:
+                readSize = readSize.split()[0]
 
-            return int(mysize)
+            readSize = int(readSize)
+            if readSize < 0:
+                raise ReadDatasetError("readsize is negative")
+
+            return readSize
 
 
     def getDefaultCacheSize(self):
@@ -317,11 +321,9 @@ class ReadDataset():
                 if row["chrom"] not in results:
                     results.append(row["chrom"])
             else:
-                if  len(row["chrom"][3:].strip()) < 1:
-                    continue
-
-                if row["chrom"][3:] not in results:
-                    results.append(row["chrom"][3:])
+                shortName = row["chrom"][3:]
+                if len(shortName.strip()) > 0 and shortName not in results:
+                    results.append(shortName)
 
         results.sort()
 
@@ -333,32 +335,17 @@ class ReadDataset():
         """ returns the maximum coordinate for reads on a given chromosome.
         """
         maxCoord = 0
-        sql = self.getSqlCursor()
 
         if doUniqs:
-            try:
-                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
-                maxCoord = int(sql.fetchall()[0][0])
-            except:
-                print "couldn't retrieve coordMax for chromosome %s" % chrom
+            maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
 
         if doSplices:
-            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
-            try:
-                spliceMax = int(sql.fetchall()[0][0])
-                if spliceMax > maxCoord:
-                    maxCoord = spliceMax
-            except:
-                pass
+            spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
+            maxCoord = max(spliceMax, maxCoord)
 
         if doMulti:
-            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
-            try:
-                multiMax = int(sql.fetchall()[0][0])
-                if multiMax > maxCoord:
-                    maxCoord = multiMax
-            except:
-                pass
+            multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
+            maxCoord = max(multiMax, maxCoord)
 
         if verbose:
             print "%s maxCoord: %d" % (chrom, maxCoord)
@@ -366,6 +353,19 @@ class ReadDataset():
         return maxCoord
 
 
+    def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
+        maxCoord = 0
+        sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
+        sql = self.getSqlCursor()
+        try:
+            sql.execute(sqlStatement)
+            maxCoord = int(sql.fetchall()[0][0])
+        except:
+            print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+        return maxCoord
+
+
     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
@@ -378,67 +378,14 @@ class ReadDataset():
         
         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
         """
-        whereClause = []
-        resultsDict = {}
-
-        if chrom != "" and chrom != self.memChrom:
-            whereClause.append("chrom = '%s'" % chrom)
-
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
-            else:
-                whereClause.append("flag = '%s'" % flag)
-
-        if start > -1:
-            whereClause.append("start > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stop < %d" % stop)
-
-        if len(readLike) > 0:
-            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
-            whereClause.append(readIDClause)
-
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
-
-        if strand in ["+", "-"]:
-            whereClause.append("sense = '%s'" % strand)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
 
-        groupBy = []
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
         if findallOptimize:
-            selectClause = ["select start, sense, sum(weight)"]
-            groupBy = ["GROUP BY start, sense"]
+            selectQuery = "select start, sense, sum(weight)"
         else:
-            selectClause = ["select ID, chrom, start, readID"]
-            if bothEnds:
-                selectClause.append("stop")
-
-            if not noSense:
-                selectClause.append("sense")
-
-            if withWeight:
-                selectClause.append("weight")
-
-            if withFlag:
-                selectClause.append("flag")
+            selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 
-            if withMismatch:
-                selectClause.append("mismatch")
-
-        if limit > 0 and not combine5p:
-            groupBy.append("LIMIT %d" % limit)
-
-        selectQuery = string.join(selectClause, ",")
-        groupQuery = string.join(groupBy)
+        groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
         if doUniqs:
             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
             if doMulti:
@@ -498,6 +445,7 @@ class ReadDataset():
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
+        resultsDict = {}
         if findallOptimize:
             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
             if self.memBacked:
@@ -562,20 +510,17 @@ class ReadDataset():
         return resultsDict
 
 
-    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
-                       flag="", withWeight=False, withFlag=False, withMismatch=False,
-                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
-                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
-                       stop=-1, strand=""):
-        """ returns a dictionary of spliced reads in a variety of
-        formats and which can be restricted by chromosome or custom-flag.
-        Returns unique spliced reads for now.
-        """
-        whereClause = []
-        resultsDict = {}
+    def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
+        if splice:
+            startText = "startL"
+            stopText = "stopR"
+        else:
+            startText = "start"
+            stopText = "stop"
 
+        whereClause = []
         if chrom != "" and chrom != self.memChrom:
-            whereClause = ["chrom = '%s'" % chrom]
+            whereClause.append("chrom = '%s'" % chrom)
 
         if flag != "":
             if flagLike:
@@ -584,25 +529,37 @@ class ReadDataset():
             else:
                 whereClause.append("flag = '%s'" % flag)
 
+        if start > -1:
+            whereClause.append("%s > %d" % (startText, start))
+
+        if stop > -1:
+            whereClause.append("%s < %d" % (stopText, stop))
+
+        if len(readLike) > 0:
+            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
+            whereClause.append(readIDClause)
+
         if hasMismatch:
             whereClause.append("mismatch != ''")
 
-        if strand != "":
+        if strand in ["+", "-"]:
             whereClause.append("sense = '%s'" % strand)
 
-        if start > -1:
-            whereClause.append("startL > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stopR < %d" % stop)
-
         if len(whereClause) > 0:
             whereStatement = string.join(whereClause, " and ")
             whereQuery = "where %s" % whereStatement
         else:
             whereQuery = ""
 
-        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+        return whereQuery
+
+
+    def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+
+        selectClause = [baseSelect]
+        if bothEnds:
+            selectClause.append("stop")
+
         if not noSense:
             selectClause.append("sense")
 
@@ -615,7 +572,36 @@ class ReadDataset():
         if withMismatch:
             selectClause.append("mismatch")
 
-        selectQuery = string.join(selectClause, " ,")
+        selectQuery = string.join(selectClause, ",")
+
+        return selectQuery
+
+
+    def getReadGroupQuery(self, findallOptimize, limit, combine5p):
+        groupBy = []
+        if findallOptimize:
+            groupBy = ["GROUP BY start, sense"]
+
+        if limit > 0 and not combine5p:
+            groupBy.append("LIMIT %d" % limit)
+
+        groupQuery = string.join(groupBy)
+
+        return groupQuery
+
+
+    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+                       flag="", withWeight=False, withFlag=False, withMismatch=False,
+                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
+                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+                       stop=-1, strand=""):
+        """ returns a dictionary of spliced reads in a variety of
+        formats and which can be restricted by chromosome or custom-flag.
+        Returns unique spliced reads for now.
+        """
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
+        selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
+        selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
         if self.memBacked:
             sql = self.memcon.cursor()
         else:
@@ -625,6 +611,7 @@ class ReadDataset():
         sql.execute(stmt)
         currentReadID = ""
         currentChrom = ""
+        resultsDict = {}
         for row in sql:
             pairID = 0
             readID = row["readID"]
@@ -796,10 +783,6 @@ class ReadDataset():
         """ get readID's.
         """
         stmt = []
-        limitPart = ""
-        if limit > 0:
-            limitPart = "LIMIT %d" % limit
-
         if uniqs:
             stmt.append("select readID from uniqs")
 
@@ -814,6 +797,10 @@ class ReadDataset():
         else:
             selectPart = ""
 
+        limitPart = ""
+        if limit > 0:
+            limitPart = "LIMIT %d" % limit
+
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
         if self.memBacked:
             sql = self.memcon.cursor()
@@ -845,12 +832,11 @@ class ReadDataset():
                 print "getting mismatches from chromosome %s" % (achrom)
 
             snpDict[achrom] = []
-            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if useSplices and self.dataType == "RNA":
                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                 spliceIDList = spliceDict.keys()
-                for k in spliceIDList:
-                    spliceEntry = spliceDict[k][0]
+                for spliceID in spliceIDList:
+                    spliceEntry = spliceDict[spliceID][0]
                     startpos = spliceEntry["startL"]
                     lefthalf = spliceEntry["stopL"]
                     rightstart = spliceEntry["startR"]
@@ -881,6 +867,7 @@ class ReadDataset():
 
                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 
+            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
             if achrom not in hitDict.keys():
                 continue
 
@@ -910,7 +897,7 @@ class ReadDataset():
 
 
     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
-                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+                        useSplices=False, normalizationFactor=1.0, trackStrand=False,
                         keepStrand="both", shiftValue=0):
         """return a profile of the chromosome as an array of per-base read coverage....
             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
@@ -925,8 +912,8 @@ class ReadDataset():
         dataType = metadata["dataType"]
         scale = 1. / normalizationFactor
         shift = {}
-        shift['+'] = int(shiftValue)
-        shift['-'] = -1 * int(shiftValue)
+        shift["+"] = int(shiftValue)
+        shift["-"] = -1 * int(shiftValue)
 
         if cstop > 0:
             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
index 12077c1ec7d9d7e3ba499c67c99734a8a9a54157..c59dc9249b6c82ded0466f872e98a9900e702c46 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 print "altSpliceCounts: version 3.7"
 
@@ -56,13 +56,13 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
     stopDict = {}
     resultDict = {}
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
     readlen = hitRDS.getReadSize()
     hitDict = hitRDS.getSplicesDict(noSense=True)
-    outfile = open(outfilename,'w')
+    outfile = open(outfilename, "w")
 
     for chrom in hitDict:
         startDict[chrom] = []
@@ -155,7 +155,7 @@ def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
 
         resultDict[chrom].sort()
         for line in resultDict[chrom]:
-            outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n'  % line)
+            outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n"  % line)
             alternative += 1
 
         print chrom, maxIndex, spliceEvent, altSpliceEvent
index 3bcd554e5406a09f31ecc988e8c60733616d0dd5..a6a344d549b907b98343d731b3a348dd9802d0ec 100755 (executable)
@@ -21,14 +21,16 @@ def main(argv=None):
 def bedToRegion(factor, infilename, outfilename):
     index = 1
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
     for line in infile:
-        if 'track' in line:
+        if "track" in line:
             continue
+
         fields = line.split()
-        line = string.join(fields, '\t')
-        outfile.write('%s%d\t%s\n' % (factor, index, line))
+        line = string.join(fields, "\t")
+        outfile.write("%s%d\t%s\n" % (factor, index, line))
         index += 1
+
     infile.close()
     outfile.close()
 
index 63aa9558b8848183d4528bb16149cd0ced38aadd..27b48e22e8ecde5b4fd9c817ff17d4ab75dcbd13 100755 (executable)
@@ -1,4 +1,5 @@
 import sys
+import string
 
 print "binstocdf: version 1.1"
 
@@ -6,19 +7,19 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s infile outfile' % sys.argv[0]
+    if len(argv) < 3:
+        print "usage: python %s infile outfile" % sys.argv[0]
         sys.exit(1)
 
-    infilename = argv[0]
-    outfilename = argv[1]
+    infilename = argv[1]
+    outfilename = argv[2]
 
     binToCDF(infilename, outfilename)
 
 
 def binToCDF(infilename, outfilename):
     infile = open(infilename)
-    outfile = open(outfilename, 'w')
+    outfile = open(outfilename, "w")
 
     for line in infile:
         fields = line.strip().split()
@@ -30,14 +31,15 @@ def binToCDF(infilename, outfilename):
             outfile.write(line)
             continue
 
-        outfile.write('%s\t%s\t%s\t%s' % (fields[0], fields[1], fields[2], fields[3]))
-        cum = 0
+        outputFields = fields[:4]
+        runningTotal = 0
         for bin in fields[4:]:
-            cum += int(bin)
-            percent = 100 * cum / total
-            outfile.write('\t%d' % percent)
+            runningTotal += int(bin)
+            percent = 100 * runningTotal / total
+            outputFields.append("%d" % percent)
 
-        outfile.write('\n')
+        outputLine = string.join(outputFields, "\t")
+        outfile.write("%s\n" % outputLine)
 
     infile.close()
     outfile.close()
index c7b6dd0c801025552c365003cf9067b2550c6b0c..a49120cf574f9c0213bd569edfe8507f923435a2 100755 (executable)
@@ -78,7 +78,7 @@ def buildMatrix(inFileName, colfilename, outfilename, truncateRPKM,
     if header.strip() == "":
         header = "#\t"
 
-    outfile.write( "%s\t%s\n" % (header, colID))
+    outfile.write("%s\t%s\n" % (header, colID))
 
     values = []
     min = 20000000000.
index 71662442afff0bdeb68fb5604307f72b8cedda91..52963b7445a1e28c8f251c94a6220f5fe1dc92f6 100755 (executable)
@@ -8,11 +8,15 @@ def main(argv=None):
         print "usage: python %s bins percent infile" % sys.argv[0]
         sys.exit(1)
 
-    bins = int(argv[0])
-    percent = int(argv[1])
-    infilename = argv[2]
+    bins = int(argv[1])
+    percent = int(argv[2])
+    infilename = argv[3]
 
-    cdfDist(bins, percent, infilename)
+    printCdfDist(bins, percent, infilename)
+
+
+def printCdfDist(bins, percent, infilename):
+    print cdfDist(bins, percent, infilename)
 
 
 def cdfDist(bins, percent, infilename):
@@ -30,7 +34,7 @@ def cdfDist(bins, percent, infilename):
             index += 1
 
     infile.close()
-    print binsList
+    return binsList
 
 
 if __name__ == "__main__":
index f6d1ff968c5260e35e876fb79a82ac201b088585..643aaeaa82b8bc0c444e33ae909d4f5859e2b1d7 100755 (executable)
--- a/colsum.py
+++ b/colsum.py
@@ -28,7 +28,7 @@ def colsum(fieldID, filename):
                 count += float(fields[fieldID])
             else:
                 count += int(fields[fieldID])
-        except ValueError:
+        except (ValueError, IndexError):
             pass
 
     infile.close()
index ead4e1b81d0b37335fbf00908f1aea615cec33dd..77f30d6fc2acb2a9424634a138a03ee30e314f91 100755 (executable)
@@ -51,23 +51,8 @@ def makeParser(usage=""):
 
 def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, doFraction=False):
 
-    firstDict = {}
-    firstfile = open(firstfileName)
-    for line in firstfile:
-        fields = line.strip().split()
-        firstDict[fields[1]] = fields[-1]
-
-    firstfile.close()
-
-    expandedDict = {}
-    gidDict = {}
-    expandedfile = open(expandedfileName)
-    for line in expandedfile:
-        fields = line.strip().split()
-        expandedDict[fields[1]] = fields[-1]
-        gidDict[fields[1]] = fields[0]
-
-    expandedfile.close()
+    firstDict = getRPKMDict(firstfileName)
+    gidDict, expandedDict = getRPKMDict(expandedfileName, getGIDDict=True)
 
     if doFraction:
         header = "gid\tRNAkb\tgene\tfirstRPKM\texpandedRPKM\tfinalRPKM\tfractionMulti\n"
@@ -97,5 +82,23 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.close()
 
 
+def getRPKMDict(rpkmFileName, getGIDDict=False):
+    gidDict = {}
+    rpkmDict = {}
+    rpkmFile = open(rpkmFileName)
+    for line in rpkmFile:
+        fields = line.strip().split()
+        rpkmDict[fields[1]] = fields[-1]
+        if getGIDDict:
+            gidDict[fields[1]] = fields[0]
+
+    rpkmFile.close()
+
+    if getGIDDict:
+        return gidDict, rpkmDict
+    else:
+        return rpkmDict
+
+
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4c826b04ed6c668a3d5b2a66ee4b842789a483a0..2878423a9716a0e4f31d0a1738a0770339c8ed6c 100755 (executable)
@@ -10,7 +10,9 @@ except:
     pass
 
 import sys
+import optparse
 import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption
 
 print "combinerds: version 1.2"
 
@@ -19,97 +21,107 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    if len(argv) < 2:
-        print 'usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]' % argv[0]
-        #print '\nwhere the optional metadata name::value pairs are added to the existing dataset\n'
+    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+    parser = makeParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 2:
+        print usage
         sys.exit(1)
 
-    doCache = False
-    cachePages = -1
-    if '--cache' in argv:
-        doCache = True
-        try:
-            cachePages =  int(argv[sys.argv.index('-cache') + 1])
-        except: 
-            pass
-
-    datafile = argv[1]
-    infileList = []
-    for index in range(2, len(argv)):
-        if argv[index][0] == '-':
-            break
-        infileList.append(sys.argv[index])
+    datafile = args[0]
+    infileList = args[1:]
 
-    print "destination RDS: %s" % datafile
+    combinerds(datafile, infileList, options.tableList, options.withFlag, options.doIndex, options.cachePages, options.doInit, options.initRNA)
 
-    if '--initrna' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True, datasetType='RNA')
-    elif '--init' in argv:
-        rds = ReadDataset.ReadDataset(datafile, initialize=True)
 
-    withFlag = ''
-    if '--flag' in argv:
-        withFlag = argv[sys.argv.index('-flag') + 1]
-        print "restrict to flag = %s" % withFlag
+def makeParser(usage=""):
+    # usage text is supplied by the caller (main)
+
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--table", action="append", dest="tableList")
+    parser.add_option("--init", action="store_true", dest="doInit")
+    parser.add_option("--initrna", action="store_true", dest="initRNA")
+    parser.add_option("--index", action="store_true", dest="doIndex")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--flag", dest="withFlag")
+
+    configParser = getConfigParser()
+    section = "combinerds"
+    doInit = getConfigBoolOption(configParser, section, "doInit", False)
+    initRNA = getConfigBoolOption(configParser, section, "initRNA", False)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+
+    parser.set_defaults(tableList=[], doInit=doInit, initRNA=initRNA, doIndex=doIndex, cachePages=cachePages,
+                        withFlag=withFlag)
+
+    return parser
+
 
-    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache)
+def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, cachePages=None, doInit=False, initRNA=False):
 
+    print "destination RDS: %s" % datafile
+    datasetType="DNA"
+    if initRNA:
+        doInit = True
+        datasetType="RNA"
+
+    doCache = False
+    if cachePages is not None:
+        doCache = True
+    else:
+        cachePages = -1
+
+    rds = ReadDataset.ReadDataset(datafile, verbose=True, cache=doCache, initialize=doInit, datasetType=datasetType)
     if cachePages > rds.getDefaultCacheSize():
         rds.setDBcache(cachePages)
-        cacheVal = cachePages
     else:
-        cacheVal = rds.getDefaultCacheSize()
-
-    doIndex = False
-    if '--index' in argv:
-        doIndex = True
+        cachePages = rds.getDefaultCacheSize()
 
-    tableList = []
-    if '--table' in argv:
-        tableList.append(argv[argv.index('-table') + 1])
-    else:
+    if tableList == []:
         tableList = rds.getTables()
 
-    combinerds(datafile, rds, infileList, cacheVal, tableList, withFlag, doIndex, doCache)
-
+    if withFlag != "":
+        print "restrict to flag = %s" % withFlag
 
-def combinerds(datafile, rds, infileList, cacheVal, tableList=[], withFlag="", doIndex=False, doCache=False):
     metaDict = rds.getMetadata()
     if "numberImports" not in metaDict:
         origIndex = 0
-        rds.insertMetadata([("numberImports", str(0))])
+        rds.insertMetadata([("numberImports", "0")])
     else:
         origIndex = int(metaDict["numberImports"])
 
     index = origIndex
     for inputfile in infileList:
-        asname = "input" + str(index)
-        rds.attachDB(inputfile,asname)
+        dbName = "input%s" % str(index)
+        rds.attachDB(inputfile, dbName)
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
-            ascols = "*"
+            dbColumns = "*"
             if table == "uniqs":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "multi":
-                ascols = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % asname
+                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "splices":
-                ascols = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % asname
-            elif table == "metadata":
-                ascols = "name, value || ' (import_%d)'" % index
-                rds.importFromDB(asname, table, ascols)
+                dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
 
-            if table != "metadata":
-                rds.importFromDB(asname, table, ascols, withFlag)
+            if table == "metadata":
+                dbColumns = "name, value || ' (import_%d)'" % index
+                rds.importFromDB(dbName, table, dbColumns)
+            else:
+                rds.importFromDB(dbName, table, dbColumns, withFlag)
 
-        rds.detachDB(asname)
-        rds.insertMetadata([("import_" + str(index), "%s %s" % (inputfile, str(tableList)))])
+        rds.detachDB(dbName)
+        rds.insertMetadata([("import_%s" % str(index), "%s %s" % (inputfile, str(tableList)))])
         index += 1
 
     rds.updateMetadata("numberImports", index, origIndex)
     if doIndex:
         print "building index...."
-        if cacheVal > 0:
-            rds.buildIndex(cacheVal)
+        if cachePages > 0:
+            rds.buildIndex(cachePages)
         else:
             rds.buildIndex()
 
index 821936a30bc2524f0aa1664de1a66ecb8ae47fd6..f526daacca0dc91245e71205363216bc17eba2e9 100755 (executable)
@@ -17,6 +17,9 @@ import Region
 commoncodeVersion = 5.6
 currentRDSversion = 2.0
 
+class ErangeError(Exception):
+    pass
+
 
 def getReverseComplement(base):
     revComp = {"A": "T",
@@ -65,12 +68,12 @@ def getGeneAnnotDict(genome, inRAM=False):
     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 
 
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
-    hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+    genome = Genome(genomeName, inRAM=inRAM)
     if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
+        genome.extendFeatures(extendGenome, replace=replaceModels)
 
-    geneannotDict = hg.allAnnotInfo()
+    geneannotDict = genome.allAnnotInfo()
 
     return geneannotDict
 
@@ -189,11 +192,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     mergeCount = 0
     chromField = int(chromField)
     count = 0
-    #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
-    #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
-    #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
-    #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
-    #      be merged with A but that will exit the loop and the merge with C will be missed.
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -336,7 +334,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     """
 
     if shift == "auto":
-        shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+        shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 
     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 
@@ -346,7 +344,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
     topPos = getPeakPositionList(smoothArray, length)
-    peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+    peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 
     if leftPlus:
         numLeftPlus = 0
@@ -366,12 +364,12 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     return peak
 
 
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
     bestShift = 0
     lowestScore = 20000000000
     for testShift in xrange(maxShift + 1):
         shiftArray = array("f", [0.] * length)
-        for read in hitList:
+        for read in readList:
             currentpos = read["start"] - start
             if read["sense"] == "+":
                 currentpos += testShift
@@ -381,7 +379,7 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
-            if doWeight:
+            if useWeight:
                 weight = read["weight"]
             else:
                 weight = 1.0
@@ -432,7 +430,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left
             hitIndex += 1
             currentpos += 1
 
-        while hitIndex < readlen and  currentpos < length:
+        while hitIndex < readlen and currentpos < length:
             seqArray[currentpos] += weight
             hitIndex += 1
             currentpos += 1
@@ -450,7 +448,7 @@ def getPeakPositionList(smoothArray, length):
         if topNucleotide < smoothArray[currentpos]:
             topNucleotide = smoothArray[currentpos]
             peakList = [currentpos]
-        elif topNucleotide  == smoothArray[currentpos]:
+        elif topNucleotide == smoothArray[currentpos]:
             peakList.append(currentpos)
 
     return peakList
@@ -562,7 +560,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         return featuresByChromDict
 
 
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
     """ return a dictionary of gene loci. Can be used to retrieve additional
@@ -594,27 +592,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
         return locusByChromDict
 
-    genome = genomeObject.genome
-    featuresDict = genomeObject.getallGeneFeatures()
-    if len(additionalRegionsDict) > 0:
-        sortList = []
-        for chrom in additionalRegionsDict:
-            for region in additionalRegionsDict[chrom]:
-                label = region.label
-                if label not in sortList:
-                    sortList.append(label)
-
-                if label not in featuresDict:
-                    featuresDict[label] = []
-                    sense = "+"
-                else:
-                    sense = featuresDict[label][0][-1]
-
-                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
-        for gid in sortList:
-            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
-
+    genomeName = genome.genome
+    featuresDict = getGeneFeatures(genome, additionalRegionsDict)
     for gid in featuresDict:
         featureList = featuresDict[gid]
         newFeatureList = []
@@ -647,34 +626,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstop = newFeatureList[0][0] + lengthCDS
+            if 0 < lengthCDS < glen:
+                gstop = newFeatureList[0][0] + lengthCDS
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstart = newFeatureList[-1][1] + lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstart = newFeatureList[-1][1] + lengthCDS
         else:
             if not useCDS and upstream > 0:
                 if upstreamSpanTSS:
@@ -692,36 +665,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstart = newFeatureList[-1][-1] - lengthCDS
+            if 0 < lengthCDS < glen:
+                gstart = newFeatureList[-1][-1] - lengthCDS
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstop = newFeatureList[0][0] - lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstop = newFeatureList[0][0] - lengthCDS
 
-        glen = abs(gstop - gstart)
         if chrom not in locusByChromDict:
             locusByChromDict[chrom] = []
 
@@ -736,6 +702,30 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
     return locusByChromDict
 
 
+def getGeneFeatures(genome, additionalRegionsDict):
+    featuresDict = genome.getallGeneFeatures()
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    return featuresDict
+
+
 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                       binLength=-1):
@@ -798,7 +788,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                 rlen = regionTuple[lengthField]
                 try:
                     rsense = regionTuple[senseField]
-                except:
+                except IndexError:
                     rsense = "F"
 
                 if tagStart > stop:
index 5bc25321b25b8a30e9b09511cabbee4355010b71..be403109110c3fb40f7bba831f6c2ea48e496536 100755 (executable)
@@ -85,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
 
     print time.ctime()
 
-    if doSplices:
-        print "getting splices"
-        splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
-        print "got splices"
-
     print "getting uniq reads"    
     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
     print "got uniqs"
 
     if doSplices:
-        for readID in splicesDict:
-            theRead = splicesDict[readID]
-            read0 = theRead[0]
-            del read0[1]
-            try:
-                uniqDict[readID].append(read0)
-            except:
-                if len(theRead) == 4:
-                    read2 = theRead[2]
-                    del read2[1]
-                    uniqDict[readID] = [read0,read2]
+        addSplicesToUniqReads(RDS, uniqDict)
 
     if doVerbose:
         print len(uniqDict), time.ctime()
 
     outfile = open(outfilename,"w")
-
     diffChrom = 0
     distal = 0
     total = 0
@@ -132,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
                     continue
                 else:
                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print diffChrom, outline
             else:
                 dist = abs(start1 - start2)
-
                 if minDist < dist < maxDist:
                     distal += 1
                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print distal, outline
 
@@ -157,5 +140,22 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
     print time.ctime()
 
 
+def addSplicesToUniqReads(RDS, uniqDict):
+    print "getting splices"
+    splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+    print "got splices"
+    for readID in splicesDict:
+        theRead = splicesDict[readID]
+        read0 = theRead[0]
+        del read0[1]
+        try:
+            uniqDict[readID].append(read0)
+        except:
+            if len(theRead) == 4:
+                read2 = theRead[2]
+                del read2[1]
+                uniqDict[readID] = [read0,read2]
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 00cc91844b23084d712573261e996524739c9870..712e60eff9815f9728b6a03983385f83d96bc6f5 100644 (file)
@@ -14,8 +14,9 @@ except:
 import sys
 import time
 import optparse
+import string
 import ReadDataset
-from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
 
 print "farPairs: version 1.4"
 
@@ -38,14 +39,13 @@ def main(argv=None):
     outfilename = args[1]
     outbedname = args[2]
 
-    farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+    farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
              options.cachePages, options.minDist, options.maxDist, options.minCount,
              options.label)
 
 
 def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--verbose", action="store_true", dest="doVerbose")
     parser.add_option("--minDist", type="int", dest="minDist")
@@ -55,7 +55,6 @@ def getParser(usage):
 
     configParser = getConfigParser
     section = "farPairs"
-    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
@@ -63,15 +62,33 @@ def getParser(usage):
     minCount = getConfigIntOption(configParser, section, "minCount", 2)
     label = getConfigOption(configParser, section, "label", None)
 
-    parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+    parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
 
     return parser
 
 
-def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
 
+    flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
+
+    outfile = open(outfilename, "w")
+    for region in flagDict:
+        regionConnections = countDuplicatesInList(flagDict[region])
+        for (connectedRegion, count) in regionConnections:
+            if count >= minCount:
+                outline = "%s\t%s\t%d" % (region, connectedRegion, count)
+                print >> outfile, outline
+                if doVerbose:
+                    print outline
+
+    outfile.close()
+    if doVerbose:
+        print "finished: ", time.ctime()
+
+
+def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
     doCache = False
     if cachePages is not None:
         doCache = True
@@ -86,11 +103,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
 
     if doVerbose:
         print time.ctime()
-
-    total = 0
-    outfile = open(outfilename, "w")
+    
     outbed = open(outbedname, "w")
     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+    outbed.close()
 
     readlen = RDS.getReadSize()
     flagDict = {}
@@ -98,84 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
         if doNotProcessChromosome(chromosome):
             continue
 
-        print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
-        if doVerbose:
-            print len(uniqDict), time.ctime()
-
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                start1 = readList[0]["start"]
-                flag1 = readList[0]["flag"]
-                start2 = readList[1]["start"]
-                flag2 = readList[1]["flag"]
-
-                if flag1 != flag2:
-                    dist = abs(start1 - start2)
-                    startList = [start1, start2]
-                    stopList = [start1 + readlen, start2 + readlen]
-                    startList.sort()
-                    stopList.sort()
-                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
-                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
-                        outbed.write(outputLine)
-                        if doVerbose:
-                            print flag1, flag2, dist
-
-                        try:
-                            flagDict[flag1].append((flag2, start1, start2))
-                        except KeyError:
-                            flagDict[flag1] = [(flag2, start1, start2)]
-
-                        try:
-                            flagDict[flag2].append((flag1, start1, start2))
-                        except KeyError:
-                            flagDict[flag2] = [(flag2, start1, start2)]
+        writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
 
     print "%d connected regions" % len(flagDict)
 
-    for region in flagDict:
-        flagDict[region].sort()
-        regionConnections = {}
-        for (region2, start1, start2) in flagDict[region]:
+    return flagDict
+
+
+def doNotProcessChromosome(chrom):
+    return chrom == "chrM"
+
+
+def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
+    outbed = open(outbedname, "a")
+    print chromosome
+    uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+    if doVerbose:
+        print len(uniqDict), time.ctime()
+
+    for readID in uniqDict:
+        readList = uniqDict[readID]
+        if readsAreFarPair(readList, minDist, maxDist):
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            startList = [start1, start2]
+            startList.sort()
+            outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
+            outbed.write(outputLine)
+            flag1 = readList[0]["flag"]
+            flag2 = readList[1]["flag"]
+            if doVerbose:
+                print flag1, flag2, abs(start1 - start2)
+
             try:
-                regionConnections[region2] += 1
+                flagDict[flag1].append(flag2)
             except KeyError:
-                regionConnections[region2] = 1
+                flagDict[flag1] = [flag2]
 
-        for region2 in regionConnections:
-            if regionConnections[region2] >= minCount:
-                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
-                if doVerbose:
-                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+            try:
+                flagDict[flag2].append(flag1)
+            except KeyError:
+                flagDict[flag2] = [flag1]
 
-    outfile.close()
     outbed.close()
-    if doVerbose:
-        print "finished: ", time.ctime()
 
 
-def doNotProcessChromosome(chrom):
-    return chrom == "chrM"
+def readsAreFarPair(readList, minDist, maxDist):
+    isFarPair = False
+    if len(readList) == 2:
+        flag1 = readList[0]["flag"]
+        flag2 = readList[1]["flag"]
+        if flag1 != flag2 and flag1 != "" and flag2 != "":
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            dist = abs(start1 - start2)
+            if minDist < dist < maxDist:
+                isFarPair = True
 
+    return isFarPair
 
-def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
-    readSizes = "%d" % (stopList[0] - startList[0])
-    readCoords = "0"
+
+def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
+    sizes = ["%d" % readlen]
+    coords = ["0"]
     leftStart = startList[0] - 1
-    rightStop = stopList[-1]
+    rightStop = startList[-1]
     for index in range(1, numPieces):
-        readSizes += ",%d" % (stopList[index] - startList[index] + 1)
-        readCoords += ",%d" % (startList[index] - startList[0])
+        sizes.append("%d" % (readlen + 1))
+        coords.append("%d" % (startList[index] - startList[0]))
 
     if rsense == "+":
-        senseCode = plusSense
+        senseCode = plusSenseColor
     else:
-        senseCode = minusSense
+        senseCode = minusSenseColor
 
+    readSizes = string.join(sizes, ",")
+    readCoords = string.join(coords, ",")
     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+
     return outline
 
 
index cd357cc072c5cc7aa4460371439ef59c428873f5..c24b79edb312fae6c0b19f0d7e3d3ffb45d12984 100755 (executable)
@@ -38,7 +38,7 @@ def main(argv=None):
 
 def makeParser(usage=""):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--cistype", action="store_false", dest="cistype")
+    parser.add_option("--cistype", dest="cistype")
     parser.add_option("--radius", type="int", dest="radius")
 
     configParser = getConfigParser()
@@ -52,10 +52,19 @@ def makeParser(usage=""):
 
 
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
-    tabfile = open(tabFileName)
-    previous = ""
+    
+    posList = getPositionList(tabFileName)
+    feats = featuresIntersecting("human", posList, radius, cistype)
+    featkeys = feats.keys()
+    featkeys.sort()
+    for (chrom, pos) in featkeys:
+        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+
 
+def getPositionList(tabFileName):
+    previous = ""
     posList = []
+    tabfile = open(tabFileName)
     for line in tabfile:
         fields = line.split("\t")
         current = fields[0]
@@ -66,11 +75,7 @@ def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
         chrom = fields[1][3:]
         posList.append((chrom, (int(fields[2]) + int(fields[3]))/2))
 
-    feats = featuresIntersecting("human", posList, radius, cistype)
-    featkeys = feats.keys()
-    featkeys.sort()
-    for (chrom, pos) in featkeys:
-        print "chr%s:%d-%d\t%s" % (chrom, pos, pos + 20, str(feats[(chrom, pos)]))
+    return posList
 
 
 if __name__ == "__main__":
index d20608f1997e2e6a0fd84d9169c78025485e4b57..ec75efb54f3680c892578a38811c1bd241c37241 100755 (executable)
@@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
-
-    if not doTrim:
-        trimString = "none"
+    doControl = False
+    if mockfile is not None:
+        doControl = True
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
-    doControl = False
-    if mockfile is not None:
-        doControl = True
-
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
@@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     else:
         pValueType = "self"
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        doCache = False
-        cachePages = -1
-
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
@@ -242,6 +225,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -258,13 +247,22 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
+    if trimValue is not None:
+        trimValue = float(trimValue) / 100.
+        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+    else:
+        trimValue = 0.1
+        trimString = "10%"
+
+    if not doTrim:
+        trimString = "none"
+
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
-
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
@@ -464,7 +462,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
@@ -535,7 +533,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
@@ -558,7 +556,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
-            if normalize:
+            if normalizeToRPM:
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
@@ -567,7 +565,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
                 #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -577,8 +575,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
-                    numLeft = peak.numLeft
-                    if normalize:
+                    numLeft = peak.numLeftPlus
+                    if normalizeToRPM:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
@@ -605,11 +603,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeft
-                        if normalize:
+                        numLeft = trimPeak.numLeftPlus
+                        if normalizeToRPM:
                             sumAll /= rdsSampleSize
 
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
@@ -619,16 +617,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         except:
                             continue
 
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             sumMulti /= rdsSampleSize
 
                         try:
@@ -759,4 +755,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift,
     return footer
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4ce97e4fc1380517447153e59fba80e5531e3029..91db4a1a44870602383a072d7a4585568026f165 100755 (executable)
@@ -14,7 +14,7 @@ import sys
 import optparse
 import ReadDataset
 from cistematic.genomes import Genome
-from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
 
 print "geneDownstreamBins: version 2.1"
 
@@ -53,8 +53,6 @@ def makeParser(usage=""):
 
 
 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
-    bins = 10
-    standardMinThresh = standardMinDist / bins
 
     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     normalizationFactor = 1.0
@@ -66,95 +64,114 @@ def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCac
     geneinfoDict = getGeneInfoDict(genome, cache=True)
     hg = Genome(genome)
     featuresDict = hg.getallGeneFeatures()
-
     outfile = open(outfilename, "w")
-
     gidList = hg.allGIDs()
     gidList.sort()
     for gid in gidList:
-        symbol = "LOC" + gid
-        geneinfo = ""
-        featureList = []
         try:
-            geneinfo = geneinfoDict[gid]
-            featureList = featuresDict[gid]
-            symbol = geneinfo[0][0]
-        except:
+            featuresList = featuresDict[gid]
+        except KeyError:
             print gid
 
-        if len(featureList) == 0:
+        try:
+            binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
+        except ErangeError:
             continue
 
-        newfeatureList = []
-        for (ftype, chrom, start, stop, fsense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
+        tagCount *= normalizationFactor
+        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
+        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
+        for binAmount in binList:
+            outfile.write("\t%.2f" % binAmount)
 
-        if chrom not in hitDict:
-            continue
+        outfile.write("\n")
 
-        newfeatureList.sort()
-        if len(newfeatureList) < 1:
-            continue
+    outfile.close()
 
-        glen = standardMinDist
-        if fsense == "F":
-            nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
 
-            if glen < 1:
-                glen = 1
+def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
 
-            gstart = newfeatureList[-1][1]
-        else:
-            nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
-            if nextGene < glen * 2:
-                glen = nextGene / 2
+    symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
+    geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
+    if geneLength < standardMinDist:
+        raise ErangeError("gene length less than minimum")
 
-            if glen < 1:
-                glen = 1
+    binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
+    if tagCount < 2:
+        raise ErangeError("tag count less than minimum")
 
-            gstart = newfeatureList[0][0] - glen
-            if gstart < 0:
-                gstart = 0
+    return binList, symbol, geneLength, tagCount
 
-        tagCount = 0
-        if glen < standardMinDist:
-            continue
 
-        binList = [0.] * bins
-        for read in hitDict[chrom]:
-            tagStart = read["start"]
-            weight = read["weight"]
-            tagStart -= gstart
-            if tagStart >= glen:
-                break
-
-            if tagStart > 0:
-                tagCount += weight
-                if fsense == "F":
-                    # we are relying on python's integer division quirk
-                    binID = tagStart / standardMinThresh 
-                    binList[binID] += weight
-                else:
-                    rdist = glen - tagStart
-                    binID = rdist / standardMinThresh 
-                    binList[binID] += weight
-
-        if tagCount < 2:
-            continue
+def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
+    if len(featureList) == 0:
+        raise ErangeError("no features found")
 
-        tagCount *= normalizationFactor
-        print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
-        outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
-        for binAmount in binList:
-            outfile.write("\t%.2f" % binAmount)
+    symbol = "LOC%s" % gid
+    geneinfo = ""
+    try:
+        geneinfo = geneinfoDict[gid]
+        symbol = geneinfo[0][0]
+    except KeyError:
+        print gid
 
-        outfile.write("\n")
+    newfeatureList = []
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+
+    if len(newfeatureList) < 1:
+        raise ErangeError("no features found")
+
+    if chrom not in chromosomeList:
+        raise ErangeError("chromosome not found in reads")
+
+    newfeatureList.sort()
+
+    return symbol, newfeatureList, sense, chrom
 
-    outfile.close()
 
+def getGeneStats(genome, gid, minDistance, featureList, sense):
+    geneLength = minDistance
+    if sense == "F":
+        nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = featureList[-1][1]
+    else:
+        nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
+        geneStart = max(featureList[0][0] - geneLength, 0)
+
+    if nextGene < geneLength * 2:
+        geneLength = nextGene / 2
+
+    geneLength = max(geneLength, 1)
+
+    return geneStart, geneLength
+
+
+def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
+    tagCount = 0
+    bins = 10
+    standardMinThresh = standardMinDist / bins
+    binList = [0.] * bins
+    for read in readList:
+        tagStart = read["start"]
+        if tagStart >= geneLength:
+            break
+
+        tagStart -= geneStart
+        weight = read["weight"]
+        if tagStart > 0:
+            tagCount += weight
+            if sense == "F":
+                # we are relying on python's integer division quirk
+                binID = tagStart / standardMinThresh 
+            else:
+                rdist = geneLength - tagStart
+                binID = rdist / standardMinThresh
+
+            binList[binID] += weight
+
+    return binList, tagCount
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index 6a1efc48e270d92a8453e8fcb6907db00185ed92..49745dc0c1361c2a8dbe4736c9c8630396fd93e4 100755 (executable)
@@ -12,6 +12,7 @@ except:
 
 import sys
 import optparse
+import string
 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
 import ReadDataset
 from cistematic.genomes import Genome
@@ -92,68 +93,78 @@ def getParser(usage):
 
     return parser
 
+
 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
                   normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
                   acceptfile=None):
 
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
+
+    if doFlank:
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
+    else:
+        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
     readlen = hitRDS.getReadSize()
     normalizationFactor = 1.0
     if normalizeBins:
         totalCount = len(hitRDS)
         normalizationFactor = totalCount / 1000000.
 
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
-    if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
-    else:
-        locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
-
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for region in acceptDict[chrom]:
-            if region.label not in gidList:
-                gidList.append(region.label)
-
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
 
-    outfile = open(outfilename,'w')
 
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except KeyError:
                 pass
         else:
             symbol = gid
+
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
         for binAmount in gidBins[gid]:
             if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
     outfile.close()
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 38e853aec6172be26b448066692193f1537b6926..5299d27d26c23f8729063eb31f0da390730b5ce5 100755 (executable)
@@ -2,7 +2,7 @@ try:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
 import sys
 import optparse
@@ -68,12 +68,6 @@ def makeParser(usage=""):
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
@@ -206,7 +200,6 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
-        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -271,4 +264,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 4091281c8fe0f5410090b23db7276496fe49d6ed..e95d72693c76df13871da4cf809c99e43c41d36a 100755 (executable)
@@ -68,7 +68,6 @@ def doNotProcessLine(line):
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
-    #TODO: refactor naming. is fields[8] rpkm?
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
@@ -93,4 +92,4 @@ def getNovelSNPInfo(genome, snpEntry, hg):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 37576ca867fb2283f9a5d497c0bbca7d06191f7e..17520401c52b5d153404aa4312a800017e9af3bb 100755 (executable)
@@ -20,6 +20,9 @@ import ReadDataset
 verstring = "makerdsfromblat: version 3.10"
 print verstring
 
+NUM_HEADER_LINES = 5
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -98,24 +101,15 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
                     verbose=False, cachePages=100000, geneDataFileName="",
                     propertyList=[]):
 
-    delimiter = "|"
-    minIntron = 10
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
+    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
     if forceRNA:
         print "forcing datatype to RNA"
         dataType = "RNA"
 
-    if dataType == "RNA":
-        genedatafile = open(geneDataFileName)
-
-    writeLog(outdbname + ".log", verstring, string.join(sys.argv[1:]))
-
     geneDict = {}
     mapDict = {}
     if dataType == "RNA" and not forceRNA:
+        genedatafile = open(geneDataFileName)
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -164,9 +158,10 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
 
     # make some assumptions based on first read
     infile = open(filename, "r")
-    for arg in range(6):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
+    line = infile.readline()
     fields = line.split()
     readsize = int(fields[10])
     pairedTest = fields[9][-2:]
@@ -186,8 +181,9 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
         rds.insertMetadata([("blat_mapped", "True")])
 
     minReadScore = readsize - readsize/25 - 1
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -199,9 +195,12 @@ def makerdsfromblat(label, filename, outdbname, dataType="DNA", init=True,
     index = uIndex = mIndex = sIndex = lIndex = 0
     bestScore = 0
     # skip headers
-    for arg in range(5):
+    for arg in range(NUM_HEADER_LINES):
         line = infile.readline()
 
+    insertSize = 100000
+    delimiter = "|"
+    minIntron = 10
     for line in infile:
         lIndex += 1
         fields = line.strip().split()
index 14be260cdf0721b0514098a22d3a07423756c41b..eadb66b83d59e140b0402a9ea76dd351219abad0 100755 (executable)
@@ -87,29 +87,13 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                       flip=False, verbose=False, stripSpace=False, cachePages=100000,
                       propertyList=[]):
 
-    delimiter = "|"
+    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
 
+    geneDict = {}
     dataType = "DNA"
     if genedatafilename is not None:
         dataType = "RNA"
         genedatafile = open(genedatafilename)
-
-
-    forcePair = False
-    if forceID is not None:
-        forcePair = True
-    else:
-        forceID = 0
-
-    maxBorder = 0
-    index = 0
-    insertSize = 100000
-
-    writeLog("%s.log" % outdbname, verstring, string.join(sys.argv[1:]))
-
-    geneDict = {}
-    mapDict = {}
-    if dataType == "RNA":
         for line in genedatafile:
             fields = line.strip().split("\t")
             blockCount = int(fields[7])
@@ -130,7 +114,6 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
                 totalLength += exonLengths[index]
 
             geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
 
         genedatafile.close()
 
@@ -164,12 +147,17 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     fields = line.split()
     readsize = len(fields[5])
     pairedTest = fields[0][-2:]
+    forcePair = False
+    if forceID is not None:
+        forcePair = True
+    else:
+        forceID = 0
+
     paired = False
     if pairedTest in ["/1", "/2"] or forcePair:
         print "assuming reads are paired"
         paired = True
 
-
     print "read size: %d bp" % readsize
     if init:
         rds.insertMetadata([("readsize", readsize)])
@@ -184,8 +172,9 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
 
     infile.close()
 
-    trim = -4
+    maxBorder = 0
     if dataType == "RNA":
+        trim = -4
         maxBorder = readsize + trim
 
     infile = open(filename, "r")
@@ -195,6 +184,8 @@ def makerdsfrombowtie(label, filename, outdbname, genedatafilename=None, init=Tr
     mInsertList = []
     sInsertList = []
     index = uIndex = mIndex = sIndex = lIndex = 0
+    delimiter = "|"
+    insertSize = 100000
     for line in infile:
         lIndex += 1
         if stripSpace:
index 66209eef973f36072ba964aac73f8579685aed99..b11848d7a93e3968d7b76b286fa415d04ffa078d 100755 (executable)
@@ -106,8 +106,6 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     insertSize = 100000
 
     geneDict = {}
-    mapDict = {}
-    seenSpliceList = []
     if dataType == 'RNA':
         genedatafile = open(geneDataFileName)
         for line in genedatafile:
@@ -120,17 +118,11 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
             chrom = fields[1]
             sense = fields[2]
             chromstarts = fields[8][:-1].split(',')
-            chromstops = fields[9][:-1].split(',')
-            exonLengths = []
-            totalLength = 0
             for index in range(blockCount):
                 chromstarts[index] = int(chromstarts[index])
-                chromstops[index] = int(chromstops[index])
-                exonLengths.append(chromstops[index] - chromstarts[index])
-                totalLength += exonLengths[index]
 
-            geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
-            mapDict[uname] = []
+            geneDict[uname] = (sense, blockCount, chrom, chromstarts)
+
         genedatafile.close()
 
     rds = ReadDataset.ReadDataset(outdbname, init, dataType, verbose=True)
@@ -322,6 +314,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
     print '%d unique reads' % index
     infile.close()
 
+    seenSpliceList = []
     if dataType == 'RNA':
         print 'mapping splices...'
         index = 0
@@ -441,7 +434,7 @@ def makeRDSFromEland2(label, filename, outdbname, doIndex=False, delimiter="|",
                     print fields
                     continue
 
-                (sense, blockCount, transLength, chrom, chromstarts, blockSizes) = geneDict[model]
+                (sense, blockCount, chrom, chromstarts) = geneDict[model]
                 if extended:
                     if 'F' in thepos:
                         rsense = '+'
index bfd6e1c48b3d4a449ea03d8f0191f0abd63e9d07..2df39a9412ab6794a1c437bead4b476c482e3918 100755 (executable)
@@ -8,6 +8,14 @@ import sys
 from cistematic.genomes import Genome
 
 
+class GeneSymbolAndCount():
+
+    def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
+        self.symbol = symbol
+        self.uniqueCount = uniqueCount
+        self.spliceCount = spliceCount
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -31,53 +39,65 @@ def main(argv=None):
 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
     hg = Genome(genome)
 
-    gidDict = {}
-    gidList = []
-    uniqueCountDict = {}
-    spliceCountDict = {}
+    gidDict = getGeneData(uniquefilecount, splicefilecount)
+    gidList = gidDict.keys()
+    gidList.sort()
+
+    outfile = open(outfilename, "w")
+    for gid in gidList:
+        featureList = hg.getGeneFeatures((genome, gid))
+        featuresizesum, splicearea = getStats(featureList, splicelead)
+        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+        geneData = gidDict[gid]
+        uniqueCount = geneData.uniqueCount
+        expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
+
+        # this p-value is based on the observed unique count, not the expected total count
+        # nor the multi-read adjusted count
+        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
+        symbol = geneData.symbol
+        spliceCount = geneData.spliceCount
+        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
+        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
 
+    outfile.close()
+
+
+def getGeneData(uniquefilecount, splicefilecount):
+    gidDict = {}
     uniquefile = open(uniquefilecount)
     for line in uniquefile:
         fields = line.strip().split()
-        gidDict[fields[0]] = fields[1]
-        gidList.append(fields[0])
-        uniqueCountDict[fields[0]] = int(fields[2])
+        geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
+        gidDict[fields[0]] = geneData
 
+    uniquefile.close()
     splicefile = open(splicefilecount)
     for line in splicefile:
         fields = line.strip().split()
-        spliceCountDict[fields[0]] = int(fields[2])
+        gidDict[fields[0]].spliceCount = int(fields[2])
 
-    outfile = open(outfilename,'w')
+    splicefile.close()
 
-    gidList.sort()
-    for gid in gidList:
-        symbol = gidDict[gid]
-        featureList = hg.getGeneFeatures((genome, gid))
-        newfeatureList = []
-        featuresizesum = 0
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
-                featuresizesum += stop - start + 1
+    return gidDict
 
-        if featuresizesum < 1:
-            featuresizesum = 1
 
-        splicearea = (len(newfeatureList) - 1) * splicelead
-        if splicearea < splicelead:
-            splicearea = 0
+def getStats(featureList, splicelead):
+    newfeatureList = []
+    featuresizesum = 0
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+            featuresizesum += stop - start + 1
 
-        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
-        expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+    if featuresizesum < 1:
+        featuresizesum = 1
 
-        # this p-value is based on the observed unique count, not the expected total count
-        # nor the multi-read adjusted count
-        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
-        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
-        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+    splicearea = (len(newfeatureList) - 1) * splicelead
+    if splicearea < splicelead:
+        splicearea = 0
 
-    outfile.close()
+    return featuresizesum, splicearea
 
 
 if __name__ == "__main__":
index 4826a13bbb31824ffea9b187d383f42b35bbf682..e012bb49436c92f50a1d8722182d0cd72a754897 100755 (executable)
@@ -4,10 +4,29 @@
 #
 
 import sys
+from commoncode import regionsOverlap
 
 print "siteintersects: version 2.1"
 
 
+class Site():
+    def __init__(self, line, doExpanded=False):
+        fields = line.strip().split()
+        if doExpanded:
+            self.chrom = fields[1][3:]
+            self.start = int(fields[2])
+            self.stop = int(fields[3])
+            self.rest = fields[4:]
+        else:
+            (chromosome, pos) = fields[0].split(":")
+            self.chrom = chromosome[3:]
+            (start, stop) = pos.split("-")
+            self.start = int(start)
+            self.stop = int(stop)
+            self.rest = fields[1:]
+
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -33,115 +52,114 @@ def main(argv=None):
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
 
-    siteDict = {}
-    file1Dict = {}
+    siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+    commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
 
-    infile1count = 0
-    infile = open(sitefilename1)
+    if doReject and siteRejectFilename is not None:
+        writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+    print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
+    siteDict = {}
+    rejectSiteDict = {}
+    processedLineCount = 0
+    infile = open(sitefilename)
     infile.readline()
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
-        infile1count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            start = int(start)
-            stop = int(stop)
-            rest = fields[1:]
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
+        rest = site.fieldList
 
         try:
             siteDict[chrom].append((start, stop, rest))
-        except:
+        except KeyError:
             siteDict[chrom] = [(start, stop, rest)]
 
         if doReject:
-            file1Dict[str((chrom, start, stop, rest))] = line
+            rejectSiteDict[str((chrom, start, stop, rest))] = line
 
     infile.close()
+    print "file1: %d" % processedLineCount
 
-    print "file1: %d" % infile1count
+    return siteDict, rejectSiteDict
 
-    infile2count = 0
-    infile = open(sitefilename2)
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
+    processedLineCount = 0
+    infile = open(siteFilename)
     infile.readline()
 
     commonSites = 0
-    unique2List = []
+    if doReject and rejectFilename is not None:
+        rejectfile = open(rejectFilename, "w")
+    else:
+        doReject=False
+
     outfile = open(outfilename, "w")
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
-        infile2count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            rest = str(fields[1:])
-
-        start = int(start)
-        stop = int(stop)
-        mid = start + abs(stop - start)/2
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
         if chrom not in siteDict:
             if doReject:
-                unique2List.append(line)
-                continue
+                rejectfile.write(line)
+
+            continue
 
-        twoNotCommon = True
+        siteNotCommon = True
         for (rstart, rstop, rline) in siteDict[chrom]:
-            rsize = abs(rstart - rstop) /2
-            rmid = rstart + abs(rstop - rstart)/2
-            if abs(mid - rmid) < rsize:
+            if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
-                if twoNotCommon:
-                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
-                    twoNotCommon = False
+                if siteNotCommon:
+                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
+                    siteNotCommon = False
 
                 try:
                     if doReject:
-                        del file1Dict[str((chrom, rstart, rstop, rline))]
-                except:
+                        del rejectSiteDict[str((chrom, rstart, rstop, rline))]
+                except KeyError:
                     pass
 
-        if doReject and twoNotCommon:
-            unique2List.append(line)
+        if doReject and siteNotCommon:
+            rejectfile.write(line)
 
-    outfile.close()
+    if doReject:
+        rejectfile.close()
 
-    print "file2: %d" % infile2count
+    outfile.close()
 
-    if doReject:
-        reject1file = open(reject1filename, "w")
-        reject2file = open(reject2filename, "w")
+    print "file2: %d" % processedLineCount
 
-        for key in file1Dict:
-            reject1file.write(file1Dict[key])
+    return commonSites
 
-        for line in unique2List:
-            reject2file.write(line)
 
-        reject1file.close()
-        reject2file.close()
+def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
+    rejectfile = open(siteRejectFilename, "w")
 
-    print commonSites
+    for key in rejectSiteDict:
+        rejectfile.write(rejectSiteDict[key])
 
+    rejectfile.close()
+    
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
diff --git a/test/testBedToRegion.py b/test/testBedToRegion.py
new file mode 100644 (file)
index 0000000..b5b4c03
--- /dev/null
@@ -0,0 +1,65 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import bedtoregion
+
+
+class TestBedToRegion(unittest.TestCase):
+
+    testBedFile = "erangeTestBedFile"
+    testRegionFile = "erangeTestRegionFile"
+
+
+    def setUp(self):
+        bedfile = open(self.testBedFile, "w")
+        bedfile.write("tab\tdelimited\tfields\n")
+        bedfile.write("space delimited fields\n")
+        bedfile.write("track\n")
+        bedfile.write("line after track will not be processed\n")
+        bedfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testBedFile)
+        except OSError:
+            print "bed file does not exist"
+
+        try:
+            os.remove(self.testRegionFile)
+        except OSError:
+            print "region file does not exist"
+
+
+    def testBedToRegion(self):
+        bedtoregion.bedToRegion("regionLabel", self.testBedFile, self.testRegionFile)
+        resultFile = open(self.testRegionFile)
+        regionList = resultFile.readlines()
+        self.assertEquals(2, len(regionList))
+        self.assertEquals("regionLabel1\ttab\tdelimited\tfields\n", regionList[0])
+        self.assertEquals("regionLabel2\tspace\tdelimited\tfields\n", regionList[1])
+
+
+    def testMain(self):
+        argv = ["bedtoregion"]
+        self.assertRaises(SystemExit, bedtoregion.main, argv)
+        argv = ["bedtoregion", "regionLabel", self.testBedFile, self.testRegionFile]
+        bedtoregion.main(argv)
+        resultFile = open(self.testRegionFile)
+        self.assertEquals(2, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBedToRegion))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testBinsToCdf.py b/test/testBinsToCdf.py
new file mode 100644 (file)
index 0000000..688e688
--- /dev/null
@@ -0,0 +1,66 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import binstocdf
+
+
+class TestBinsToCdf(unittest.TestCase):
+
+    testInputFile = "erangeTestBinFile"
+    testOutputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        binfile = open(self.testInputFile, "w")
+        binfile.write("field1\tfield2\t10\tfield4\t2\t3\n")
+        binfile.write("field1\tfield2\t20\tfield4\t1\t4\n")
+        binfile.write("field1\tfield2\t0\tfield4\n")
+        binfile.write("too short\n")
+        binfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "bin file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testBinsToCdf(self):
+        binstocdf.binToCDF(self.testInputFile, self.testOutputFile)
+        resultFile = open(self.testOutputFile)
+        resultList = resultFile.readlines()
+        self.assertEquals(3, len(resultList))
+        self.assertEquals("field1\tfield2\t10\tfield4\t20\t50\n", resultList[0])
+        self.assertEquals("field1\tfield2\t20\tfield4\t5\t25\n", resultList[1])
+        self.assertEquals("field1\tfield2\t0\tfield4\n", resultList[2])
+
+
+    def testMain(self):
+        argv = ["binstocdf"]
+        self.assertRaises(SystemExit, binstocdf.main, argv)
+        argv = ["binstocdf", self.testInputFile, self.testOutputFile]
+        binstocdf.main(argv)
+        resultFile = open(self.testOutputFile)
+        self.assertEquals(3, len(resultFile.readlines()))
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestBinsToCdf))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testCdfDist.py b/test/testCdfDist.py
new file mode 100644 (file)
index 0000000..9048749
--- /dev/null
@@ -0,0 +1,58 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import cdfdist
+
+
+class TestCdfDist(unittest.TestCase):
+
+    testInputFile = "erangeTestCDFFile"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1 30 60\n")
+        cdffile.write("line2 90 99\n")
+        cdffile.write("line3 5 80\n")
+        cdffile.write("line4 10 14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "cdf file does not exist"
+
+
+    def testCdfDist(self):
+        bins = 2
+        self.assertEquals([2, 2], cdfdist.cdfDist(bins, 10, self.testInputFile))
+        self.assertEquals([1, 2], cdfdist.cdfDist(bins, 50, self.testInputFile))
+        self.assertEquals([1, 0], cdfdist.cdfDist(bins, 89, self.testInputFile))
+        self.assertEquals([0, 1], cdfdist.cdfDist(bins, 91, self.testInputFile))
+
+
+    def testMain(self):
+        bins = 2
+        percent = 50
+        argv = ["cdfdist"]
+        self.assertRaises(SystemExit, cdfdist.main, argv)
+        argv = ["cdfdist", bins, percent, self.testInputFile]
+        cdfdist.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestCdfDist))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index b41fe659421377de86e576a7644abf9e8c5971b6..0fbbb7b2835ad2e7b8c52f38fdb4f116268444bc 100644 (file)
@@ -23,6 +23,8 @@ class TestChksnp(unittest.TestCase):
 
     snpDB = "%s/dbSNP128.db" % dbPath
     altSnpDB = "%s/snp129cDNA.db" % dbPath
+    inputFileName = "testChkSNP_input.txt"
+    outputFileName = "testChkSNP_output.txt"
 
     def setUp(self):
         pass
@@ -33,8 +35,7 @@ class TestChksnp(unittest.TestCase):
 
 
     def testChkSNPFile(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -42,10 +43,8 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName)
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName)
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -53,10 +52,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+        os.remove(self.outputFileName)
 
-        chksnp.chkSNPFile(self.snpDB, inputFileName, outputFileName, snpDBList=[self.altSnpDB])
-        outfile = open(outputFileName, "r")
+        chksnp.chkSNPFile(self.snpDB, self.inputFileName, self.outputFileName, snpDBList=[self.altSnpDB])
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -65,13 +64,12 @@ class TestChksnp(unittest.TestCase):
         self.assertEquals(result, line)
         outfile.close()
 
-        os.remove(inputFileName)
-        os.remove(outputFileName)
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
 
 
     def testMain(self):
-        inputFileName = "testChkSNP_input.txt"
-        infile = open(inputFileName, "w")
+        infile = open(self.inputFileName, "w")
         infile.write("# header line\n")
         snpEntry = string.join(["foo", "foo", "chr1", "691"], "\t")
         infile.write("%s\n" % snpEntry)
@@ -79,11 +77,9 @@ class TestChksnp(unittest.TestCase):
         infile.write("%s\n" % snpEntry)
         infile.close()
 
-        outputFileName = "testChkSNP_output.txt"
-
-        argv = ["chksnp", self.snpDB, inputFileName, outputFileName]
+        argv = ["chksnp", self.snpDB, self.inputFileName, self.outputFileName]
         chksnp.main(argv)
-        outfile = open(outputFileName, "r")
+        outfile = open(self.outputFileName, "r")
         line = outfile.readline()
         result = "foo\tfoo\tchr1\t691\trs10218492\tunknown\n"
         self.assertEquals(result, line)
@@ -91,7 +87,10 @@ class TestChksnp(unittest.TestCase):
         line = outfile.readline()
         self.assertEquals(result, line)
         outfile.close()
-        os.remove(outputFileName)
+
+        os.remove(self.inputFileName)
+        os.remove(self.outputFileName)
+
 
     def testChkSNP(self):
         snpPropertiesList = []
diff --git a/test/testColsum.py b/test/testColsum.py
new file mode 100644 (file)
index 0000000..5c149e7
--- /dev/null
@@ -0,0 +1,56 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import colsum
+
+
+class TestColsum(unittest.TestCase):
+
+    testInputFile = "erangeTestColsumFile"
+
+
+    def setUp(self):
+        colfile = open(self.testInputFile, "w")
+        colfile.write("line1 30 60.5\n")
+        colfile.write("line2 90 99\n")
+        colfile.write("line3 5 80\n")
+        colfile.write("line4 10 1\n")
+        colfile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "colsum file does not exist"
+
+
+    def testColsum(self):
+        self.assertEquals(0, colsum.colsum(0, self.testInputFile))
+        self.assertEquals(135, colsum.colsum(1, self.testInputFile))
+        self.assertEquals(240.5, colsum.colsum(2, self.testInputFile))
+        self.assertEquals(0, colsum.colsum(3, self.testInputFile))
+
+
+    def testMain(self):
+        field = 1
+        argv = ["colsum"]
+        self.assertRaises(SystemExit, colsum.main, argv)
+        argv = ["colsum", field, self.testInputFile]
+        colsum.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestColsum))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index 43eb96bc2a3153e7cbf0bb5719fa13320195512c..ce8be55e8c9d9aed892a8eba023af0e79b812a8d 100644 (file)
@@ -8,6 +8,7 @@ import os
 import string
 from array import array
 from erange import commoncode
+from erange import Region
 from cistematic.genomes import Genome
 
 
@@ -122,8 +123,10 @@ class TestCommoncode(unittest.TestCase):
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         testfile.write(regionEntry)
         testfile.close()
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegions("regionTestFile"))
+        result = commoncode.getMergedRegions("regionTestFile")
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
         os.remove("regionTestFile")
 
 
@@ -132,108 +135,277 @@ class TestCommoncode(unittest.TestCase):
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(5, 25, 20)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=5))
-        result = {"1": [(12, 18, 6)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, pad=-2))
-        result = {"chr1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, fullChrom=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=5)
+        self.assertEquals(5, result["1"][0].start)
+        self.assertEquals(25, result["1"][0].stop)
+        self.assertEquals(20, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, pad=-2)
+        self.assertEquals(12, result["1"][0].start)
+        self.assertEquals(18, result["1"][0].stop)
+        self.assertEquals(6, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, fullChrom=True)
+        self.assertEquals(10, result["chr1"][0].start)
+        self.assertEquals(20, result["chr1"][0].stop)
+        self.assertEquals(10, result["chr1"][0].length)
 
         regionEntry = string.join(["1", "chr1:10-20", "5"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, compact=True, scoreField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 40, 30)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10), (15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False))
-        result = {"1": [("1", 10, 20, 10), ("2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2", result["1"][1].label)
 
         regionEntry = string.join(["1", "spacer", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "spacer2", "chr1", "15", "40", "10"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1\tspacer", 10, 20, 10), ("2\tspacer2", 15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2))
+        result = commoncode.getMergedRegionsFromList(regionList, doMerge=False, keepLabel=True, chromField=2)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals("1\tspacer", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals("2\tspacer2", result["1"][1].label)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr1", "2030", "2040", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 2040, 2030)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, maxDist=3000))
-        result = {"1": [(10, 20, 10), (2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=5))
-        result = {"1": [(2030, 2040, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, minHits=12))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, maxDist=3000)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(2030, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=5)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(2030, result["1"][1].start)
+        self.assertEquals(2040, result["1"][1].stop)
+        self.assertEquals(10, result["1"][1].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, minHits=12)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        result = commoncode.getMergedRegionsFromList(regionList, returnTop=1)
+        self.assertEquals(2030, result["1"][0].start)
+        self.assertEquals(2040, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
 
         regionEntry = string.join(["1", "chr1", "10", "20", "+", "5"], "\t")
         regionList = [regionEntry]
         regionEntry = string.join(["2", "chr2", "15", "40", "+", "15"], "\t")
         regionList.append(regionEntry)
-        result = {"2": [(15, 40, 25)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12))
+        result = commoncode.getMergedRegionsFromList(regionList, scoreField=5, minHits=12)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
         self.assertRaises(IndexError, commoncode.getMergedRegionsFromList, regionList, scoreField=6, returnTop=1)
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=6))
         self.assertEquals({}, commoncode.getMergedRegionsFromList(regionList, scoreField=1))
 
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList = [regionEntry]
-        result = {"1": [(10, 20, 10)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList))
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+
         regionEntry = string.join(["2", "chr2", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("1", 10, 20, 10, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["3", "chr1", "15", "40", "32", "17"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("2", 15, 40, 25, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+
         regionEntry = string.join(["4", "chr2", "65", "88", "72", "7"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [("3", 10, 40, 30, 3, 40)], "2": [("4", 15, 88, 73, 32, 17)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True))
-        result = {"1": [("1", 10, 20, 10, 3, 40), ("3", 15, 40, 25, 32, 17)],
-                  "2": [("2", 15, 40, 25, 32, 17), ("4", 65, 88, 23, 72, 7)]
-        }
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(40, result["1"][0].stop)
+        self.assertEquals(30, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("3", result["1"][0].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(88, result["2"][0].stop)
+        self.assertEquals(73, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("4", result["2"][0].label)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, keepLabel=True, doMerge=False)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+        self.assertEquals("1", result["1"][0].label)
+        self.assertEquals(15, result["1"][1].start)
+        self.assertEquals(40, result["1"][1].stop)
+        self.assertEquals(25, result["1"][1].length)
+        self.assertEquals(32, result["1"][1].peakPos)
+        self.assertEquals(17, result["1"][1].peakHeight)
+        self.assertEquals("3", result["1"][1].label)
+        self.assertEquals(15, result["2"][0].start)
+        self.assertEquals(40, result["2"][0].stop)
+        self.assertEquals(25, result["2"][0].length)
+        self.assertEquals(32, result["2"][0].peakPos)
+        self.assertEquals(17, result["2"][0].peakHeight)
+        self.assertEquals("2", result["2"][0].label)
+        self.assertEquals(65, result["2"][1].start)
+        self.assertEquals(88, result["2"][1].stop)
+        self.assertEquals(23, result["2"][1].length)
+        self.assertEquals(72, result["2"][1].peakPos)
+        self.assertEquals(7, result["2"][1].peakHeight)
+        self.assertEquals("4", result["2"][1].label)
 
         regionList = ["# comment"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        result = {"1": [(10, 20, 10, 3, 40)]}
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         regionList = ["# pvalue readShift"]
         regionEntry = string.join(["1", "chr1", "10", "20", "5", "3", "40", "any value", "any shift"], "\t")
         regionList.append(regionEntry)
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True))
-        self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1))
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
+        result = commoncode.getMergedRegionsFromList(regionList, keepPeak=True, returnTop=1)
+        self.assertEquals(10, result["1"][0].start)
+        self.assertEquals(20, result["1"][0].stop)
+        self.assertEquals(10, result["1"][0].length)
+        self.assertEquals(3, result["1"][0].peakPos)
+        self.assertEquals(40, result["1"][0].peakHeight)
+
         #Test fails - the header line is required if there are fields after the peak which isn't so good
         #self.assertEquals(result, commoncode.getMergedRegionsFromList(regionList[1:], keepPeak=True))
 
@@ -274,28 +446,61 @@ class TestCommoncode(unittest.TestCase):
     #TODO: write test
     def testFindPeak(self):
         hitList = []
-        result = ([], 0.0, array("f"), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 0))
+        result = commoncode.findPeak(hitList, 0, 0)
+        self.assertEquals([], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f"), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10))
-        result = ([6, 7], 0.5, array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), 0.5)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, doWeight=True))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto"))
-        result = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 0.0, array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 0.0, 6)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift="auto", returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, doWeight=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(0.5, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0555555559694767, 0.1666666716337204, 0.3333333432674408, 0.4444444477558136, 0.5, 0.5, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.5, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, shift="auto")
+        self.assertEquals([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], result.topPos)
+        self.assertEquals(0.0, result.numHits)
+        self.assertEquals(array("f", [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(0.0, result.numPlus)
+        self.assertEquals(6, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(3, result.shift)
 
         hitList= [{"start": 4, "sense": "+", "weight": 0.5}]
-        result = ([6, 7], 1.0, array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), 1.0, 1.0)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True))
-        result = ([7], 1.0, array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), 1.0, 1.0, 3)
-        self.assertEquals(result, commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3, returnShift=True))
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True)
+        self.assertEquals([6, 7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.8888888955116272, 1.0, 1.0, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+
+        result = commoncode.findPeak(hitList, 0, 10, leftPlus=True, shift=3)
+        self.assertEquals([7], result.topPos)
+        self.assertEquals(1.0, result.numHits)
+        self.assertEquals(array('f', [0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.3333333432674408, 0.66666668653488159, 0.0, 0.0]), result.smoothArray)
+        self.assertEquals(1.0, result.numPlus)
+        self.assertEquals(1.0, result.numLeftPlus)
+        self.assertEquals(3, result.shift)
 
 
     #TODO: write test
@@ -354,10 +559,13 @@ class TestCommoncode(unittest.TestCase):
         }
         self.assertEquals(result, complementDict)
 
-        regionDict = {"I": [("new feature", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new feature"
+        regionDict = {"I": [additionalRegion]}
         featureDict = commoncode.getFeaturesByChromDict(self.genome, additionalRegionsDict=regionDict)
         result = (100, 150, "new feature", "+", "custom")
         self.assertEquals(result, featureDict["I"][0])
+        
 
 
     def testGetLocusByChromDict(self):
@@ -382,7 +590,9 @@ class TestCommoncode(unittest.TestCase):
             self.assertTrue(chrom in self.celegansChroms)
             self.assertEquals(firstLoci[chrom], locusDict[chrom][0])
 
-        regionDict = {"I": [("new region", 100, 150, 50)]}
+        additionalRegion = Region.Region(100, 150)
+        additionalRegion.label = "new region"
+        regionDict = {"I": [additionalRegion]}
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict)
         self.assertEquals((100, 150, "new region", 50), locusDict["I"][0])
         locusDict = commoncode.getLocusByChromDict(self.genome, additionalRegionsDict=regionDict, keepSense=True)
index 4455fff7211ff686832b81cb80ed6cc7adcfba03..d993fad8ba7e7f1dd208d6760d465842538ddba7 100644 (file)
@@ -11,8 +11,13 @@ Created on Sep 8, 2010
 import sys
 import unittest
 import testAnalyzeGO
+import testBedToRegion
+import testBinsToCdf
+import testCdfDist
 import testChksnp
+import testColsum
 import testCommoncode
+import testFeatureIntersects
 import testGeneMrnaCounts
 import testGeneMrnaCountsWeighted
 #import testGetFasta
@@ -23,6 +28,7 @@ import testMakeBamFromRds
 import testmakebedfromrds
 #import testMakeGraphs
 import testMakeRdsFromBam
+import testMakeSiteTrack
 import testMakeSNPTrack
 import testMarkLinkers
 import testPeak
@@ -41,8 +47,13 @@ def main(argv=None):
 
     suite = unittest.TestSuite()
     suite.addTest(testAnalyzeGO.suite())
+    suite.addTest(testBedToRegion.suite())
+    suite.addTest(testBinsToCdf.suite())
+    suite.addTest(testCdfDist.suite())
     suite.addTest(testChksnp.suite())
+    suite.addTest(testColsum.suite())
     suite.addTest(testCommoncode.suite())
+    suite.addTest(testFeatureIntersects.suite())
     suite.addTest(testGeneMrnaCounts.suite())
     suite.addTest(testGeneMrnaCountsWeighted.suite())
     #suite.addTest(testGetFasta.suite())
@@ -53,6 +64,7 @@ def main(argv=None):
     suite.addTest(testmakebedfromrds.suite())
     #suite.addTest(testMakeGraphs.suite())
     suite.addTest(testMakeRdsFromBam.suite())
+    suite.addTest(testMakeSiteTrack.suite())
     suite.addTest(testMakeSNPTrack.suite())
     suite.addTest(testMarkLinkers.suite())
     suite.addTest(testPeak.suite())
diff --git a/test/testFeatureIntersects.py b/test/testFeatureIntersects.py
new file mode 100644 (file)
index 0000000..9633f8c
--- /dev/null
@@ -0,0 +1,73 @@
+
+import unittest
+import os
+from erange import featureIntersects
+
+
+class TestFeatureIntersects(unittest.TestCase):
+
+    testInputFile = "erangeTestFeatureIntersects"
+
+
+    def setUp(self):
+        cdffile = open(self.testInputFile, "w")
+        cdffile.write("line1\tchr1\t30\t60\n")
+        cdffile.write("line2\tchr1\t90\t99\n")
+        cdffile.write("line3\tchr1\t5\t80\n")
+        cdffile.write("line4\tchr2\t10\t14\n")
+        cdffile.write("line4\tnot to be processed\n")
+        cdffile.write("line5\tchr2\t10\t14\n")
+        cdffile.close()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "position file does not exist"
+
+
+    #TODO: write test
+    def testFeatureIntersects(self):
+        pass
+
+
+    #TODO: write test
+    def testGetPositionList(self):
+        pass
+
+
+    # exercises main(): missing-argument usage exit, then a normal run
+    def testMain(self):
+        argv = ["featureIntersects"]
+        self.assertRaises(SystemExit, featureIntersects.main, argv)
+        argv = ["featureIntersects", self.testInputFile]
+        featureIntersects.main(argv)
+
+
+    def testMakeParser(self):
+        parser = featureIntersects.makeParser("")
+        argv = []
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual([], args)
+        argv = [self.testInputFile]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual(self.testInputFile, args[0])
+        self.assertEqual("TFBSCONSSITES", options.cistype)
+        self.assertEqual(100, options.radius)
+        argv = [self.testInputFile, "--cistype", "test", "--radius", "50"]
+        (options, args) = parser.parse_args(argv)
+        self.assertEqual("test", options.cistype)
+        self.assertEqual(50, options.radius)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestFeatureIntersects))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
diff --git a/test/testMakeSiteTrack.py b/test/testMakeSiteTrack.py
new file mode 100644 (file)
index 0000000..7e609e7
--- /dev/null
@@ -0,0 +1,94 @@
+'''
+Created on Dec 2, 2010
+
+@author: sau
+'''
+import unittest
+import os
+from erange import makesitetrack
+
+
+class TestMakeSiteTrack(unittest.TestCase):
+
+    testInputFile = "erangeTestSiteFile"
+    testOutputFile = "erangeTestBedFile"
+
+
+    def setUp(self):
+        self.createCompactInputFile()
+
+
+    def tearDown(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        try:
+            os.remove(self.testOutputFile)
+        except OSError:
+            print "bed file does not exist"
+
+
+    def createStandardInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("name\tchr1\t100\t175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def createCompactInputFile(self):
+        siteFile = open(self.testInputFile, "w")
+        siteFile.write("#comment line\n")
+        siteFile.write("chr1:100-175\t11\t+\t-\n")
+        siteFile.close()
+
+
+    def testMakeSiteTrack(self):
+        self.createCompactInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile)
+        bedfile = open(self.testOutputFile)
+        header = 'track name="" visibility=4 itemRgb="On"\n'
+        self.assertEquals(header, bedfile.readline())
+        result = "chr1\t100\t176\t-1\t11\t+\t-\t-\t0,0,0\n"
+        self.assertEquals(result, bedfile.readline())
+
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, append=True, noHeader=True)
+        bedfile = open(self.testOutputFile)
+        header = bedfile.readline()
+        firstEntry = bedfile.readline()
+        self.assertEquals(firstEntry, bedfile.readline())
+
+
+    def testNonCompactMakeSiteTrack(self):
+        try:
+            os.remove(self.testInputFile)
+        except OSError:
+            print "site file does not exist"
+
+        self.createStandardInputFile()
+        makesitetrack.makesitetrack(self.testInputFile, self.testOutputFile, compact=False, color="255,255,255")
+        bedfile = open(self.testOutputFile)
+        bedfile.readline()
+        result = "chr1\t100\t176\t-1\t1.0\t+\t-\t-\t255,255,255\n"
+        self.assertEquals(result, bedfile.readline())
+
+
+    def testMain(self):
+        self.createCompactInputFile()
+        argv = ["makesitetrack"]
+        self.assertRaises(SystemExit, makesitetrack.main, argv)
+        argv = ["makesitetrack", self.testInputFile, self.testOutputFile]
+        makesitetrack.main(argv)
+
+
+def suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestMakeSiteTrack))
+
+    return suite
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
\ No newline at end of file
index f4b1691a07a08b15988539f75f1fbb2a62e148b2..9fd041d4462abb552eef87ebd82430177106231f 100755 (executable)
@@ -168,7 +168,6 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
-            #TODO: Is this right? If match index is 0 are we doing nothing?
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1
@@ -330,4 +329,4 @@ def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)