erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / cisstat / score.py
diff --git a/cistematic/cisstat/score.py b/cistematic/cisstat/score.py
new file mode 100644 (file)
index 0000000..36a747b
--- /dev/null
@@ -0,0 +1,109 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+# Score formulas
+#
+from math import log, sqrt, exp
+
+try:
+    import _stat
+    hasStatExtension = True
+except:
+    hasStatExtension = False
+
+
+def llScore(observed, expected):
+    """ returns log likelihood for one pair of observed vs. expected.
+    """
+    score = float(observed) * log(float(observed) / float(expected))
+    return score
+
+
+def zScore(observed, expected, prob):
+    """ returns (observed - expected)/(standard deviation), where the standard deviation 
+        is calculated assuming normality as sqrt(obs * prob * (1 - prob)).
+    """
+    score = (float(observed) - float(expected)) / sqrt(float(observed) * prob * (1 - prob))
+    return score
+
+
+def expected(length, prob):
+    return length * prob
+
+
+def pvalue(zvalue):
+    """ approximation for converting from zvalue to P-value using Hamaker's 
+        formula as described in Zar (App18). This is dependable down to 
+        abs(z) ~ 0.2
+    """
+    z = abs(zvalue)
+    c = 0.806 * z * (1 - 0.018 * z)
+    pval = (1 - sqrt(1 - exp(-1 * (c * c))))/2
+
+    if zvalue < 0:
+        pval = 1 - pval
+
+    return pval
+
+
+def pearsonCorrelation(colA, colB):
+    if hasStatExtension:
+        return _stat.pearsonCorrelation(colA, colB)
+    else:
+        return localPearsonCorrelation(colA, colB)
+
+
+def localPearsonCorrelation(colA, colB):
+    meanA = 0.0
+    meanB = 0.0
+
+    length = len(colA)
+    if length > len(colB):
+        length = len(colB)
+
+    for index in range(length):
+        meanA += colA[index]
+        meanB += colB[index]
+
+    meanA /= length
+    meanB /= length
+
+    numerator = 0.0
+    denominatorA = 0.0
+    denominatorB = 0.0
+
+    for index in range(length):
+        numerator += (colA[index] - meanA) * (colB[index] - meanB)
+        denominatorA += (colA[index] - meanA) * (colA[index] - meanA)
+        denominatorB += (colB[index] - meanB) * (colB[index] - meanB)
+
+    if denominatorA == 0.0 or denominatorB == 0.0:
+        return 0.0
+
+    return (numerator / sqrt(denominatorA * denominatorB))
\ No newline at end of file