X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fcore%2Fmotifextension.c;fp=cistematic%2Fcore%2Fmotifextension.c;h=c069d0c94e56f5aec6f4e6527d2e8eac7d4e59bc;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/core/motifextension.c b/cistematic/core/motifextension.c new file mode 100644 index 0000000..c069d0c --- /dev/null +++ b/cistematic/core/motifextension.c @@ -0,0 +1,646 @@ +/*##########################################################################*/ +/*# #*/ +/*# C O P Y R I G H T N O T I C E #*/ +/*# Copyright (c) 2003-10 by: #*/ +/*# * California Institute of Technology #*/ +/*# #*/ +/*# All Rights Reserved. #*/ +/*# #*/ +/*# Permission is hereby granted, free of charge, to any person #*/ +/*# obtaining a copy of this software and associated documentation files #*/ +/*# (the "Software"), to deal in the Software without restriction, #*/ +/*# including without limitation the rights to use, copy, modify, merge, #*/ +/*# publish, distribute, sublicense, and/or sell copies of the Software, #*/ +/*# and to permit persons to whom the Software is furnished to do so, #*/ +/*# subject to the following conditions: #*/ +/*# #*/ +/*# The above copyright notice and this permission notice shall be #*/ +/*# included in all copies or substantial portions of the Software. #*/ +/*# #*/ +/*# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #*/ +/*# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #*/ +/*# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #*/ +/*# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #*/ +/*# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #*/ +/*# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #*/ +/*# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #*/ +/*# SOFTWARE. #*/ +/*###########################################################################*/ +/*# C extension for commonly used motif methods. */ + +#include +#include +#include +#include + +static char module_doc[] = +"This module only implements the locateMotif and correlateMotifs in C for now."; + +double probMatchPWM_func(double *PWM, int index, char *aSeq, long seqPos) { + double aval, cval, gval, tval; + + aval = PWM[4 * index]; + cval = PWM[4 * index + 1]; + gval = PWM[4 * index + 2]; + tval = PWM[4 * index + 3]; + + switch (aSeq[seqPos]) { + case 'A': case 'a': + return aval; + break; + case 'C': case 'c': + return cval; + break; + case 'G': case 'g': + return gval; + break; + case 'T': case 't': + return tval; + break; + case 'N': case 'n': + return 1.0; + break; + case 'S': case 's': + return cval + gval; + break; + case 'W': case 'w': + return aval + tval; + break; + case 'R': case 'r': + return aval + gval; + break; + case 'Y': case 'y': + return cval + tval; + break; + case 'M': case 'm': + return aval + cval; + break; + case 'K': case 'k': + return gval + tval; + break; + case 'B': case 'b': + return cval + gval + tval; + break; + case 'D': case 'd': + return aval + gval + tval; + break; + case 'H': case 'h': + return aval + cval + tval; + break; + case 'V': case 'v': + return aval + cval + gval; + break; + } + return 0.0; +} + +double probMatchDPWM_func(double *PWM, int index, char *aSeq, long seqPos) { + double aval, cval, gval, tval; + int prevIndex; + + aval = 0.0; + cval = 0.0; + gval = 0.0; + tval = 0.0; + + if (index == 0) { + for (prevIndex = 0; prevIndex < 4; prevIndex++) { + aval += PWM[prevIndex * 4]; + cval += PWM[prevIndex * 4 + 1]; + gval += PWM[prevIndex * 4 + 2]; + tval += PWM[prevIndex * 4 + 3]; + } + } else { + if (seqPos > 0) { + switch(aSeq[seqPos - 1]) { + case 'A': case 'a': + aval = PWM[16 * index]; + cval = PWM[16 * index + 1]; + gval = PWM[16 * index + 2]; + tval = PWM[16 * index + 3]; + break; + case 'C': case 'c': + aval = PWM[16 * index + 4]; + cval = PWM[16 * index + 5]; + gval = PWM[16 * index + 6]; + tval = PWM[16 * index + 7]; + break; + case 'G': case 'g': + aval = PWM[16 * index + 8]; + cval = PWM[16 * index + 9]; + gval = PWM[16 * index + 10]; + tval = PWM[16 * index + 11]; + break; + case 'T': case 't': + aval = PWM[16 * index + 12]; + cval = PWM[16 * index + 13]; + gval = PWM[16 * index + 14]; + tval = PWM[16 * index + 15]; + break; + } + } + } + + switch (aSeq[seqPos]) { + case 'A': case 'a': + return aval; + break; + case 'C': case 'c': + return cval; + break; + case 'G': case 'g': + return gval; + break; + case 'T': case 't': + return tval; + break; + } + + return 0.0001; +} + +void scoreMot_func(char *theSeq, double *mPWM, double *rPWM, long pos, long motLen, float *bestCons, float *maxDiff, float *score, char *sense) +{ + float forScore, revScore; + long index, currentPos; + + forScore = 0.0; + revScore = 0.0; + *sense = 'F'; + + for (index = 0; index < motLen; index++) { + currentPos = pos + index; + forScore += probMatchPWM_func(mPWM, index, theSeq, currentPos); + revScore += probMatchPWM_func(rPWM, index, theSeq, currentPos); + } + + if (((forScore + *maxDiff) < *bestCons) && ((revScore + *maxDiff) < *bestCons)) { + *score = -1.0; + return; + } + + if (forScore > revScore) { + *score = forScore; + } else { + *score = revScore; + *sense = 'R'; + } +} + +void scoreDPWM_func(char *theSeq, double *mDPWM, double *rDPWM, long pos, long motLen, float *bestCons, float *score, char *sense) +{ + float forScore, revScore; + long index, currentPos; + + forScore = 0.0; + revScore = 0.0; + *sense = 'F'; + + for (index = 0; index < motLen; index++) { + currentPos = pos + index; + forScore -= log(probMatchDPWM_func(mDPWM, index, theSeq, currentPos)) / log(2.0); + revScore -= log(probMatchDPWM_func(rDPWM, index, theSeq, currentPos)) / log(2.0); + } + + if ((forScore > *bestCons) && (revScore > *bestCons)) { + *score = -1.0; + return; + } + + if (forScore < revScore) { + *score = forScore; + } else { + *score = revScore; + *sense = 'R'; + } +} + +static PyObject* +the_func(PyObject *self, PyObject *args) +{ + PyObject *pySeq, *motifPWM, *revPWM, *mScore, *mDiff, *results; + double *mPWM, *rPWM; + float maxScore, maxDiff, seqScore; + long pos, maxPos; + int ok, index, indexMax, ntIndex, motLen, seqLen, skipping; + char *seq, sense; + + results = Py_BuildValue("[]"); + + ok = PyArg_UnpackTuple(args, "ref", 5, 5, &pySeq, &motifPWM, &revPWM, &mScore, &mDiff); + + seq = PyString_AsString(pySeq); + motLen = PyList_Size(motifPWM); + seqLen = PyString_Size(pySeq); + maxScore = PyFloat_AsDouble(mScore); + maxDiff = PyFloat_AsDouble(mDiff); + + indexMax = 4 * motLen; + mPWM = malloc(indexMax * sizeof(double)); + rPWM = malloc(indexMax * sizeof(double)); + + for (index = 0; index < motLen; index++) { + for (ntIndex = 0; ntIndex < 4; ntIndex++) { + mPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(motifPWM, index), ntIndex)); + rPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(revPWM, index), ntIndex)); + } + } + + pos = 0; + maxPos = seqLen - motLen; + + while (pos <= maxPos) { + skipping = 0; + for (index = 0; index < motLen; index++) { + if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) { + skipping = index + 1; + } + } + + if (skipping) { + pos += skipping; + continue; + } + + scoreMot_func(seq, mPWM, rPWM, pos, motLen, &maxScore, &maxDiff, &seqScore, &sense); + + if (seqScore > 1.0) { + PyList_Append(results, Py_BuildValue("(i, c)", pos, sense)); + } + pos++; + } + + free(mPWM); + free(rPWM); + return results; +} + +static PyObject* +dpwm_func(PyObject *self, PyObject *args) +{ + PyObject *pySeq, *motifPWM, *revPWM, *mScore, *results; + double *mDPWM, *rDPWM; + float maxScore, seqScore; + long pos, maxPos; + int ok, index, indexMax, prevIndex, currentIndex, motLen, seqLen, skipping; + char *seq, sense; + + results = Py_BuildValue("[]"); + + ok = PyArg_UnpackTuple(args, "ref", 4, 4, &pySeq, &motifPWM, &revPWM, &mScore); + + seq = PyString_AsString(pySeq); + motLen = PyList_Size(motifPWM); + seqLen = PyString_Size(pySeq); + maxScore = PyFloat_AsDouble(mScore); + + indexMax = 16 * motLen; + mDPWM = malloc(indexMax * sizeof(double)); + rDPWM = malloc(indexMax * sizeof(double)); + + for (index = 0; index < motLen; index++) { + for (prevIndex = 0; prevIndex < 4; prevIndex++) { + for (currentIndex = 0; currentIndex < 4; currentIndex++) { + mDPWM[16 * index + 4 * prevIndex + currentIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyList_GetItem(motifPWM, index), prevIndex), currentIndex)); + rDPWM[16 * index + 4 * prevIndex + currentIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyList_GetItem(revPWM, index), prevIndex), currentIndex)); + } + } + } + + pos = 0; + maxPos = seqLen - motLen; + + while (pos <= maxPos) { + skipping = 0; + for (index = 0; index < motLen; index++) { + if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) { + skipping = index + 1; + } + } + + if (skipping) { + pos += skipping; + continue; + } + + scoreDPWM_func(seq, mDPWM, rDPWM, pos, motLen, &maxScore, &seqScore, &sense); + + if (seqScore > 1.0) { + PyList_Append(results, Py_BuildValue("(i, c)", pos, sense)); + } + pos++; + } + + free(mDPWM); + free(rDPWM); + return results; +} + +static PyObject* +mer_func(PyObject *self, PyObject *args) +{ + PyObject *pySeq, *forwardMer, *revcompMer, *maxMismatches, *results; + long pos, maxPos; + int ok, index, motLen, seqLen, skipping, mismatches, fmis, rmis; + char *fMer, *rMer; + char *seq, sense; + + results = Py_BuildValue("[]"); + + ok = PyArg_UnpackTuple(args, "ref", 4, 4, &pySeq, &forwardMer, &revcompMer, &maxMismatches); + + seq = PyString_AsString(pySeq); + fMer = PyString_AsString(forwardMer); + rMer = PyString_AsString(revcompMer); + motLen = PyString_Size(forwardMer); + seqLen = PyString_Size(pySeq); + mismatches = PyInt_AsLong(maxMismatches); + + pos = 0; + maxPos = seqLen - motLen; + + while (pos <= maxPos) { + skipping = 0; + for (index = 0; index < motLen; index++) { + if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) { + skipping = index + 1; + } + } + + if (skipping) { + pos += skipping; + continue; + } + + fmis = 0; + rmis = 0; + for (index = 0; index < motLen; index++) { + if (seq[pos + index] != fMer[index]) { + fmis++; + } + if (seq[pos + index] != rMer[index]) { + rmis++; + } + if ((fmis > mismatches) && (rmis > mismatches)) { + break; + } + } + + if ((fmis <= mismatches) || (rmis <= mismatches)) { + sense = 'F'; + if (rmis < fmis) { + sense = 'R'; + } + PyList_Append(results, Py_BuildValue("(i, c)", pos, sense)); + } + pos++; + } + + return results; +} + +double pearson_func(double *colA, double *colB, int pos) +{ + double c, numerator; + double meanA, denominatorA; + double meanB, denominatorB; + long index; + + meanA = 0.0; + meanB = 0.0; + + for (index = 0; index < 4; index++) { + meanA += colA[pos + index]; + meanB += colB[pos + index]; + } + + meanA /= 4; + meanB /= 4; + + denominatorA = 0.0; + denominatorB = 0.0; + numerator = 0.0; + + for (index = 0; index < 4; index++) { + numerator += (colA[pos + index] - meanA) * (colB[pos + index] - meanB); + denominatorA += (colA[pos + index] - meanA) * (colA[pos + index] - meanA); + denominatorB += (colB[pos + index] - meanB) * (colB[pos + index] - meanB); + } + + if (denominatorA == 0.0 || denominatorB == 0.0) { + c = 0.0; + } else { + c = numerator / sqrt(denominatorA * denominatorB); + } + + return c; +} + +static PyObject* +corr_func(PyObject *self, PyObject *args) +{ + PyObject *PyaPWM, *PybPWM, *PycPWM, *PyMaxSlide; + double *aPWM, *bPWM, *cPWM, *tempA, *tempB, *tempC; + float fscore, rscore, bestScore; + long maxSlide; + int ok, index, indexMax, bIndexMax, ntIndex, motLen, padLen, slide, adjustedPadLen, adjustedSlide, tempMax, tempSize; + + bestScore = 0.0; + + ok = PyArg_UnpackTuple(args, "corr", 4, 4, &PyaPWM, &PybPWM, &PycPWM, &PyMaxSlide); + + motLen = PyList_Size(PyaPWM); + padLen = motLen - PyList_Size(PybPWM); + maxSlide = PyInt_AsLong(PyMaxSlide); + + if (maxSlide > motLen) { + maxSlide = motLen - 1; + } + + indexMax = 4 * motLen; + bIndexMax = 4 * (motLen - padLen); + aPWM = malloc(indexMax * sizeof(double)); + bPWM = malloc(bIndexMax * sizeof(double)); + cPWM = malloc(bIndexMax * sizeof(double)); + + for (index = 0; index < (motLen - padLen); index++) { + for (ntIndex = 0; ntIndex < 4; ntIndex++) { + aPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyaPWM, index), ntIndex)); + bPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PybPWM, index), ntIndex)); + cPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PycPWM, index), ntIndex)); + } + } + + for (; index < motLen; index++) { + for (ntIndex = 0; ntIndex < 4; ntIndex++) { + aPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyaPWM, index), ntIndex)); + } + } + + for (slide = -1 * maxSlide; slide < (maxSlide + padLen + 1); slide++ ) { + tempA = malloc(3 * indexMax * sizeof(double)); + tempB = malloc(3 * indexMax * sizeof(double)); + tempC = malloc(3 * indexMax * sizeof(double)); + if (slide < 0) { + tempSize = abs(slide) + motLen; + tempMax = 4 * tempSize; + for (index = 0; index < 4 * abs(slide); index++) { + tempA[index] = 0.25; + } + for (; index < tempMax; index++) { + tempA[index] = aPWM[index - 4 * slide]; + } + for (index = 0; index < bIndexMax; index++) { + tempB[index] = bPWM[index]; + tempC[index] = cPWM[index]; + } + for (; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } else if ((slide > 0) && (slide <= maxSlide)) { + if (padLen > 0) { + if (padLen >= slide) { + adjustedPadLen = padLen - slide; + adjustedSlide = 0; + } else { + adjustedPadLen = 0; + adjustedSlide = slide - padLen; + } + tempSize = motLen + adjustedSlide; + tempMax = indexMax + 4 * adjustedSlide; + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for (; index < tempMax; index++) { + tempA[index] = 0.25; + } + tempMax = 4 * slide; + for (index = 0; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + tempMax += bIndexMax; + for (; index < tempMax; index++) { + tempB[index] = bPWM[index - 4 * slide]; + tempC[index] = cPWM[index - 4 * slide]; + } + tempMax = indexMax + 4 * adjustedSlide; + for (; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } else { + tempSize = motLen + slide; + tempMax = indexMax + 4 * slide; + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for (; index < tempMax; index++) { + tempA[index] = 0.25; + } + tempMax = 4 * slide; + for (index = 0; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + tempMax += bIndexMax; + for (; index < tempMax; index++) { + tempB[index] = bPWM[index - 4 * slide]; + tempC[index] = cPWM[index - 4 * slide]; + } + } + } else if (slide > maxSlide) { + tempSize = motLen + maxSlide; + tempMax = indexMax + 4 * maxSlide; + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for (; index < tempMax; index++) { + tempA[index] = 0.25; + } + tempMax = 4 * slide; + for (index = 0; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + tempMax += bIndexMax; + for (; index < tempMax; index++) { + tempB[index] = bPWM[index - 4 * slide]; + tempC[index] = cPWM[index - 4 * slide]; + } + tempMax = indexMax + 4 * maxSlide; + for (; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } else { + tempSize = motLen; + + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for (index = 0; index < bIndexMax; index++) { + tempB[index] = bPWM[index]; + tempC[index] = cPWM[index]; + } + for (; index < indexMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } + fscore = 0.0; + rscore = 0.0; + + for (index = 0; index bestScore)) { + bestScore = rscore; + } else if (fscore > bestScore) { + bestScore = fscore; + } + + free(tempA); + free(tempB); + free(tempC); + } + + free(aPWM); + free(bPWM); + free(cPWM); + + return PyFloat_FromDouble(bestScore); +} + +static char the_func_doc[] = +"returns a list of positions on aSeq that match the PWM within a Threshold, given as a percentage of the optimal consensus score."; + +static char dpwm_func_doc[] = +"returns a list of positions on aSeq that match the DPWM within a given Fold of the optimal consensus score."; + +static char mer_func_doc[] = +"returns a list of positions on aSeq that match an N-mer within M mismatches. Assumes Mers and Seq are in the same case."; + +static char corr_func_doc[] = +"returns a pearson-correlation coefficient based similarity value between -1 (anti-correlated) and +1 (identical) for two motifs."; + +static PyMethodDef module_methods[] = { + {"locateMotif", the_func, METH_VARARGS, the_func_doc}, + {"locateMarkov1", dpwm_func, METH_VARARGS, dpwm_func_doc}, + {"locateMer", mer_func, METH_VARARGS, mer_func_doc}, + {"correlateMotifs", corr_func, METH_VARARGS, corr_func_doc}, + {NULL, NULL} +}; + +PyMODINIT_FUNC +init_motif(void) +{ + Py_InitModule3("_motif", module_methods, module_doc); +}