1 /*##########################################################################*/
3 /*# C O P Y R I G H T N O T I C E #*/
4 /*# Copyright (c) 2003-10 by: #*/
5 /*# * California Institute of Technology #*/
7 /*# All Rights Reserved. #*/
9 /*# Permission is hereby granted, free of charge, to any person #*/
10 /*# obtaining a copy of this software and associated documentation files #*/
11 /*# (the "Software"), to deal in the Software without restriction, #*/
12 /*# including without limitation the rights to use, copy, modify, merge, #*/
13 /*# publish, distribute, sublicense, and/or sell copies of the Software, #*/
14 /*# and to permit persons to whom the Software is furnished to do so, #*/
15 /*# subject to the following conditions: #*/
17 /*# The above copyright notice and this permission notice shall be #*/
18 /*# included in all copies or substantial portions of the Software. #*/
20 /*# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #*/
21 /*# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #*/
22 /*# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #*/
23 /*# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #*/
24 /*# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #*/
25 /*# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #*/
26 /*# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #*/
28 /*###########################################################################*/
29 /*# C extension for commonly used motif methods. */
36 static char module_doc[] =
37 "This module only implements the locateMotif and correlateMotifs in C for now.";
39 double probMatchPWM_func(double *PWM, int index, char *aSeq, long seqPos) {
40 double aval, cval, gval, tval;
42 aval = PWM[4 * index];
43 cval = PWM[4 * index + 1];
44 gval = PWM[4 * index + 2];
45 tval = PWM[4 * index + 3];
47 switch (aSeq[seqPos]) {
82 return cval + gval + tval;
85 return aval + gval + tval;
88 return aval + cval + tval;
91 return aval + cval + gval;
97 double probMatchDPWM_func(double *PWM, int index, char *aSeq, long seqPos) {
98 double aval, cval, gval, tval;
107 for (prevIndex = 0; prevIndex < 4; prevIndex++) {
108 aval += PWM[prevIndex * 4];
109 cval += PWM[prevIndex * 4 + 1];
110 gval += PWM[prevIndex * 4 + 2];
111 tval += PWM[prevIndex * 4 + 3];
115 switch(aSeq[seqPos - 1]) {
117 aval = PWM[16 * index];
118 cval = PWM[16 * index + 1];
119 gval = PWM[16 * index + 2];
120 tval = PWM[16 * index + 3];
123 aval = PWM[16 * index + 4];
124 cval = PWM[16 * index + 5];
125 gval = PWM[16 * index + 6];
126 tval = PWM[16 * index + 7];
129 aval = PWM[16 * index + 8];
130 cval = PWM[16 * index + 9];
131 gval = PWM[16 * index + 10];
132 tval = PWM[16 * index + 11];
135 aval = PWM[16 * index + 12];
136 cval = PWM[16 * index + 13];
137 gval = PWM[16 * index + 14];
138 tval = PWM[16 * index + 15];
144 switch (aSeq[seqPos]) {
162 void scoreMot_func(char *theSeq, double *mPWM, double *rPWM, long pos, long motLen, float *bestCons, float *maxDiff, float *score, char *sense)
164 float forScore, revScore;
165 long index, currentPos;
171 for (index = 0; index < motLen; index++) {
172 currentPos = pos + index;
173 forScore += probMatchPWM_func(mPWM, index, theSeq, currentPos);
174 revScore += probMatchPWM_func(rPWM, index, theSeq, currentPos);
177 if (((forScore + *maxDiff) < *bestCons) && ((revScore + *maxDiff) < *bestCons)) {
182 if (forScore > revScore) {
190 void scoreDPWM_func(char *theSeq, double *mDPWM, double *rDPWM, long pos, long motLen, float *bestCons, float *score, char *sense)
192 float forScore, revScore;
193 long index, currentPos;
199 for (index = 0; index < motLen; index++) {
200 currentPos = pos + index;
201 forScore -= log(probMatchDPWM_func(mDPWM, index, theSeq, currentPos)) / log(2.0);
202 revScore -= log(probMatchDPWM_func(rDPWM, index, theSeq, currentPos)) / log(2.0);
205 if ((forScore > *bestCons) && (revScore > *bestCons)) {
210 if (forScore < revScore) {
219 the_func(PyObject *self, PyObject *args)
221 PyObject *pySeq, *motifPWM, *revPWM, *mScore, *mDiff, *results;
223 float maxScore, maxDiff, seqScore;
225 int ok, index, indexMax, ntIndex, motLen, seqLen, skipping;
228 results = Py_BuildValue("[]");
230 ok = PyArg_UnpackTuple(args, "ref", 5, 5, &pySeq, &motifPWM, &revPWM, &mScore, &mDiff);
232 seq = PyString_AsString(pySeq);
233 motLen = PyList_Size(motifPWM);
234 seqLen = PyString_Size(pySeq);
235 maxScore = PyFloat_AsDouble(mScore);
236 maxDiff = PyFloat_AsDouble(mDiff);
238 indexMax = 4 * motLen;
239 mPWM = malloc(indexMax * sizeof(double));
240 rPWM = malloc(indexMax * sizeof(double));
242 for (index = 0; index < motLen; index++) {
243 for (ntIndex = 0; ntIndex < 4; ntIndex++) {
244 mPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(motifPWM, index), ntIndex));
245 rPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(revPWM, index), ntIndex));
250 maxPos = seqLen - motLen;
252 while (pos <= maxPos) {
254 for (index = 0; index < motLen; index++) {
255 if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) {
256 skipping = index + 1;
265 scoreMot_func(seq, mPWM, rPWM, pos, motLen, &maxScore, &maxDiff, &seqScore, &sense);
267 if (seqScore > 1.0) {
268 PyList_Append(results, Py_BuildValue("(i, c)", pos, sense));
279 dpwm_func(PyObject *self, PyObject *args)
281 PyObject *pySeq, *motifPWM, *revPWM, *mScore, *results;
282 double *mDPWM, *rDPWM;
283 float maxScore, seqScore;
285 int ok, index, indexMax, prevIndex, currentIndex, motLen, seqLen, skipping;
288 results = Py_BuildValue("[]");
290 ok = PyArg_UnpackTuple(args, "ref", 4, 4, &pySeq, &motifPWM, &revPWM, &mScore);
292 seq = PyString_AsString(pySeq);
293 motLen = PyList_Size(motifPWM);
294 seqLen = PyString_Size(pySeq);
295 maxScore = PyFloat_AsDouble(mScore);
297 indexMax = 16 * motLen;
298 mDPWM = malloc(indexMax * sizeof(double));
299 rDPWM = malloc(indexMax * sizeof(double));
301 for (index = 0; index < motLen; index++) {
302 for (prevIndex = 0; prevIndex < 4; prevIndex++) {
303 for (currentIndex = 0; currentIndex < 4; currentIndex++) {
304 mDPWM[16 * index + 4 * prevIndex + currentIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyList_GetItem(motifPWM, index), prevIndex), currentIndex));
305 rDPWM[16 * index + 4 * prevIndex + currentIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyList_GetItem(revPWM, index), prevIndex), currentIndex));
311 maxPos = seqLen - motLen;
313 while (pos <= maxPos) {
315 for (index = 0; index < motLen; index++) {
316 if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) {
317 skipping = index + 1;
326 scoreDPWM_func(seq, mDPWM, rDPWM, pos, motLen, &maxScore, &seqScore, &sense);
328 if (seqScore > 1.0) {
329 PyList_Append(results, Py_BuildValue("(i, c)", pos, sense));
340 mer_func(PyObject *self, PyObject *args)
342 PyObject *pySeq, *forwardMer, *revcompMer, *maxMismatches, *results;
344 int ok, index, motLen, seqLen, skipping, mismatches, fmis, rmis;
348 results = Py_BuildValue("[]");
350 ok = PyArg_UnpackTuple(args, "ref", 4, 4, &pySeq, &forwardMer, &revcompMer, &maxMismatches);
352 seq = PyString_AsString(pySeq);
353 fMer = PyString_AsString(forwardMer);
354 rMer = PyString_AsString(revcompMer);
355 motLen = PyString_Size(forwardMer);
356 seqLen = PyString_Size(pySeq);
357 mismatches = PyInt_AsLong(maxMismatches);
360 maxPos = seqLen - motLen;
362 while (pos <= maxPos) {
364 for (index = 0; index < motLen; index++) {
365 if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) {
366 skipping = index + 1;
377 for (index = 0; index < motLen; index++) {
378 if (seq[pos + index] != fMer[index]) {
381 if (seq[pos + index] != rMer[index]) {
384 if ((fmis > mismatches) && (rmis > mismatches)) {
389 if ((fmis <= mismatches) || (rmis <= mismatches)) {
394 PyList_Append(results, Py_BuildValue("(i, c)", pos, sense));
402 double pearson_func(double *colA, double *colB, int pos)
405 double meanA, denominatorA;
406 double meanB, denominatorB;
412 for (index = 0; index < 4; index++) {
413 meanA += colA[pos + index];
414 meanB += colB[pos + index];
424 for (index = 0; index < 4; index++) {
425 numerator += (colA[pos + index] - meanA) * (colB[pos + index] - meanB);
426 denominatorA += (colA[pos + index] - meanA) * (colA[pos + index] - meanA);
427 denominatorB += (colB[pos + index] - meanB) * (colB[pos + index] - meanB);
430 if (denominatorA == 0.0 || denominatorB == 0.0) {
433 c = numerator / sqrt(denominatorA * denominatorB);
440 corr_func(PyObject *self, PyObject *args)
442 PyObject *PyaPWM, *PybPWM, *PycPWM, *PyMaxSlide;
443 double *aPWM, *bPWM, *cPWM, *tempA, *tempB, *tempC;
444 float fscore, rscore, bestScore;
446 int ok, index, indexMax, bIndexMax, ntIndex, motLen, padLen, slide, adjustedPadLen, adjustedSlide, tempMax, tempSize;
450 ok = PyArg_UnpackTuple(args, "corr", 4, 4, &PyaPWM, &PybPWM, &PycPWM, &PyMaxSlide);
452 motLen = PyList_Size(PyaPWM);
453 padLen = motLen - PyList_Size(PybPWM);
454 maxSlide = PyInt_AsLong(PyMaxSlide);
456 if (maxSlide > motLen) {
457 maxSlide = motLen - 1;
460 indexMax = 4 * motLen;
461 bIndexMax = 4 * (motLen - padLen);
462 aPWM = malloc(indexMax * sizeof(double));
463 bPWM = malloc(bIndexMax * sizeof(double));
464 cPWM = malloc(bIndexMax * sizeof(double));
466 for (index = 0; index < (motLen - padLen); index++) {
467 for (ntIndex = 0; ntIndex < 4; ntIndex++) {
468 aPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyaPWM, index), ntIndex));
469 bPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PybPWM, index), ntIndex));
470 cPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PycPWM, index), ntIndex));
474 for (; index < motLen; index++) {
475 for (ntIndex = 0; ntIndex < 4; ntIndex++) {
476 aPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyaPWM, index), ntIndex));
480 for (slide = -1 * maxSlide; slide < (maxSlide + padLen + 1); slide++ ) {
481 tempA = malloc(3 * indexMax * sizeof(double));
482 tempB = malloc(3 * indexMax * sizeof(double));
483 tempC = malloc(3 * indexMax * sizeof(double));
485 tempSize = abs(slide) + motLen;
486 tempMax = 4 * tempSize;
487 for (index = 0; index < 4 * abs(slide); index++) {
490 for (; index < tempMax; index++) {
491 tempA[index] = aPWM[index - 4 * slide];
493 for (index = 0; index < bIndexMax; index++) {
494 tempB[index] = bPWM[index];
495 tempC[index] = cPWM[index];
497 for (; index < tempMax; index++) {
501 } else if ((slide > 0) && (slide <= maxSlide)) {
503 if (padLen >= slide) {
504 adjustedPadLen = padLen - slide;
508 adjustedSlide = slide - padLen;
510 tempSize = motLen + adjustedSlide;
511 tempMax = indexMax + 4 * adjustedSlide;
512 for (index = 0; index < indexMax; index++) {
513 tempA[index] = aPWM[index];
515 for (; index < tempMax; index++) {
519 for (index = 0; index < tempMax; index++) {
523 tempMax += bIndexMax;
524 for (; index < tempMax; index++) {
525 tempB[index] = bPWM[index - 4 * slide];
526 tempC[index] = cPWM[index - 4 * slide];
528 tempMax = indexMax + 4 * adjustedSlide;
529 for (; index < tempMax; index++) {
534 tempSize = motLen + slide;
535 tempMax = indexMax + 4 * slide;
536 for (index = 0; index < indexMax; index++) {
537 tempA[index] = aPWM[index];
539 for (; index < tempMax; index++) {
543 for (index = 0; index < tempMax; index++) {
547 tempMax += bIndexMax;
548 for (; index < tempMax; index++) {
549 tempB[index] = bPWM[index - 4 * slide];
550 tempC[index] = cPWM[index - 4 * slide];
553 } else if (slide > maxSlide) {
554 tempSize = motLen + maxSlide;
555 tempMax = indexMax + 4 * maxSlide;
556 for (index = 0; index < indexMax; index++) {
557 tempA[index] = aPWM[index];
559 for (; index < tempMax; index++) {
563 for (index = 0; index < tempMax; index++) {
567 tempMax += bIndexMax;
568 for (; index < tempMax; index++) {
569 tempB[index] = bPWM[index - 4 * slide];
570 tempC[index] = cPWM[index - 4 * slide];
572 tempMax = indexMax + 4 * maxSlide;
573 for (; index < tempMax; index++) {
580 for (index = 0; index < indexMax; index++) {
581 tempA[index] = aPWM[index];
583 for (index = 0; index < bIndexMax; index++) {
584 tempB[index] = bPWM[index];
585 tempC[index] = cPWM[index];
587 for (; index < indexMax; index++) {
595 for (index = 0; index <tempSize; index++) {
596 fscore += pearson_func(tempA, tempB, 4 * index);
597 rscore += pearson_func(tempA, tempC, 4 * index);
598 /* fprintf(stdout,"A: %f %f %f %f\n", tempA[4 * index], tempA[4 * index + 1], tempA[4 * index + 2], tempA[4 * index + 3]); */
599 /* fprintf(stdout,"B: %f %f %f %f\n", tempB[4 * index], tempB[4 * index + 1], tempB[4 * index + 2], tempB[4 * index + 3]); */
601 fscore = fscore / tempSize;
602 rscore = rscore / tempSize;
603 /* fprintf(stdout,"slide %d %f %f\n", slide, fscore, rscore); */
604 if ((fscore < rscore) && (rscore > bestScore)) {
606 } else if (fscore > bestScore) {
619 return PyFloat_FromDouble(bestScore);
622 static char the_func_doc[] =
623 "returns a list of positions on aSeq that match the PWM within a Threshold, given as a percentage of the optimal consensus score.";
625 static char dpwm_func_doc[] =
626 "returns a list of positions on aSeq that match the DPWM within a given Fold of the optimal consensus score.";
628 static char mer_func_doc[] =
629 "returns a list of positions on aSeq that match an N-mer within M mismatches. Assumes Mers and Seq are in the same case.";
631 static char corr_func_doc[] =
632 "returns a pearson-correlation coefficient based similarity value between -1 (anti-correlated) and +1 (identical) for two motifs.";
634 static PyMethodDef module_methods[] = {
635 {"locateMotif", the_func, METH_VARARGS, the_func_doc},
636 {"locateMarkov1", dpwm_func, METH_VARARGS, dpwm_func_doc},
637 {"locateMer", mer_func, METH_VARARGS, mer_func_doc},
638 {"correlateMotifs", corr_func, METH_VARARGS, corr_func_doc},
645 Py_InitModule3("_motif", module_methods, module_doc);