From 6524cd9b9ceabf1ec539bdf8b387227b12cf1fa2 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 16 Sep 2005 22:45:17 +0000 Subject: [PATCH] [project @ 4] mussa_pI.3 --- Makefile | 13 +- annot_test.cc | 23 +++ examples/mck3test.mupa | 30 ++++ examples/mck3test_example.museq | 20 +++ examples/mck3test_example.muway | 41 +++++ examples/seq/human_mck_pro.fa | 43 +++++ examples/seq/mouse_mck_pro.fa | 49 ++++++ examples/seq/rabbit_mck_pro.fa | 13 ++ mussa.cc | 35 ++-- mussa_class.cc | 274 ++++++++++++++++++++++++++------ mussa_class.hh | 15 +- mussa_gui_conn.cc | 222 +++++++++++++++++++++++--- mussa_gui_conn.hh | 6 +- mussa_gui_seq.cc | 148 ++++++++++++++--- mussa_gui_seq.hh | 9 +- mussa_nway.cc | 174 ++++++++++++++++++-- mussa_nway.hh | 12 +- mussa_test.cc | 94 +++++++++++ seqcomp.cc | 2 +- sequence.cc | 265 +++++++++++++++++++++++++++++- sequence.hh | 21 ++- 21 files changed, 1354 insertions(+), 155 deletions(-) create mode 100644 annot_test.cc create mode 100644 examples/mck3test.mupa create mode 100644 examples/mck3test_example.museq create mode 100644 examples/mck3test_example.muway create mode 100644 examples/seq/human_mck_pro.fa create mode 100644 examples/seq/mouse_mck_pro.fa create mode 100644 examples/seq/rabbit_mck_pro.fa create mode 100644 mussa_test.cc diff --git a/Makefile b/Makefile index 580077a..7edccd2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,13 @@ #CFLAGS=-ftemplate-depth-20 CFLAGS=-O -Winline -ftemplate-depth-20 +# -ggdb LDFLAGS= -lfltk +# this is what's needed on titus's redhat machine +# tho he may have things setup oddly to accomodate his development setup +#LDFLAGS= -lfltk -L/usr/X11R6/lib -lX11 + + #all: seqcomp sequence.o : sequence.cc sequence.hh @@ -29,7 +35,12 @@ mussa_class.o : mussa_class.cc mussa_class.hh g++ $(CFLAGS) -c mussa_class.cc mussa : sequence.o flp.o flp_seqcomp.o mussa_nway.o mussa_gui_seq.o mussa_gui_conn.o mussa_class.o mussa.cc - g++ $(CFLAGS) $(LDFLAGS) -o mussa mussa.cc mussa_class.o mussa_gui_conn.o mussa_gui_seq.o mussa_nway.o flp_seqcomp.o flp.o sequence.o + g++ $(CFLAGS) -o mussa mussa.cc mussa_class.o mussa_gui_conn.o mussa_gui_seq.o mussa_nway.o flp_seqcomp.o flp.o sequence.o $(LDFLAGS) + + + +annot_test : sequence.o annot_test.cc + g++ $(CFLAGS) -o annot_test annot_test.cc sequence.o #match_list_type.o : modules/match_list_type.c modules/match_list_type.h \ diff --git a/annot_test.cc b/annot_test.cc new file mode 100644 index 0000000..1df5a86 --- /dev/null +++ b/annot_test.cc @@ -0,0 +1,23 @@ +#include "sequence.hh" + +int main(int argc, char **argv) +{ + Sequence seqA; + char * seqA_file, * seqA_annot; + cout << "fee fie foe fum" << endl; + + + seqA_file = * ++argv; + seqA_annot = * ++argv; + + seqA.load_fasta(seqA_file, 1); + seqA.load_annot(seqA_annot); + //seqA_len = seqA.len(); +} + +/* + cout << "fee\n"; + cout << "fie\n"; + cout << "foe "; + cout << "fum\n"; +*/ diff --git a/examples/mck3test.mupa b/examples/mck3test.mupa new file mode 100644 index 0000000..8ce6a13 --- /dev/null +++ b/examples/mck3test.mupa @@ -0,0 +1,30 @@ +# name of anaylsis directory and stem for associated files +ANA_NAME mck3test + +# if APPEND vars true, a _wXX and/or _tYY added to analysis name +# where XX = WINDOW and YY = THRESHOLD +# Highly recommeded with use of command line override of WINDOW or THRESHOLD +APPEND_WIN true +APPEND_THRES true + +# how many sequences are being analyzed +SEQUENCE_NUM 3 + +# first sequence info +SEQUENCE seq/mouse_mck_pro.fa +# ANNOTATION test.annot +SEQ_START 600 + +# the second sequence info +SEQUENCE seq/human_mck_pro.fa +# ANNOTATION test.annot +SEQ_START 600 +# SEQ_END 2400 + +# third sequence info +SEQUENCE seq/rabbit_mck_pro.fa +# ANNOTATION test.annot + +# analyses parameters: command line args -w -t will override these +WINDOW 30 +THRESHOLD 20 diff --git a/examples/mck3test_example.museq b/examples/mck3test_example.museq new file mode 100644 index 0000000..6184d20 --- /dev/null +++ b/examples/mck3test_example.museq @@ -0,0 +1,20 @@ + + +CCATCCTGGTCTATAGAGAGAGTTCCAGAACAGCCAGGGCTACAGATAAACCCATCTGGAAAAACAAAGTTGAATGACCCAAGAGGGGTTCTCAGAGGGTGGCGTGTGCTCCCTGGCAAGCCTATGACATGGCCGGGGCCTGCCTCTCTCTGCCTCTGACCCTCAGTGGCTCCCATGAACTCCTTGCCCAATGGCATCTTTTTCCTGCGCTCCTTGGGTTATTCCAGTCTCCCCTCAGCATTCCTTCCTCAGGGCCTCGCTCTTCTCTCTGCTCCCTCCTTGCACAGCTGGCTCTGTCCACCTCAGATGTCACAGTGCTCTCTCAGAGGAGGAAGGCACCATGTACCCTCTGTTTCCCAGGTAAGGGTTCAATTTTTAAAAATGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTCAAGACAGGGCTCCTCTGTGTAGTCCTAACTGTCTTGAAACTCCCTCTGTAGACCAGGTCGACCTCGAACTCTTGAAACCTGCCACGGACCACCCAGTCAGGTATGGAGGTCCCTGGAATGAGCGTCCTCGAAGCTAGGTGGGTAAGGGTTCGGCGGTGACAAACAGAAACAAACACAGAGGCAGTTTGAATCTGAGTGTATTTTGCAGCTCTCAAGCAGGGGATTTTATACATAAAAAAAAAAAAAAAAAAAAAACCAAACATTACATCTCTTAGAAACTATATCCAATGAAACAATCACAGATACCAACCAAAACCATTGGGCAGAGTAAAGCACAAAAATCATCCAAGCATTACAACTCTGAAACCATGTATTCAGTGAATCACAAACAGAACAGGTAACATCATTATTAATATAAATCACCAAAATATAACAATTCTAAAAGGATGTATCCAGTGGGGGCTGTCGTCCAAGGCTAGTGGCAGATTTCCAGGAGCAGGTTAGTAAATCTTAACCACTGAACTAACTCTCCAGCCCCATGGTCAATTATTATTTAGCATCTAGTGCCTAATTTTTTTTTATAAATCTTCACTATGTAATTTAAAACTATTTTAATTCTTCCTAATTAAGGCTTTCTTTACCATATACCAAAATTCACCTCCAATGACACACGCGTAGCCATATGAAATTTTATTGTTGGGAAAATTTGTACCTATCATAATAGTTTTGTAAATGATTTAAAAAGCAAAGTGTTAGCCGGGCGTGGTGGCACACGCCTTTAATCCCTGCACTCGGGAGGCAGGGGCAGGAGGATTTCTGAGTTTGAGGCCAGCCTGGTCTACAGAGTGAGTTCCAGGACAGCCAGGGCTACACAGAGAAACCCTGTCTCGAACCCCCCACCCCCCAAAAAAAGCAAAGTGTTGGTTTCCTTGGGGATAAAGTCATGTTAGTGGCCCATCTCTAGGCCCATCTCACCCATTATTCTCGCTTAAGATCTTGGCCTAGGCTACCAGGAACATGTAAATAAGAAAAGGAATAAGAGAAAACAAAACAGAGAGATTGCCATGAGAACTACGGCTCAATATTTTTTCTCTCCGGCGAAGAGTTCCACAACCATCTCCAGGAGGCCTCCACGTTTTGAGGTCAATGGCCTCAGTCTGTGGAACTTGTCACACAGATCTTACTGGAGGTGGTGTGGCAGAAACCCATTCCTTTTAGTGTCTTGGGCTAAAAGTAAAAGGCCCAGAGGAGGCCTTTGCTCATCTGACCATGCTGACAAGGAACACGGGTGCCAGGACAGAGGCTGGACCCCAGGAACACCTTAAACACTTCTTCCCTTCTCCGCCCCCTAGAGCAGGCTCCCCTCACCAGCCTGGGCAGAAATGGGGGAAGATGGAGTGAAGCCATACTGGCTACTCCAGAATCAACAGAGGGAGCCGGGGGCAATACTGGAGAAGCTGGTCTCCCCCCAGGGGCAATCCTGGCACCTCCCAGGCAGAAGAGGAAACTTCCACAGTGCATCTCACTTCCATGAATCCCCTCCTCGGACTCTGAGGTCCTTGGTCACAGCTGAGGTGCAAAAGGCTCCTGTCATATTGTGTCCTGCTCTGGTCTGCCTTCCACAGCTTGGGGGCCACCTAGCCCACCTCTCCCTAGGGATGAGAGCAGCCACTACGGGTCTAGGCTGCCCATGTAAGGAGGCAAGGCCTGGGGACACCCGAGATGCCTGGTTATAATTAACCCAGACATGTGGCTGCCCCCCCCCCCCCAACACCTGCTGCCTGAGCCTCACCCCCACCCCGGTGCCTGGGTCTTAGGCTCTGTACACCATGGAGGAGAAGCTCGCTCTAAAAATAACCCTGTCCCTGGTGGATCCAGGGTGAGGGGCAGGCTGAGGGCGGCCACTTCCCTCAGCCGCAGGTTTGTTTTCCCAAGAATGGTTTTTCTGCTTCTGTAGCTTTTCCTGTCAATTCTGCCATGGTGGAGCAGCCTGCACTGGGCTTCTGGGAGAAACCAAACCGGGTTCTAACCTTTCAGCTACAGTTATTGCCTTTCCTGTAGATGGGCGACTACAGCCCCACCCCCACCCCCGTCTCCTGTATCCTTCCTGGGCCTGGGGATCCTAGGCTTTCACTGGAAATTTCCCCCCAGGTGCTGTAGGCTAGAGTCACGGCTCCCAAGAACAGTGCTTGCCTGGCATGCATGGTTCTGAACCTCCAACTGCAAAAAATGACACATACCTTGACCCTTGGAAGGCTGAGGCAGGGGGATTGCCATGAGTGCAAAGCCAGACTGGGTGGCATAGTTAGACCCTGTCTCAAAAAACCAAAAACAATTAAATAACTAAAGTCAGGCAAGTAATCCTACTCGGGAGACTGAGGCAGAGGGATTGTTACATGTCTGAGGCCAGCCTGGACTACATAGGGTTTCAGGCTAGCCCTGTCTACAGAGTAAGGCCCTATTTCAAAAACACAAACAAAATGGTTCTCCCAGCTGCTAATGCTCACCAGGCAATGAAGCCTGGTGAGCATTAGCAATGAAGGCAATGAAGGAGGGTGCTGGCTACAATCAAGGCTGTGGGGGACTGAGGGCAGGCTGTAACAGGCTTGGGGGCCAGGGCTTATACGTGCCTGGGACTCCCAAAGTATTACTGTTCCATGTTCCCGGCGAAGGGCCAGCTGTCCCCCGCCAGCTAGACTCAGCACTTAGTTTAGGAACCAGTGAGCAAGTCAGCCCTTGGGGCAGCCCATACAAGGCCATGGGGCTGGGCAAGCTGCACGCCTGGGTCCGGGGTGGGCACGGTGCCCGGGCAACGAGCTGAAAGCTCATCTGCTCTCAGGGGCCCCTCCCTGGGGACAGCCCCTCCTGGCTAGTCACACCCTGTAGGCTCCTCTATATAACCCAGGGGCACAGGGGCTGCCCCCGGGTCAC + + +300 600 Glorp Glorptype + + +GGATCCTTCCTCCTTGGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCACTGCACCTGGCCTATTACCCTTCTCAGGCTCTGGAGTCCATCCTTCTGCTCTGTCTCCCTCAGTTCAATTGTTTTTTGTTTTTTGTTTTTTTTTTAGACACAGTCTCGCTCTGTCACCAAGGCTGGAGTGCAGCAGTGCGATCACAGCTCACCGCAGCCTCACCTCCCAGGCTCAAGTGATCCTCCCATCTCGGCCTCTGAGTAGCTGAGACTATAGGTGTGTCCACATGTCCGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCGCGTTGGCCAGGGTGGTCTTGAACTCCTGAGCTCAAGCAATCCTCCTGCCTCAGCCTCCTTGTTTTGATTTTTAGATCCCACAAATAACTTGTGATGTTTGTCTTTCTATACCTGGTTCATTTAACATTTTCTTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTGTGAGACTGAGTCTTGCTCTGTCACTCAGGCTGGAGGGCAATGGTGCATCTCAGCTCACTGCAACCTCCACCTCCTAGGTTCAAGCAATTCTTATGCCTCAGCCTCCTGGCTAGCTGGGATTACAGGCGTGTGTCACCATGCCAGGCTAATTTTTGTACTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGGCCTCAAGTGATCCACCCGCCTCCGCCTCTGCCTCCCAAAGTGCTGGGATTACGGGCCTGAGCCACTGTGCCCGGCCCATCTAACATTTTCACTGTCAATCACAATGGGATTAAAACTCCTCCCACAGCCCCTAGGGACCATGGGTCTGCTCCTGTCTCCCCTCCAACCTCATCTTCTTCCTCCCACTCTCTCCTTGGCCCCATCTGCTCCAGTCCCCTGGCCTCCTTCCTGTCTGTCCTCAGATGTGCCCAGCCATTCTCACCTCAGCGCCTTTGCACCTGCTGTTCCCCCCAGAGCCGCACATGGCTGGCTCCCTGTTCTCCTTCAGGTCTCTGCTCAGATGTCATCTTCCCAAAGAGGCCTGCCTCGACCTCCCCTGCTGCTGTGCCGTCCCCTCATCTGTGACCCTCTTGCACTATCACCTCCAGGACGGCGGGGGTTTTGTGTTTTGTTGTAGCCTCAGGAAGTGCCTGATAGATCCCTGTTTCGAGACCAGTTCCATTTGGTTTTCTGGGCCTCAGTTTCCGTAACCGTGAAGGAGACCCTCGGCAATCTGAGCTTGCTGGGAAAGGGCTGGGCCCCATGTAAATATTTCTAAAGCACCCCTCTCCCCTCCCCCCTCAGATCAGGAGTCTGAGGGAGAGGCACAGAGGCTCCCTTTCTCTAAGCCAGTCCTCACCTGCCTAAGAAGATGTGAAGGAGACCCAGGAGACCCTGGGATAGGGAGGAACTCAGAGGGAAGGGACATTCTTTTCTTCGTCGCAATCCTGGGAGCTCCCTGGAGGAGGAGACCCGATCAGCCTGCAATCCTGGCGCGTCCCAGGAGGAGAAAGCGGCTTCCTCTATACTGTACTCTCCTCCACAGAACCCCCCTCTCAGCCCTGGAAGTCCTTGCTCACAGCCGAGGCGCCGAGAGCGCTTGCTCTGCCCAGATCTCGGCGAGTCTGCGCCCGCGCTCTGAACGGCGTCGCTGCCCAGCCCCCTTCCCCGGGAGGTGGGAGCGGCCACCCAGGGCCCCGTGGCTGCCCTTGTAAGGAGGCGAGGCCGAGGACACCCGAGACGCCCGGTTATAATTAACCAGGACACGTGGCGAACCCCCCTCCAACACCTGCCCCCGAACCCCCCCATACCCAGCGCCTCGGGTCTCGGCCTTTGCGGCAGAGGAGACAGCAAAGCGCCCTCTAAAAATAACTCCTTTCCCGGCGACCGAGACCCTCCCTGTCCCCGCACAGCGAAATCTCCCAGTGGCACCGAGGGGGCGAGGGTTAAGTGGGGGGGAGGGTGACCACCGCCTCCCACCCTTGCCCTGAGTTTGAATCTCTCCAACTCAGCCAGCCTCAGTTTCCCCTCCACTCAGTCCCTAGGAGGAAGGGGCGCCCAAGCGGGTTTCTGGGGTTAGACTGCCCTCCATTGCAATTGGTCCTTCTCCCGGCCTCTGCTTCCTCCAGCTCACAGGGTATCTGCTCCTCCTGGAGCCACACCTTGGTTCCCCGAGGTGCCGCTGGGACTCGGGTAGGGGTGAGGGCCCAGGGGCGACAGGGGGAGCCGAGGGCCACAGGAAGGGCTGGTGGCTGAAGGAGACTCAGGGGCCAGGGGACGGTGGCTTCTACGTGCTTGGGACGTTCCCAGCCACCGTCCCATGTTCCCGGCGGGGGCCAGCTGTCCCCACCGCCAGCCCAACTCAGCACTTGGTTAGGGTATCAGCTTGGTGGGGGCGTGAGCCCAGCCCTGGGGCGCTCAGCCCATACAAGGCCATGGGGCTGGGCGCAAAGCATGCCTGGGTTCAGGGTGGGTATGGTGCCGGAGCAGGGAGGTGAGAGGCTCAGCTGCCCTCCAGAACTCCTCCCTGGGGACAACCCCTCCCAGCCAATAGCACAGCCTAGGTCCCCCTATATAAGGCCACGGCTGCTGGCCCTTCCTTTGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTCAGCCCAGCCCAGCCAGGTACTGCACGGGGCGGGAATCTGGGTGGGGGCCAGAGTAGGGGATTTCTGTGGGTGCTAGAGGCTTGGCTTGGGAAAGGGTCTGTGTGTCACCCCTTGCTCCACCAACATCCTCCTATACAAAGGCAGGTCGGTGCGTGGGAAGGTTGACCCTTGTGTGTCTGGGAGGCCCCTCCATCTGTGAGGCTGCCTGAACCCCCACTGGGACCTGTGATTTCTGCGGCACAG + + +300 600 Glorp Glorptype + + +CTCCCTGAGGGGAGTGCCCCGCTTAGCCCTCAGCCCTGGCACCCACCAGGCAGAGGAGGCAGCTCCCAGAGTCCCCGGCTGAGGGGGCCTGCCCATGTCCTCGCGAGTCGGGCACCCAGTCCCCTTCCCTGGGGGCCTCGGGTGCCTTGTAAGGAGGCAAGGCCCAGGACACCCGAGCTGCCGGCTATAATTAGCCTGGACACGTGGGCCCCCCGCCAGCACCTGCCCTGAGCCCCCCTACCCGCAGCCTTGGGTGCTGGGACTGCCGACAGCAGAGACCGCTCTAAAAATAGCTCCCTTCCTGGACGGCCCCTCCCAGCAGGCCCCCCACCCCCCGCTGAGGCTCACTGCTGGTGTCCCTGCCTGGTGCTCCCTAGGGTGTGCACCCACGGCAGGTGCAGAGGCAGGGGGCACCGCGGGGGGCTGGTAACAGCAGCAGGCCTAGGGGCCAGGGGACGGCGAGCTGTACGTGCCAGGACACCCCCAGCCACTGTCCCGTGCTCCCGGCGGGGGCCAGCTGTCCCCCGCCAGCCCGACTCAGCACTTGCTCTGCACATCGGCCTGGGGCGGCCCATACAAGGGCATGGGGCTGGGCACAAGGCACGCCCGCGTCCAGGTGGGCACTGTGCCCGGGCATGGTGGGGGGAGAGGCTGGGCTGCCTTCCAGGCCCCCTCCCTGGGGCCAGCCCCGCCCAGCCAGTGGCACAGCAGGGTCCCCTACATAAAGCCGGGGCCGCGGGCCCTGGATCTGCCTCAGTGTCGCCTGCAGGACACACGTAGCCCCGCTCAGCACAGCCCAGCCAG + + +300 600 Glorp Glorptype + + diff --git a/examples/mck3test_example.muway b/examples/mck3test_example.muway new file mode 100644 index 0000000..ed3f1d6 --- /dev/null +++ b/examples/mck3test_example.muway @@ -0,0 +1,41 @@ + +31:1895,1497,24 +30:1899,1459,28 +30:1901,1503,30 +30:1901,1461,30 +34:1904,1506,33 +47:2100,1711,130 +30:2123,1733,152 +34:2125,1735,154 +30:2131,1741,160 +30:2133,1743,162 +44:2148,1758,176 +30:2186,1793,208 +40:2261,1868,271 +31:3015,2264,424 +34:3071,2326,485 +31:3085,2339,498 +30:3086,-2346,499 +30:3087,2341,500 +30:3088,-2344,-503 +30:3088,-2344,501 +30:3089,-2343,-502 +30:3089,-2343,502 +30:3090,-2342,-501 +33:3090,-2342,503 +30:3094,-2338,-497 +30:3094,-2338,507 +30:3095,-2337,-496 +30:3095,-2337,508 +30:3096,-2336,-495 +30:3096,-2336,509 +30:3097,-2335,-494 +44:3101,2357,514 +30:3157,2426,556 +30:3158,2542,673 +30:3158,2427,557 +30:3159,2543,674 +64:3159,2428,558 +41:3210,2479,608 +62:3246,2516,647 + diff --git a/examples/seq/human_mck_pro.fa b/examples/seq/human_mck_pro.fa new file mode 100644 index 0000000..0e31f75 --- /dev/null +++ b/examples/seq/human_mck_pro.fa @@ -0,0 +1,43 @@ +>gi|180579|gb|M21487.1|HUMCKMM1 Human muscle creatine kinase gene (CKMM), 5' flank +GGATCCTTCCTCCTTGGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCACTGCACCTGGCCTATTACC +CTTCTCAGGCTCTGGAGTCCATCCTTCTGCTCTGTCTCCCTCAGTTCAATTGTTTTTTGTTTTTTGTTTT +TTTTTTAGACACAGTCTCGCTCTGTCACCAAGGCTGGAGTGCAGCAGTGCGATCACAGCTCACCGCAGCC +TCACCTCCCAGGCTCAAGTGATCCTCCCATCTCGGCCTCTGAGTAGCTGAGACTATAGGTGTGTCCACAT +GTCCGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCGCGTTGGCCAGGGTGGTCTTGAACTCC +TGAGCTCAAGCAATCCTCCTGCCTCAGCCTCCTTGTTTTGATTTTTAGATCCCACAAATAACTTGTGATG +TTTGTCTTTCTATACCTGGTTCATTTAACATTTTCTTTTTCTTTTCTTTTCTTTTTTTTTTTTTTTGTGA +GACTGAGTCTTGCTCTGTCACTCAGGCTGGAGGGCAATGGTGCATCTCAGCTCACTGCAACCTCCACCTC +CTAGGTTCAAGCAATTCTTATGCCTCAGCCTCCTGGCTAGCTGGGATTACAGGCGTGTGTCACCATGCCA +GGCTAATTTTTGTACTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGGC +CTCAAGTGATCCACCCGCCTCCGCCTCTGCCTCCCAAAGTGCTGGGATTACGGGCCTGAGCCACTGTGCC +CGGCCCATCTAACATTTTCACTGTCAATCACAATGGGATTAAAACTCCTCCCACAGCCCCTAGGGACCAT +GGGTCTGCTCCTGTCTCCCCTCCAACCTCATCTTCTTCCTCCCACTCTCTCCTTGGCCCCATCTGCTCCA +GTCCCCTGGCCTCCTTCCTGTCTGTCCTCAGATGTGCCCAGCCATTCTCACCTCAGCGCCTTTGCACCTG +CTGTTCCCCCCAGAGCCGCACATGGCTGGCTCCCTGTTCTCCTTCAGGTCTCTGCTCAGATGTCATCTTC +CCAAAGAGGCCTGCCTCGACCTCCCCTGCTGCTGTGCCGTCCCCTCATCTGTGACCCTCTTGCACTATCA +CCTCCAGGACGGCGGGGGTTTTGTGTTTTGTTGTAGCCTCAGGAAGTGCCTGATAGATCCCTGTTTCGAG +ACCAGTTCCATTTGGTTTTCTGGGCCTCAGTTTCCGTAACCGTGAAGGAGACCCTCGGCAATCTGAGCTT +GCTGGGAAAGGGCTGGGCCCCATGTAAATATTTCTAAAGCACCCCTCTCCCCTCCCCCCTCAGATCAGGA +GTCTGAGGGAGAGGCACAGAGGCTCCCTTTCTCTAAGCCAGTCCTCACCTGCCTAAGAAGATGTGAAGGA +GACCCAGGAGACCCTGGGATAGGGAGGAACTCAGAGGGAAGGGACATTCTTTTCTTCGTCGCAATCCTGG +GAGCTCCCTGGAGGAGGAGACCCGATCAGCCTGCAATCCTGGCGCGTCCCAGGAGGAGAAAGCGGCTTCC +TCTATACTGTACTCTCCTCCACAGAACCCCCCTCTCAGCCCTGGAAGTCCTTGCTCACAGCCGAGGCGCC +GAGAGCGCTTGCTCTGCCCAGATCTCGGCGAGTCTGCGCCCGCGCTCTGAACGGCGTCGCTGCCCAGCCC +CCTTCCCCGGGAGGTGGGAGCGGCCACCCAGGGCCCCGTGGCTGCCCTTGTAAGGAGGCGAGGCCGAGGA +CACCCGAGACGCCCGGTTATAATTAACCAGGACACGTGGCGAACCCCCCTCCAACACCTGCCCCCGAACC +CCCCCATACCCAGCGCCTCGGGTCTCGGCCTTTGCGGCAGAGGAGACAGCAAAGCGCCCTCTAAAAATAA +CTCCTTTCCCGGCGACCGAGACCCTCCCTGTCCCCGCACAGCGAAATCTCCCAGTGGCACCGAGGGGGCG +AGGGTTAAGTGGGGGGGAGGGTGACCACCGCCTCCCACCCTTGCCCTGAGTTTGAATCTCTCCAACTCAG +CCAGCCTCAGTTTCCCCTCCACTCAGTCCCTAGGAGGAAGGGGCGCCCAAGCGGGTTTCTGGGGTTAGAC +TGCCCTCCATTGCAATTGGTCCTTCTCCCGGCCTCTGCTTCCTCCAGCTCACAGGGTATCTGCTCCTCCT +GGAGCCACACCTTGGTTCCCCGAGGTGCCGCTGGGACTCGGGTAGGGGTGAGGGCCCAGGGGCGACAGGG +GGAGCCGAGGGCCACAGGAAGGGCTGGTGGCTGAAGGAGACTCAGGGGCCAGGGGACGGTGGCTTCTACG +TGCTTGGGACGTTCCCAGCCACCGTCCCATGTTCCCGGCGGGGGCCAGCTGTCCCCACCGCCAGCCCAAC +TCAGCACTTGGTTAGGGTATCAGCTTGGTGGGGGCGTGAGCCCAGCCCTGGGGCGCTCAGCCCATACAAG +GCCATGGGGCTGGGCGCAAAGCATGCCTGGGTTCAGGGTGGGTATGGTGCCGGAGCAGGGAGGTGAGAGG +CTCAGCTGCCCTCCAGAACTCCTCCCTGGGGACAACCCCTCCCAGCCAATAGCACAGCCTAGGTCCCCCT +ATATAAGGCCACGGCTGCTGGCCCTTCCTTTGGGTCAGTGTCACCTCCAGGATACAGACAGCCCCCCTTC +AGCCCAGCCCAGCCAGGTACTGCACGGGGCGGGAATCTGGGTGGGGGCCAGAGTAGGGGATTTCTGTGGG +TGCTAGAGGCTTGGCTTGGGAAAGGGTCTGTGTGTCACCCCTTGCTCCACCAACATCCTCCTATACAAAG +GCAGGTCGGTGCGTGGGAAGGTTGACCCTTGTGTGTCTGGGAGGCCCCTCCATCTGTGAGGCTGCCTGAA +CCCCCACTGGGACCTGTGATTTCTGCGGCACAG \ No newline at end of file diff --git a/examples/seq/mouse_mck_pro.fa b/examples/seq/mouse_mck_pro.fa new file mode 100644 index 0000000..46c2a6a --- /dev/null +++ b/examples/seq/mouse_mck_pro.fa @@ -0,0 +1,49 @@ +>gi|10129974|gb|AF188002.1|AF188002 Mus musculus muscle creatine kinase (Mck) gene, promoter region +CCATCCTGGTCTATAGAGAGAGTTCCAGAACAGCCAGGGCTACAGATAAACCCATCTGGAAAAACAAAGT +TGAATGACCCAAGAGGGGTTCTCAGAGGGTGGCGTGTGCTCCCTGGCAAGCCTATGACATGGCCGGGGCC +TGCCTCTCTCTGCCTCTGACCCTCAGTGGCTCCCATGAACTCCTTGCCCAATGGCATCTTTTTCCTGCGC +TCCTTGGGTTATTCCAGTCTCCCCTCAGCATTCCTTCCTCAGGGCCTCGCTCTTCTCTCTGCTCCCTCCT +TGCACAGCTGGCTCTGTCCACCTCAGATGTCACAGTGCTCTCTCAGAGGAGGAAGGCACCATGTACCCTC +TGTTTCCCAGGTAAGGGTTCAATTTTTAAAAATGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTG +TTTTTCAAGACAGGGCTCCTCTGTGTAGTCCTAACTGTCTTGAAACTCCCTCTGTAGACCAGGTCGACCT +CGAACTCTTGAAACCTGCCACGGACCACCCAGTCAGGTATGGAGGTCCCTGGAATGAGCGTCCTCGAAGC +TAGGTGGGTAAGGGTTCGGCGGTGACAAACAGAAACAAACACAGAGGCAGTTTGAATCTGAGTGTATTTT +GCAGCTCTCAAGCAGGGGATTTTATACATAAAAAAAAAAAAAAAAAAAAAACCAAACATTACATCTCTTA +GAAACTATATCCAATGAAACAATCACAGATACCAACCAAAACCATTGGGCAGAGTAAAGCACAAAAATCA +TCCAAGCATTACAACTCTGAAACCATGTATTCAGTGAATCACAAACAGAACAGGTAACATCATTATTAAT +ATAAATCACCAAAATATAACAATTCTAAAAGGATGTATCCAGTGGGGGCTGTCGTCCAAGGCTAGTGGCA +GATTTCCAGGAGCAGGTTAGTAAATCTTAACCACTGAACTAACTCTCCAGCCCCATGGTCAATTATTATT +TAGCATCTAGTGCCTAATTTTTTTTTATAAATCTTCACTATGTAATTTAAAACTATTTTAATTCTTCCTA +ATTAAGGCTTTCTTTACCATATACCAAAATTCACCTCCAATGACACACGCGTAGCCATATGAAATTTTAT +TGTTGGGAAAATTTGTACCTATCATAATAGTTTTGTAAATGATTTAAAAAGCAAAGTGTTAGCCGGGCGT +GGTGGCACACGCCTTTAATCCCTGCACTCGGGAGGCAGGGGCAGGAGGATTTCTGAGTTTGAGGCCAGCC +TGGTCTACAGAGTGAGTTCCAGGACAGCCAGGGCTACACAGAGAAACCCTGTCTCGAACCCCCCACCCCC +CAAAAAAAGCAAAGTGTTGGTTTCCTTGGGGATAAAGTCATGTTAGTGGCCCATCTCTAGGCCCATCTCA +CCCATTATTCTCGCTTAAGATCTTGGCCTAGGCTACCAGGAACATGTAAATAAGAAAAGGAATAAGAGAA +AACAAAACAGAGAGATTGCCATGAGAACTACGGCTCAATATTTTTTCTCTCCGGCGAAGAGTTCCACAAC +CATCTCCAGGAGGCCTCCACGTTTTGAGGTCAATGGCCTCAGTCTGTGGAACTTGTCACACAGATCTTAC +TGGAGGTGGTGTGGCAGAAACCCATTCCTTTTAGTGTCTTGGGCTAAAAGTAAAAGGCCCAGAGGAGGCC +TTTGCTCATCTGACCATGCTGACAAGGAACACGGGTGCCAGGACAGAGGCTGGACCCCAGGAACACCTTA +AACACTTCTTCCCTTCTCCGCCCCCTAGAGCAGGCTCCCCTCACCAGCCTGGGCAGAAATGGGGGAAGAT +GGAGTGAAGCCATACTGGCTACTCCAGAATCAACAGAGGGAGCCGGGGGCAATACTGGAGAAGCTGGTCT +CCCCCCAGGGGCAATCCTGGCACCTCCCAGGCAGAAGAGGAAACTTCCACAGTGCATCTCACTTCCATGA +ATCCCCTCCTCGGACTCTGAGGTCCTTGGTCACAGCTGAGGTGCAAAAGGCTCCTGTCATATTGTGTCCT +GCTCTGGTCTGCCTTCCACAGCTTGGGGGCCACCTAGCCCACCTCTCCCTAGGGATGAGAGCAGCCACTA +CGGGTCTAGGCTGCCCATGTAAGGAGGCAAGGCCTGGGGACACCCGAGATGCCTGGTTATAATTAACCCA +GACATGTGGCTGCCCCCCCCCCCCCAACACCTGCTGCCTGAGCCTCACCCCCACCCCGGTGCCTGGGTCT +TAGGCTCTGTACACCATGGAGGAGAAGCTCGCTCTAAAAATAACCCTGTCCCTGGTGGATCCAGGGTGAG +GGGCAGGCTGAGGGCGGCCACTTCCCTCAGCCGCAGGTTTGTTTTCCCAAGAATGGTTTTTCTGCTTCTG +TAGCTTTTCCTGTCAATTCTGCCATGGTGGAGCAGCCTGCACTGGGCTTCTGGGAGAAACCAAACCGGGT +TCTAACCTTTCAGCTACAGTTATTGCCTTTCCTGTAGATGGGCGACTACAGCCCCACCCCCACCCCCGTC +TCCTGTATCCTTCCTGGGCCTGGGGATCCTAGGCTTTCACTGGAAATTTCCCCCCAGGTGCTGTAGGCTA +GAGTCACGGCTCCCAAGAACAGTGCTTGCCTGGCATGCATGGTTCTGAACCTCCAACTGCAAAAAATGAC +ACATACCTTGACCCTTGGAAGGCTGAGGCAGGGGGATTGCCATGAGTGCAAAGCCAGACTGGGTGGCATA +GTTAGACCCTGTCTCAAAAAACCAAAAACAATTAAATAACTAAAGTCAGGCAAGTAATCCTACTCGGGAG +ACTGAGGCAGAGGGATTGTTACATGTCTGAGGCCAGCCTGGACTACATAGGGTTTCAGGCTAGCCCTGTC +TACAGAGTAAGGCCCTATTTCAAAAACACAAACAAAATGGTTCTCCCAGCTGCTAATGCTCACCAGGCAA +TGAAGCCTGGTGAGCATTAGCAATGAAGGCAATGAAGGAGGGTGCTGGCTACAATCAAGGCTGTGGGGGA +CTGAGGGCAGGCTGTAACAGGCTTGGGGGCCAGGGCTTATACGTGCCTGGGACTCCCAAAGTATTACTGT +TCCATGTTCCCGGCGAAGGGCCAGCTGTCCCCCGCCAGCTAGACTCAGCACTTAGTTTAGGAACCAGTGA +GCAAGTCAGCCCTTGGGGCAGCCCATACAAGGCCATGGGGCTGGGCAAGCTGCACGCCTGGGTCCGGGGT +GGGCACGGTGCCCGGGCAACGAGCTGAAAGCTCATCTGCTCTCAGGGGCCCCTCCCTGGGGACAGCCCCT +CCTGGCTAGTCACACCCTGTAGGCTCCTCTATATAACCCAGGGGCACAGGGGCTGCCCCCGGGTCAC \ No newline at end of file diff --git a/examples/seq/rabbit_mck_pro.fa b/examples/seq/rabbit_mck_pro.fa new file mode 100644 index 0000000..171eda6 --- /dev/null +++ b/examples/seq/rabbit_mck_pro.fa @@ -0,0 +1,13 @@ +>gi|1621|emb|X55146.1|OCMCK1 Rabbit gene for muscle creatine kinase, exon 1 +CTCCCTGAGGGGAGTGCCCCGCTTAGCCCTCAGCCCTGGCACCCACCAGGCAGAGGAGGCAGCTCCCAGA +GTCCCCGGCTGAGGGGGCCTGCCCATGTCCTCGCGAGTCGGGCACCCAGTCCCCTTCCCTGGGGGCCTCG +GGTGCCTTGTAAGGAGGCAAGGCCCAGGACACCCGAGCTGCCGGCTATAATTAGCCTGGACACGTGGGCC +CCCCGCCAGCACCTGCCCTGAGCCCCCCTACCCGCAGCCTTGGGTGCTGGGACTGCCGACAGCAGAGACC +GCTCTAAAAATAGCTCCCTTCCTGGACGGCCCCTCCCAGCAGGCCCCCCACCCCCCGCTGAGGCTCACTG +CTGGTGTCCCTGCCTGGTGCTCCCTAGGGTGTGCACCCACGGCAGGTGCAGAGGCAGGGGGCACCGCGGG +GGGCTGGTAACAGCAGCAGGCCTAGGGGCCAGGGGACGGCGAGCTGTACGTGCCAGGACACCCCCAGCCA +CTGTCCCGTGCTCCCGGCGGGGGCCAGCTGTCCCCCGCCAGCCCGACTCAGCACTTGCTCTGCACATCGG +CCTGGGGCGGCCCATACAAGGGCATGGGGCTGGGCACAAGGCACGCCCGCGTCCAGGTGGGCACTGTGCC +CGGGCATGGTGGGGGGAGAGGCTGGGCTGCCTTCCAGGCCCCCTCCCTGGGGCCAGCCCCGCCCAGCCAG +TGGCACAGCAGGGTCCCCTACATAAAGCCGGGGCCGCGGGCCCTGGATCTGCCTCAGTGTCGCCTGCAGG +ACACACGTAGCCCCGCTCAGCACAGCCCAGCCAG \ No newline at end of file diff --git a/mussa.cc b/mussa.cc index af25db2..12c649a 100644 --- a/mussa.cc +++ b/mussa.cc @@ -1,44 +1,25 @@ #include "mussa_class.hh" #include "time.h" + int main(int argc, char **argv) { Mussa overlord; + char run_mode; char * para_file, * ana_file; - string an_arg; time_t t1, t2, begin, end; double setuptime, seqloadtime, seqcomptime, nwaytime, savetime, totaltime; - string run_mode; int seq_num; begin = time(NULL); - // need more sophisticated arg reading structure - // support -w and -t override options...other stuff?? - an_arg = * ++argv; - - if (an_arg == "-v") - { - seq_num = atoi(* ++argv); - ana_file = * ++argv; - run_mode = "v"; - } - else - { - para_file = (char*) an_arg.c_str(); - run_mode = "f"; - } + run_mode = overlord.parse_args(argc, argv); - if (run_mode == "v") - { - cout << "load file = " << ana_file << " seq num = " << seq_num << endl; - overlord.load_old(ana_file, seq_num); - } - else if (run_mode == "f") + if ((run_mode == 'f') || (run_mode == 'n')) { t1 = time(NULL); - overlord.setup(para_file); + overlord.setup(); t2 = time(NULL); setuptime = difftime(t2, t1); @@ -66,7 +47,7 @@ int main(int argc, char **argv) cout << "fum\n"; t1 = time(NULL); - overlord.save_old(); + overlord.save(); t2 = time(NULL); savetime = difftime(t2, t1); @@ -82,8 +63,10 @@ int main(int argc, char **argv) cout << totaltime << "\n"; } + if (run_mode == 'v') + overlord.load(); - if (run_mode == "v") + if ((run_mode == 'f') || (run_mode == 'v')) overlord.FuckingPieceOfShit(1000,500); } diff --git a/mussa_class.cc b/mussa_class.cc index d00f766..110d6f5 100644 --- a/mussa_class.cc +++ b/mussa_class.cc @@ -4,26 +4,128 @@ #include "mussa_class.hh" +// doesn't do neg ints... +string +int_to_str(int an_int) +{ + string converted_int; + int remainder; + + converted_int = ""; + + if (an_int == 0) + converted_int = "0"; + + while (an_int != 0) + { + remainder = an_int % 10; + + if (remainder == 0) + converted_int = "0" + converted_int; + else if (remainder == 1) + converted_int = "1" + converted_int; + else if (remainder == 2) + converted_int = "2" + converted_int; + else if (remainder == 3) + converted_int = "3" + converted_int; + else if (remainder == 4) + converted_int = "4" + converted_int; + else if (remainder == 5) + converted_int = "5" + converted_int; + else if (remainder == 6) + converted_int = "6" + converted_int; + else if (remainder == 7) + converted_int = "7" + converted_int; + else if (remainder == 8) + converted_int = "8" + converted_int; + else if (remainder == 9) + converted_int = "9" + converted_int; + + an_int = an_int / 10; + } + + return converted_int; +} + Mussa::Mussa() { } +char +Mussa::parse_args(int argc, char **argv) +{ + int i; + string an_arg; + char run_mode; + + win_override = false; + thres_override = false; + // minimal arg reading structure, not very robust to errors + + + i = 1; + while (i < argc) + { + an_arg = * ++argv; + i++; + + if (an_arg == "-v") + { + ana_name = * ++argv; + i++; + run_mode = 'v'; + } + else if (an_arg == "-n") + { + para_file_path = * ++argv; + i++; + run_mode = 'n'; + } + else if (an_arg == "-w") + { + window = atoi(* ++argv); + i++; + win_override = true; + } + else if (an_arg == "-t") + { + threshold = atoi(* ++argv); + i++; + thres_override = true; + cout << thres_override << endl; + } + else + { + para_file_path = an_arg; + run_mode = 'f'; + } + } + return run_mode; +} + + void -Mussa::setup(char * para_file_path) +Mussa::setup() { ifstream para_file; string file_data_line; string param, value, annot_file; int split_index, fasta_index; + int sub_seq_start, sub_seq_end; bool seq_params, did_seq; int bogo; win_append = false; thres_append = false; + seq_files.clear(); + fasta_indices.clear(); + annot_files.clear(); + sub_seq_starts.clear(); + sub_seq_ends.clear(); - para_file.open(para_file_path, ios::in); + para_file.open(para_file_path.c_str(), ios::in); getline(para_file,file_data_line); split_index = file_data_line.find(" "); @@ -43,14 +145,22 @@ Mussa::setup(char * para_file_path) else if (param == "SEQUENCE_NUM") seq_num = atoi(value.c_str()); else if (param == "WINDOW") - window = atoi(value.c_str()); + { + if (!win_override) + window = atoi(value.c_str()); + } else if (param == "THRESHOLD") - threshold = atoi(value.c_str()); + { + if (!thres_override) + threshold = atoi(value.c_str()); + } else if (param == "SEQUENCE") { seq_files.push_back(value); fasta_index = 1; annot_file = ""; + sub_seq_start = 0; + sub_seq_end = 0; seq_params = true; while ((!para_file.eof()) && seq_params) @@ -64,6 +174,13 @@ Mussa::setup(char * para_file_path) fasta_index = atoi(value.c_str()); else if (param == "ANNOTATION") annot_file = value; + else if (param == "SEQ_START") + sub_seq_start = atoi(value.c_str()); + else if (param == "SEQ_END") + { + cout << "hey! " << atoi(value.c_str()) << endl; + sub_seq_end = atoi(value.c_str()); + } //ignore empty lines or that start with '#' else if ((param == "") || (param == "#")) {} else seq_params = false; @@ -72,6 +189,8 @@ Mussa::setup(char * para_file_path) fasta_indices.push_back(fasta_index); annot_files.push_back(annot_file); + sub_seq_starts.push_back(sub_seq_start); + sub_seq_ends.push_back(sub_seq_end); did_seq = true; } @@ -99,29 +218,44 @@ Mussa::setup(char * para_file_path) cout << "window = " << window << " threshold = " << threshold << "\n"; } +// if (!((param == "") || (param == "#"))) +// cout << value << " = " << param << endl; void Mussa::get_Seqs() { list::iterator seq_files_i, annot_files_i; - list::iterator fasta_indices_i; + list::iterator fasta_indices_i, seq_starts_i, seq_ends_i; Sequence aSeq; seq_files_i = seq_files.begin(); fasta_indices_i = fasta_indices.begin(); annot_files_i = annot_files.begin(); + seq_starts_i = sub_seq_starts.begin(); + seq_ends_i = sub_seq_ends.begin(); - while ( (seq_files_i != seq_files.end()) && + + while (seq_files_i != seq_files.end()) + /* it should be guarenteed that each of the following exist + && (fasta_indices_i != fasta_indices.end()) && - (annot_files_i != annot_files.end()) ) + (annot_files_i != annot_files.end()) && + (seq_starts_i != sub_seq_starts.end()) && + (seq_ends_i != sub_seq_ends.end()) ) + */ { - aSeq.load_fasta(*seq_files_i, *fasta_indices_i); + aSeq.load_fasta(*seq_files_i, *fasta_indices_i,*seq_starts_i,*seq_ends_i); + if (*annot_files_i != "") + aSeq.load_annot(*annot_files_i); the_Seqs.push_back(aSeq); cout << aSeq.hdr() << endl; + //cout << aSeq.seq() << endl; aSeq.clear(); ++seq_files_i; ++fasta_indices_i; ++annot_files_i; + ++seq_starts_i; + ++seq_ends_i; } } @@ -158,28 +292,103 @@ Mussa::seqcomp() all_comps[i][i2].setup("m", window, threshold, seq_lens[i],seq_lens[i2]); all_comps[i][i2].seqcomp(the_Seqs[i].seq(), the_Seqs[i2].seq(), false); all_comps[i][i2].seqcomp(the_Seqs[i].seq(),the_Seqs[i2].rev_comp(),true); - save_file_string = "ana"; - if (i == 0) - save_file_string += "0"; - else if (i == 1) - save_file_string += "1"; - if (i2 == 1) - save_file_string += "1"; - else if (i2 == 2) - save_file_string += "2"; all_comps[i][i2].file_save(save_file_string); } } + void Mussa::nway() { - the_paths.setup(seq_num); + the_paths.setup(seq_num, window, threshold); the_paths.find_paths_r(all_comps); + the_paths.simple_refine(); } +void +Mussa::save() +{ + string save_path_base, save_path; + fstream save_file; + int i; + + // gotta do bit with adding win & thres if to be appended - need itos + + // not sure why, but gotta close file each time since can't pass file streams + + save_path_base = ana_name; + + if (win_append) + save_path_base += "_w" + int_to_str(window); + + if (thres_append) + save_path_base += "_t" + int_to_str(threshold); + + // save sequence and annots to a special mussa file + save_path = save_path_base + ".museq"; + save_file.open(save_path.c_str(), ios::out); + save_file << "" << endl; + //save_file.close(); + + for(i = 0; i < seq_num; i++) + the_Seqs[i].save(save_file); + + //save_file.open(save_path.c_str(), ios::app); + save_file << "" << endl; + save_file.close(); + + // save nway paths to its mussa save file + save_path = save_path_base + ".muway"; + the_paths.save(save_path); +} + +void +Mussa::load() +{ + int i; + string load_file_path; + Sequence tmp_seq; + + + load_file_path = ana_name + ".muway"; + seq_num = the_paths.load(load_file_path); + + load_file_path = ana_name + ".museq"; + for (i = 1; i <= seq_num; i++) + { + tmp_seq.clear(); + tmp_seq.load_museq(load_file_path, i); + the_Seqs.push_back(tmp_seq); + } +} + + + +// In Memorial to Everything that's gone wrong in the last week +// and Everything that will go wrong in the next 2 weeks 03/02/2004 - Tristan +void +Mussa::FuckingPieceOfShit(int x_max, int y_max) +{ + Fl_Window *conn_window = new Fl_Window(x_max, y_max, "Mussa Connections"); + ConnView *conn_box = new ConnView(0, 0, x_max, y_max); + conn_box->setup(ana_name, seq_num, window, &the_Seqs, &the_paths); + conn_box->scale_paths(); + conn_window->end(); + conn_window->show(); + + Fl::run(); +} + +/* + cout << "fee\n"; + cout << "fie\n"; + cout << "foe\n"; + cout << "fum\n"; +*/ + + void Mussa::save_old() @@ -211,7 +420,7 @@ Mussa::load_old(char * load_file_path, int s_num) Sequence a_seq; seq_num = s_num; - the_paths.setup(seq_num); + the_paths.setup(seq_num, 0, 0); save_file.open(load_file_path, ios::in); // currently loads old mussa format @@ -252,30 +461,3 @@ Mussa::load_old(char * load_file_path, int s_num) //the_paths.save("tmp.save"); } - - -// In Memorial to Everything that's gone wrong in the last week -// and Everything that will go wrong in the next 2 weeks 03/02/2004 - Tristan -void -Mussa::FuckingPieceOfShit(int x_max, int y_max) -{ - - Fl_Window *conn_window = new Fl_Window(x_max, y_max, "Mussa Connections"); - ConnView *conn_box = new ConnView(0, 0, x_max, y_max); - conn_box->setup(ana_name, seq_num, window, &the_Seqs, &the_paths); - conn_box->scale_paths(); - conn_box->spawnSeq(); - conn_window->end(); - conn_window->show(); - - Fl::run(); -} - -/* - cout << "fee\n"; - cout << "fie\n"; - cout << "foe\n"; - cout << "fum\n"; -*/ - - diff --git a/mussa_class.hh b/mussa_class.hh index b80bc3e..06877c7 100644 --- a/mussa_class.hh +++ b/mussa_class.hh @@ -8,10 +8,11 @@ class Mussa { private: - string ana_name; + string para_file_path, ana_name; int seq_num, window, threshold; list seq_files, annot_files; - list fasta_indices; + list fasta_indices, sub_seq_starts, sub_seq_ends; + bool win_override, thres_override; bool win_append, thres_append; vector the_Seqs; @@ -20,12 +21,18 @@ class Mussa public: Mussa(); - void setup(char * para_file_path); + char parse_args(int argc, char **argv); + void setup(); void get_Seqs(); void seqcomp(); void nway(); + void save(); + void load(); + void FuckingPieceOfShit(int x_max, int y_max); + + // deprecated - support bridge for python version of mussa void save_old(); void load_old(char * load_file_path, int s_num); - void FuckingPieceOfShit(int x_max, int y_max); + }; diff --git a/mussa_gui_conn.cc b/mussa_gui_conn.cc index 75a757a..aeff03e 100644 --- a/mussa_gui_conn.cc +++ b/mussa_gui_conn.cc @@ -25,6 +25,11 @@ ConnView::setup(string name, int sq_num, int win_len, x_scale_factor = (float) seq_lens[0] / x_max; cout << "scale factor is " << x_scale_factor << endl; y_seq_incre = (y_max-20) / (seq_num - 1); + + dragging = false; + selected = false; + + highlight.clear(); } @@ -38,10 +43,10 @@ ConnView::scale_paths() scaled_pathz.clear(); - for(pathz_i = P->pathz.begin(); pathz_i != P->pathz.end(); ++pathz_i) + for(pathz_i = P->refined_pathz.begin(); pathz_i != P->refined_pathz.end(); ++pathz_i) { a_path = *pathz_i; - for(i2 = 0; i2 < seq_num; i2++) + for(i2 = 0; i2 <= seq_num; i2++) a_path[i2] = (int) (a_path[i2] / x_scale_factor); scaled_pathz.push_back(a_path); } @@ -54,30 +59,108 @@ void ConnView::draw() { list >::iterator i; - int i2, y_loc, x_loc; + int i2, i3, y_loc, x_loc, x_start, x_end; vector a_path; + list::iterator annot_i; + int window_size; + bool rc_color; + int path_start, path_end; + list::iterator highlight_i; + + + // clear drawing area and set background to white + fl_color(FL_WHITE); + fl_rectf(x(), y(), w(), h()); + + // determine which paths to highlight + highlight.clear(); + for(i = scaled_pathz.begin(); i != scaled_pathz.end(); ++i) + { + a_path = *i; + // determine if path falls within the selected region and mark it for + // highlighted color + path_start = abs(a_path[1]); + path_end = path_start + a_path[0]; + if ( ( (path_start >= drag_start) && (path_end <= drag_end) ) || + ( (path_start < drag_start) && (path_end > drag_end) ) || + ( (path_start < drag_start) && (path_end > drag_start) ) || + ( (path_start < drag_end) && (path_end > drag_end) ) ) + highlight.push_back(true); + else + highlight.push_back(false); + } - fl_color(FL_RED); fl_line_style(FL_SOLID, 1, NULL); + // draw non-highlight paths (ie not in the selection box) + highlight_i = highlight.begin(); + for(i = scaled_pathz.begin(); i != scaled_pathz.end(); ++i) + { + a_path = *i; + y_loc = 10; + + // get window size to determine line width + window_size = a_path[0]; + // make sure width is at least 1 - might be zero to my slack rounding + if (window_size == 0) + window_size = 1; + + if (!(*highlight_i)) + for(i2 = seq_num; i2 > 1; i2--) + { + // RC case handling + // ugh, an xor...only want blue if one of the nodes is rc + if ( ((a_path[i2] < 0) || (a_path[i2-1] < 0)) && + !((a_path[i2] < 0) && (a_path[i2-1] < 0)) ) + fl_color(FL_CYAN); + else + fl_color(FL_MAGENTA); + + fl_polygon((int)abs(a_path[i2]), y_loc + 3, + (int)abs(a_path[i2])+window_size, y_loc + 3, + (int)abs(a_path[i2-1])+window_size, y_loc + y_seq_incre - 3, + (int)abs(a_path[i2-1]), y_loc + y_seq_incre - 3); + y_loc += y_seq_incre; + } + ++highlight_i; + } + + // draw highlighted paths (ie in or partially in selection) + // drawing these separately and after other paths so they are on top + highlight_i = highlight.begin(); for(i = scaled_pathz.begin(); i != scaled_pathz.end(); ++i) { a_path = *i; y_loc = 10; - for(i2 = seq_num - 1; i2 > 0; i2--) - { - // RC case handling - // ugh, and xor...only want blue if one of the nodes is rc - if ( ((a_path[i2] < 0) || (a_path[i2-1] < 0)) && - !((a_path[i2] < 0) && (a_path[i2-1] < 0)) ) - fl_color(FL_BLUE); - - fl_line((int)abs(a_path[i2]),y_loc,(int)abs(a_path[i2-1]),y_loc + y_seq_incre); - y_loc += y_seq_incre; - fl_color(FL_RED); - } + + // get window size to determine line width + window_size = a_path[0]; + // make sure width is at least 1 - might be zero to my slack rounding + if (window_size == 0) + window_size = 1; + + if (*highlight_i) + for(i2 = seq_num; i2 > 1; i2--) + { + // RC case handling + // ugh, an xor...only want blue if one of the nodes is rc + if ( ((a_path[i2] < 0) || (a_path[i2-1] < 0)) && + !((a_path[i2] < 0) && (a_path[i2-1] < 0)) ) + fl_color(FL_BLUE); + else + fl_color(FL_RED); + + fl_polygon((int)abs(a_path[i2]), y_loc + 3, + (int)abs(a_path[i2])+window_size, y_loc + 3, + (int)abs(a_path[i2-1])+window_size, y_loc + y_seq_incre - 3, + (int)abs(a_path[i2-1]), y_loc + y_seq_incre - 3); + + y_loc += y_seq_incre; + } + ++highlight_i; } + // draw sequence representation lines fl_color(FL_BLACK); fl_line_style(FL_SOLID, 3, NULL); y_loc = 10; @@ -87,21 +170,116 @@ ConnView::draw() fl_line(0,y_loc,x_loc,y_loc); y_loc += y_seq_incre; } + + // draw annotations + fl_color(FL_GREEN); + fl_line_style(FL_SOLID, 9, NULL); + y_loc = 10; + for(i2 = seq_num - 1; i2 >= 0; i2--) + { + for(annot_i = (*S)[i2].annots.begin(); annot_i != (*S)[i2].annots.end(); ++annot_i) + { + x_start = (int)(annot_i->start / x_scale_factor); + x_end = (int)(annot_i->end / x_scale_factor); + fl_line(x_start,y_loc,x_end,y_loc); + } + y_loc += y_seq_incre; + } + + // draw selection box + if (selected) + { + fl_color(FL_BLACK); + fl_line_style(FL_SOLID, 2, NULL); + fl_rect(drag_start, y_loc - y_seq_incre-8, drag_end-drag_start, 16); + } +} + + + +int +ConnView::handle(int e) +{ + int return_value; + + switch(e) + { + case FL_PUSH: + if (Fl::event_button3()) + { + spawnSeq(); + return_value = 1; + } + break; + case FL_DRAG: + if (!dragging) + { + drag_start = Fl::event_x() -x(); + dragging = true; + selected = false; + return_value = 1; + } + fl_color(FL_CYAN); + fl_line_style(FL_SOLID, 1, NULL); + fl_overlay_rect(drag_start, y_seq_incre*(seq_num-1), Fl::event_x()-drag_start, 16); + + break; + case FL_RELEASE: + if (dragging) + { + drag_end = Fl::event_x() - x(); + dragging = false; + selected = true; + redraw(); + return_value = 1; + } + break; + default: + return_value = Fl_Widget::handle(e); + } + + return return_value; } + void ConnView::spawnSeq() { - int seq_box_x; + list > selected_paths; + list >::iterator pathz_i; + int i2, i3, y_loc, x_loc, x_start, x_end; + vector a_path; + list::iterator highlight_i; + + // make new list of connections that are highlighted + selected_paths.clear(); + highlight_i = highlight.begin(); + for(pathz_i = P->refined_pathz.begin(); pathz_i != P->refined_pathz.end(); ++pathz_i) + { + if (*highlight_i) + { + a_path = *pathz_i; + selected_paths.push_back(a_path); + //cout << "bing "; + } + /* + else + cout << "bang "; + */ + ++highlight_i; + } + + //cout << endl; Fl_Window *seq_window = new Fl_Window(x_max, y_max, "Mussa Sequence"); - Fl_Scroll *scroll_seq = new Fl_Scroll(0, 0, x_max, y_max); - seq_box_x = 32000; - //seq_lens[0]*14; - SeqView *seq_box = new SeqView(0, 0, seq_box_x, y_max-20); - seq_box->setup(ana_name, seq_num, window, S, P, seq_lens); + + SeqView *seq_box = new SeqView(0, 0, x_max, y_max); + seq_box->setup(ana_name, seq_num, S, selected_paths, seq_lens); + seq_box->debug_draw(); seq_window->end(); seq_window->show(); } + + diff --git a/mussa_gui_conn.hh b/mussa_gui_conn.hh index 0994dcf..d2813a8 100644 --- a/mussa_gui_conn.hh +++ b/mussa_gui_conn.hh @@ -16,7 +16,10 @@ class ConnView : public Fl_Box int x_max, y_max; float x_scale_factor; int y_seq_incre; - //list< will be list of these later for zooming history + int drag_start, drag_end; + bool dragging, selected; + list highlight; + list > scaled_pathz; vector seq_lens; @@ -33,5 +36,6 @@ class ConnView : public Fl_Box vector *, Nway_Paths *); void scale_paths(); void draw(); + int handle(int e); void spawnSeq(); }; diff --git a/mussa_gui_seq.cc b/mussa_gui_seq.cc index bf1afb3..4cf34af 100644 --- a/mussa_gui_seq.cc +++ b/mussa_gui_seq.cc @@ -2,13 +2,12 @@ void -SeqView::setup(string name, int sq_num, int win_len, +SeqView::setup(string name, int sq_num, vector *some_seqs, - Nway_Paths *some_paths, vector some_lens) + list > some_paths, vector some_lens) { ana_name = name; seq_num = sq_num; - base_window_len = win_len; S = some_seqs; P = some_paths; seq_lens = some_lens; @@ -17,6 +16,8 @@ SeqView::setup(string name, int sq_num, int win_len, fl_font(FL_COURIER, 14); cout << fl_width('A') << endl; + + align_offsets(0); } @@ -34,9 +35,11 @@ SeqView::draw() int window_length, win_i; int rc_1 = 0; int rc_2 = 0; + string sub_seq; + int offset1, offset2; + float center1, center2; - window_length = base_window_len; fl_font(FL_COURIER, 14); fl_color(FL_BLACK); //blatantly stolen from FR2 @@ -45,52 +48,149 @@ SeqView::draw() y_loc = 10; for(seq_i = seq_num-1; seq_i >= 0; --seq_i) { - //a_seq = a_seq; - fl_draw((*S)[seq_i].c_seq(), 0+x(), y_loc+y()); + sub_seq = (*S)[seq_i].seq(); + sub_seq = sub_seq.substr(seq_align_offsets[seq_i]); + fl_draw(sub_seq.c_str(), 0+x(), y_loc+y()); y_loc += y_seq_incre; } fl_color(FL_RED); fl_line_style(FL_SOLID, 1, NULL); - //fl_line(x(),10+y(),20+x(),y_max/2 + y()); - - //fl_color(FL_RED); - - for(pathz_i = P->pathz.begin(); pathz_i != P->pathz.end(); ++pathz_i) + for(pathz_i = P.begin(); pathz_i != P.end(); ++pathz_i) { a_path = *pathz_i; y_loc = 0; - for(i2 = seq_num - 1; i2 > 0; i2--) + window_length = a_path[0]; + for(i2 = seq_num; i2 > 1; i2--) { // RC case handling - // ugh, and xor...only want black if one of the nodes is rc + // ugh, and xor...only want rc coloring if just one of the nodes is rc if ( ((a_path[i2] < 0) || (a_path[i2-1] < 0)) && !((a_path[i2] < 0) && (a_path[i2-1] < 0)) ) fl_color(FL_BLUE); + offset1 = seq_align_offsets[i2-1]; + offset2 = seq_align_offsets[i2-2]; + center1 = 0.5; + center2 = 0.5; + // no matter the case, any RC node needs an offset if (a_path[i2] < 0) + { rc_1 = window_length; + offset1 = -offset1; + center1 = -center1; + } if (a_path[i2-1] < 0) + { rc_2 = window_length; - // normal case + offset2 = -offset2; + center2 = -center2; + } + + // draw the line for each bp in this window that actually matches for(win_i = 0; win_i < window_length; win_i++) - fl_line(abs((int)((a_path[i2]-rc_1+win_i+x()+0.5)*ch_width)), - y_loc + 10+y(), - abs((int)((a_path[i2-1]-rc_2+win_i+x()+0.5)*ch_width)), - y_loc+y_seq_incre+y()); + { + x_start = abs((int) (a_path[i2]-rc_1+win_i-offset1) ); + x_end = abs((int) (a_path[i2-1]-rc_2+win_i-offset2) ); + fl_line( (x_start+center1)*ch_width, y_loc + 10, + (x_end+center2)*ch_width, y_loc+y_seq_incre ); + } y_loc += y_seq_incre; - //if ((a_path[i2] < 0) || (a_path[i2-1] < 0)) - //{ - fl_color(FL_RED); - rc_1 = 0; - rc_2 = 0; - //} + fl_color(FL_RED); + rc_1 = 0; + rc_2 = 0; } } } + +void +SeqView::debug_draw() +{ + int seq_i; + Sequence a_seq; + int i, y_loc; + vector a_path; + list >::iterator pathz_i; + int i2; + int x_start, y_start, x_end, y_end; + double ch_width; + int window_length, win_i; + int rc_1 = 0; + int rc_2 = 0; + string sub_seq; + + + cout << "debugging...\n"; + y_loc = 10; + for(seq_i = seq_num-1; seq_i >= 0; --seq_i) + { + sub_seq = (*S)[seq_i].seq(); + cout << sub_seq << endl; + sub_seq = sub_seq.substr(seq_align_offsets[seq_i]); + cout << sub_seq << endl; + //fl_draw(sub_seq.c_str(), 0+x(), y_loc+y()); + y_loc += y_seq_incre; + } +} + +void +SeqView::align_offsets(int align_num) +{ + list >::iterator pathz_i; + int i; + + + if (P.begin() == P.end()) + cout << "crud....\n"; + pathz_i = P.begin(); + while(pathz_i != P.end()) + { + //cout << (*pathz_i)[0] << endl; + ++pathz_i; + } + + // find the path specified + i = 0; + pathz_i = P.begin(); + cout << "fee\n"; + while( (i < align_num) && (pathz_i != P.end()) ) + { + cout << "fie\n"; + ++i; + ++pathz_i; + } + + cout << "foe\n"; + // now set the alignment offsets - basically where the path starts + seq_align_offsets.clear(); + for(i = 0; i < seq_num; i++) + { + cout << (*pathz_i)[i+1] << endl; + seq_align_offsets.push_back( (*pathz_i)[i+1] ); + } + cout << "fum\n"; +} + + + + + + + + + + + + + + + + + + /* x_start = (int) (); y_start = ; diff --git a/mussa_gui_seq.hh b/mussa_gui_seq.hh index aed0140..58e6151 100644 --- a/mussa_gui_seq.hh +++ b/mussa_gui_seq.hh @@ -16,11 +16,13 @@ class SeqView : public Fl_Box //this data is passed as pointers to the instantiated classes vector *S; - Nway_Paths *P; + // list of paths in selection box + list > P; vector seq_lens; int x_max, y_max; int y_seq_incre; + vector seq_align_offsets; public: SeqView(int x_top,int y_top,int x_bot,int y_bot) : @@ -30,6 +32,9 @@ class SeqView : public Fl_Box y_max = y_bot; } - void setup(string, int, int, vector *, Nway_Paths *, vector); + void setup(string, int, vector *some_seqs, + list > some_paths, vector); void draw(); + void align_offsets(int align_num); + void debug_draw(); }; diff --git a/mussa_nway.cc b/mussa_nway.cc index ae9b69a..fd760b8 100644 --- a/mussa_nway.cc +++ b/mussa_nway.cc @@ -16,9 +16,11 @@ Nway_Paths::Nway_Paths() } void -Nway_Paths::setup(int sp_num) +Nway_Paths::setup(int sp_num, int w, int t) { species_num = sp_num; + threshold = t; + win_size = w; pathz.clear(); } @@ -53,10 +55,13 @@ Nway_Paths::path_search(vector > all_comparisons, vector path, if (trans_check_good) { + // this makes sure path nodes are recorded with RC status relative to + // the base species if ( path[depth-1] >= 0) path.push_back(*new_nodes_i); else path.push_back(*new_nodes_i * -1); + if (depth < species_num - 1) path_search(all_comparisons, path, depth + 1); else @@ -106,6 +111,56 @@ Nway_Paths::find_paths_r(vector > all_comparisons) } } +// dumbly goes thru and combines path windows exactly adjacent (ie + 1 index) +// doesn't deal with interleaved adjacency +void +Nway_Paths::simple_refine() +{ + // ext_path remembers the first window set in an extending path + vector ext_path, cur_path, next_path, new_path; + list >::iterator pathz_i; + int i2, win_ext_len = 0; + bool extending; + + + refined_pathz.clear(); + + pathz_i = pathz.begin(); + ext_path = *pathz_i; + while(pathz_i != pathz.end()) + { + // keep track of current path and advance to next path + cur_path = *pathz_i; + ++pathz_i; + next_path = *pathz_i; + + // if node for each seq is equal to the next node+1 then for all + // sequences then we are extending + extending = true; + for(i2 = 0; i2 < species_num; i2++) + { + //cout << cur_path[i2] << " vs " << next_path[i2] << endl; + if (cur_path[i2] + 1 != next_path[i2]) + extending = false; + } + if (extending) + { + //cout << "Raaaawwwrrr! I am extending\n"; + win_ext_len++; + } + else + { + // add the extend window length as first element and add as refined + new_path = ext_path; + new_path.insert(new_path.begin(),win_size + win_ext_len); + refined_pathz.push_back(new_path); + // reset stuff + win_ext_len = 0; + ext_path = next_path; + } + } +} + void Nway_Paths::add_path(vector loaded_path) @@ -124,27 +179,122 @@ Nway_Paths::save(string save_file_path) save_file.open(save_file_path.c_str(), ios::out); - //save_file << "\n"; - - path_i = pathz.begin(); - paths_end = pathz.end(); - if (path_i == paths_end) - cout << "Arrrrrrgggghhhhhh\n"; - while(path_i != paths_end) + save_file << "\n"; + + path_i = refined_pathz.begin(); + paths_end = refined_pathz.end(); + //path_i = pathz.begin(); + //paths_end = pathz.end(); + while (path_i != paths_end) { a_path = *path_i; //cout << a_path.size() << endl; - for(i = 0; i < species_num; ++i) - save_file << i << "," << a_path[i] << " "; + //first entry is the window length of the windows in the path + save_file << a_path[0] << ":"; + for(i = 1; i <= species_num; ++i) + { + save_file << a_path[i]; + if (i != species_num) + save_file << ","; + } save_file << endl; ++path_i; } -//save_file << "\n"; + save_file << "\n"; save_file.close(); } +/* + if (path_i == paths_end) + cout << "Arrrrrrgggghhhhhh\n"; +*/ + +int +Nway_Paths::load(string load_file_path) +{ + fstream load_file; + string file_data_line, header_data, data, path_node, path_length; + int i, space_split_i, equal_split_i, comma_split_i, colon_split_i; + vector loaded_path; + + + load_file.open(load_file_path.c_str(), ios::in); + + // get header data + // grab mussa tag - discard for now...maybe check in future... + getline(load_file,file_data_line); + space_split_i = file_data_line.find(" "); + file_data_line = file_data_line.substr(space_split_i+1); + // grab type tag - need in future to distinguish between flp and vlp paths + space_split_i = file_data_line.find(" "); + file_data_line = file_data_line.substr(space_split_i+1); + // get species/seq number + space_split_i = file_data_line.find(" "); + header_data = file_data_line.substr(0,space_split_i); + equal_split_i = header_data.find("="); + data = file_data_line.substr(equal_split_i+1); + species_num = atoi (data.c_str()); + file_data_line = file_data_line.substr(space_split_i+1); + // get window size + space_split_i = file_data_line.find(" "); + header_data = file_data_line.substr(0,space_split_i); + equal_split_i = header_data.find("="); + data = file_data_line.substr(equal_split_i+1); + win_size = atoi (data.c_str()); + file_data_line = file_data_line.substr(space_split_i+1); + // get window size + space_split_i = file_data_line.find(" "); + header_data = file_data_line.substr(0,space_split_i); + equal_split_i = header_data.find("="); + data = file_data_line.substr(equal_split_i+1); + threshold = atoi (data.c_str()); + file_data_line = file_data_line.substr(space_split_i+1); + + cout << "seq_num=" << species_num << " win=" << win_size; + cout << " thres=" << threshold << endl; + + // clear out the current data + refined_pathz.clear(); + + int temp; + + getline(load_file,file_data_line); + while ( (!load_file.eof()) && (file_data_line != "") ) + { + if (file_data_line != "") + { + loaded_path.clear(); + colon_split_i = file_data_line.find(":"); + path_length = file_data_line.substr(0,colon_split_i); + loaded_path.push_back(atoi (path_length.c_str())); + file_data_line = file_data_line.substr(colon_split_i+1); + for(i = 0; i < species_num; i++) + { + comma_split_i = file_data_line.find(","); + path_node = file_data_line.substr(0, comma_split_i); + temp = atoi (path_node.c_str()); + loaded_path.push_back(temp); + file_data_line = file_data_line.substr(comma_split_i+1); + } + refined_pathz.push_back(loaded_path); + } + getline(load_file,file_data_line); + } + + + load_file.close(); + + return species_num; +} + + + void Nway_Paths::save_old(string save_file_path) { diff --git a/mussa_nway.hh b/mussa_nway.hh index 4352924..3e5bbe2 100644 --- a/mussa_nway.hh +++ b/mussa_nway.hh @@ -14,20 +14,22 @@ class Nway_Paths int threshold; int win_size; list > pathz; + list > refined_pathz; public: Nway_Paths(); - void setup(int sp_num); + void setup(int sp_num, int w, int t); void find_paths_r(vector > all_comparisons); void path_search(vector > all_comparisons, vector path, int depth); + void simple_refine(); void save(string save_file_path); - void save_old(string save_file_path); - void load(string load_file_path); + int load(string load_file_path); void add_path(vector loaded_path); + void find_paths(vector > all_comparisons); void refine(); - void print(); - list > scaled_paths(float scale_factor); + void save_old(string save_file_path); + void print(); }; diff --git a/mussa_test.cc b/mussa_test.cc new file mode 100644 index 0000000..13b4021 --- /dev/null +++ b/mussa_test.cc @@ -0,0 +1,94 @@ +#include "mussa_class.hh" +#include "time.h" + +int main(int argc, char **argv) +{ + Mussa overlord; + char * para_file, * ana_file; + string an_arg; + time_t t1, t2, begin, end; + double setuptime, seqloadtime, seqcomptime, nwaytime, savetime, totaltime; + string run_mode; + int seq_num; + + + begin = time(NULL); + + // need more sophisticated arg reading structure + // support -w and -t override options...other stuff?? + an_arg = * ++argv; + + if (an_arg == "-v") + { + ana_file = * ++argv; + run_mode = "v"; + } + else + { + para_file = (char*) an_arg.c_str(); + run_mode = "f"; + } + + if (run_mode == "v") + { + cout << "load file = " << ana_file << endl; + overlord.load(ana_file); + //overlord.FuckingPieceOfShit(1000,500); + } + else if (run_mode == "f") + { + t1 = time(NULL); + overlord.setup(para_file); + t2 = time(NULL); + setuptime = difftime(t2, t1); + + + cout << "fee\n"; + t1 = time(NULL); + overlord.get_Seqs(); + t2 = time(NULL); + seqloadtime = difftime(t2, t1); + + + cout << "fie\n"; + t1 = time(NULL); + overlord.seqcomp(); + t2 = time(NULL); + seqcomptime = difftime(t2, t1); + + + cout << "foe\n"; + t1 = time(NULL); + overlord.nway(); + t2 = time(NULL); + nwaytime = difftime(t2, t1); + + + cout << "fum\n"; + t1 = time(NULL); + overlord.save_old(); + t2 = time(NULL); + savetime = difftime(t2, t1); + + end = time(NULL); + totaltime = difftime(end, begin); + + cout << "setup\tseqload\tseqcomp\tnway\tsave\ttotal\n"; + cout << setuptime << "\t"; + cout << seqloadtime << "\t"; + cout << seqcomptime << "\t"; + cout << nwaytime << "\t"; + cout << savetime << "\t"; + cout << totaltime << "\n"; + } +} + + + + +/* + cout << "fee\n"; + cout << "fie\n"; + cout << "foe\n"; + cout << "fum\n"; +*/ diff --git a/seqcomp.cc b/seqcomp.cc index d0a19ec..5eaba25 100644 --- a/seqcomp.cc +++ b/seqcomp.cc @@ -1,6 +1,6 @@ #include "flp.hh" #include -#include +#include int main(int argc, char **argv) { diff --git a/sequence.cc b/sequence.cc index 199a4ab..3a25656 100644 --- a/sequence.cc +++ b/sequence.cc @@ -11,12 +11,18 @@ Sequence::Sequence() void -Sequence::load_fasta(string file_path, int seq_num) +Sequence::load_fasta(string file_path, int seq_num, + int start_index, int end_index) { fstream data_file; string file_data_line; int header_counter = 0; bool read_seq = true; + string rev_comp; + char conversionTable[257]; + int seq_i, table_i, len; + string sequence_raw; + char * seq_tmp; // holds sequence during basic filtering data_file.open(file_path.c_str(), ios::in); @@ -32,32 +38,122 @@ Sequence::load_fasta(string file_path, int seq_num) header = file_data_line.substr(1); + sequence_raw = ""; + while ( !data_file.eof() && read_seq ) { getline(data_file,file_data_line); if (file_data_line.substr(0,1) == ">") read_seq = false; - else sequence += file_data_line; + else sequence_raw += file_data_line; } data_file.close(); - // need more sequence filtering for non AGCTN and convert to N - transform(sequence.begin(), sequence.end(), sequence.begin(), toupper); + // sequence filtering for upcasing agctn and convert non AGCTN to N + + len = sequence_raw.length(); + seq_tmp = (char*)sequence_raw.c_str(); + + sequence = ""; + + // Make a conversion table + + // everything we don't specify below will become 'N' + for(table_i=0; table_i < 256; table_i++) + { + conversionTable[table_i] = 'N'; + } + // add end of string character for printing out table for testing purposes + conversionTable[256] = '\0'; + + // we want these to map to themselves - ie not to change + conversionTable['A'] = 'A'; + conversionTable['T'] = 'T'; + conversionTable['G'] = 'G'; + conversionTable['C'] = 'C'; + // this is to upcase + conversionTable['a'] = 'A'; + conversionTable['t'] = 'T'; + conversionTable['g'] = 'G'; + conversionTable['c'] = 'C'; + + // finally, the actual conversion loop + for(seq_i = 0; seq_i < len; seq_i++) + { + table_i = (int) seq_tmp[seq_i]; + sequence += conversionTable[table_i]; + } + + + // Lastly, if subselection of the sequence was specified we keep cut out + // and only keep that part + + // end_index = 0 means no end was specified, so cut to the end + if (end_index == 0) + end_index = len; + + // start_index = 0 if no start was specified + sequence = sequence.substr(start_index, end_index - start_index); } + // this doesn't work properly under gcc 3.x ... it can't recognize toupper + //transform(sequence.begin(), sequence.end(), sequence.begin(), toupper); + + void Sequence::load_annot(string file_path) { fstream data_file; string file_data_line; annot an_annot; + int space_split_i; + string annot_value; + annots.clear(); data_file.open(file_path.c_str(), ios::in); + getline(data_file,file_data_line); + species = file_data_line; + + while ( !data_file.eof() ) + { + getline(data_file,file_data_line); + if (file_data_line != "") + { + // need to get 4 values...almost same code 4 times... + // get annot start index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.start = atoi (annot_value.c_str()); + file_data_line = file_data_line.substr(space_split_i+1); + // get annot end index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.end = atoi (annot_value.c_str()); + file_data_line = file_data_line.substr(space_split_i+1); + // get annot start index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.name = annot_value; + file_data_line = file_data_line.substr(space_split_i+1); + // get annot start index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.type = annot_value; + annots.push_back(an_annot); + } + } + data_file.close(); + list::iterator list_i; + for(list_i = annots.begin(); list_i != annots.end(); ++list_i) + { + cout << (*list_i).start << "," << (*list_i).end << "\t"; + cout << (*list_i).name << "\t" << (*list_i).type << endl; + } } @@ -153,3 +249,164 @@ Sequence::clear() species_num = -4; annots.clear(); } + +void +Sequence::save(fstream &save_file) + //string save_file_path) +{ + //fstream save_file; + list::iterator annots_i; + int i; + + // not sure why, or if i'm doing something wrong, but can't seem to pass + // file pointers down to this method from the mussa control class + // so each call to save a sequence appends to the file started by mussa_class + //save_file.open(save_file_path.c_str(), ios::app); + + save_file << "" << endl; + save_file << sequence << endl; + save_file << "" << endl; + + save_file << "" << endl; + for (annots_i = annots.begin(); annots_i != annots.end(); ++annots_i) + { + save_file << annots_i->start << " " << annots_i->end << " " ; + save_file << annots_i->name << " " << annots_i->type << endl; + } + save_file << "" << endl; + //save_file.close(); +} + +void +Sequence::load_museq(string load_file_path, int seq_num) +{ + fstream load_file; + string file_data_line; + int seq_counter; + annot an_annot; + int space_split_i; + string annot_value; + + annots.clear(); + load_file.open(load_file_path.c_str(), ios::in); + + seq_counter = 0; + // search for the seq_num-th sequence + while ( (!load_file.eof()) && (seq_counter < seq_num) ) + { + getline(load_file,file_data_line); + if (file_data_line == "") + seq_counter++; + } + getline(load_file, file_data_line); + //cout << "*fee\n"; + sequence = file_data_line; + //cout << "*fie\n"; + getline(load_file, file_data_line); + getline(load_file, file_data_line); + if (file_data_line == "") + { + while ( (!load_file.eof()) && (file_data_line != "") ) + { + //cout << "*foe\n"; + getline(load_file,file_data_line); + if ((file_data_line != "") && (file_data_line != "")) + { + //cout << "*fum\n"; + // need to get 4 values...almost same code 4 times... + // get annot start index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.start = atoi (annot_value.c_str()); + file_data_line = file_data_line.substr(space_split_i+1); + // get annot end index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.end = atoi (annot_value.c_str()); + file_data_line = file_data_line.substr(space_split_i+1); + // get annot start index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.name = annot_value; + file_data_line = file_data_line.substr(space_split_i+1); + // get annot start index + space_split_i = file_data_line.find(" "); + annot_value = file_data_line.substr(0,space_split_i); + an_annot.type = annot_value; + annots.push_back(an_annot); + } + } + } + load_file.close(); +} + + +list +Sequence::find_motif(string a_motif) +{ + char * seq_c; + int seq_i, motif_i, motif_len; + list motif_match_starts; + + motif_match_starts.clear(); + // faster to loop thru the sequence as a old c string (ie char array) + seq_c = (char*)sequence.c_str(); + motif_len = a_motif.length(); + + + for (seq_i = 0; seq_i < length; seq_i++) + { + + // this is pretty much a straight translation of Nora's python code + // to match iupac letter codes + if (a_motif[motif_i] == seq_c[seq_i]) + motif_i++; + else if (seq_c[seq_i] =='N') + motif_i++; + else if ((a_motif[motif_i] =='M') && + ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C'))) + motif_i++; + else if ((a_motif[motif_i] =='R') && + ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G'))) + motif_i++; + else if ((a_motif[motif_i] =='W') && + ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='T'))) + motif_i++; + else if ((a_motif[motif_i] =='S') && + ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G'))) + motif_i++; + else if ((a_motif[motif_i] =='Y') && + ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='T'))) + motif_i++; + else if ((a_motif[motif_i] =='K') && + ((seq_c[seq_i]=='G') || (seq_c[seq_i]=='T'))) + motif_i++; + else if ((a_motif[motif_i] =='V') && + ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') || + (seq_c[seq_i]=='G'))) + motif_i++; + else if ((a_motif[seq_i] =='H') && + ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='C') || + (seq_c[seq_i]=='T'))) + motif_i++; + else if ((a_motif[motif_i] =='D') && + ((seq_c[seq_i]=='A') || (seq_c[seq_i]=='G') || + (seq_c[seq_i]=='T'))) + motif_i++; + else if ((a_motif[motif_i] =='B') && + ((seq_c[seq_i]=='C') || (seq_c[seq_i]=='G') || + (seq_c[seq_i]=='T'))) + motif_i++; + else + motif_i = 0; + + // end Nora stuff, now we see if a match is found this pass + if (motif_i == motif_len) + { + motif_match_starts.push_back(seq_i - motif_len + 1); + motif_i = 0; + } + } + + return motif_match_starts; +} diff --git a/sequence.hh b/sequence.hh index afbb9da..088eac8 100644 --- a/sequence.hh +++ b/sequence.hh @@ -7,12 +7,20 @@ #include #include #include -#include +#include #include #include // Sequence data class +using namespace std; + +struct annot +{ + int start, end; + string name, type; +}; + class Sequence { friend class ConnView; @@ -24,11 +32,6 @@ class Sequence string species; int species_num; - struct annot - { - int start, end; - string name, type; - }; list annots; @@ -37,7 +40,8 @@ class Sequence public: Sequence(); - void load_fasta(string file_path, int seq_num); + void load_fasta(string file_path, int seq_num, + int start_index, int end_index); void load_annot(string file_path); string seq(); const char * c_seq(); @@ -45,6 +49,9 @@ class Sequence int len(); string hdr(); void set_seq(string a_seq); + list find_motif(string a_motif); // string species(); void clear(); + void save(fstream &save_file); + void load_museq(string load_file_path, int seq_num); }; -- 2.30.2