X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=misc%2Fwgsim.c;h=b9c513c0540bc5120c0e1294ed62f91890bad000;hp=7b5f095d910b06f3242419066ac8879b7cdb7454;hb=0f906dafb2ad22371a753557562ef95c3034480d;hpb=b301e959d73eee0955c57004f344f17af00703f4 diff --git a/misc/wgsim.c b/misc/wgsim.c index 7b5f095..b9c513c 100644 --- a/misc/wgsim.c +++ b/misc/wgsim.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). + 2011 Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,11 +24,8 @@ SOFTWARE. */ -/* Contact: Heng Li */ - /* This program is separated from maq's read simulator with Colin - * Hercus' modification to allow longer indels. Colin is the chief - * developer of novoalign. */ + * Hercus' modification to allow longer indels. */ #include #include @@ -38,8 +36,11 @@ #include #include #include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) -#define PACKAGE_VERSION "0.2.3" +#define PACKAGE_VERSION "0.3.0" const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -60,8 +61,6 @@ const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; -const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; - /* Simple normal random number generator, copied from genran.c */ double ran_normal() @@ -85,78 +84,6 @@ double ran_normal() } } -/* FASTA parser, copied from seq.c */ - -typedef struct { - int l, m; /* length and maximum buffer size */ - unsigned char *s; /* sequence */ -} seq_t; - -#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 - -static int SEQ_BLOCK_SIZE = 512; - -void seq_set_block_size(int size) -{ - SEQ_BLOCK_SIZE = size; -} - -int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) -{ - int c, l, max; - char *p; - - c = 0; - while (!feof(fp) && fgetc(fp) != '>'); - if (feof(fp)) return -1; - p = locus; - while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') - if (c != '\r') *p++ = c; - *p = '\0'; - if (comment) { - p = comment; - if (c != '\n') { - while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); - if (c != '\n') { - *p++ = c; - while (!feof(fp) && (c = fgetc(fp)) != '\n') - if (c != '\r') *p++ = c; - } - } - *p = '\0'; - } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); - l = 0; max = seq->m; - while (!feof(fp) && (c = fgetc(fp)) != '>') { - if (isalpha(c) || c == '-' || c == '.') { - if (l + 1 >= max) { - max += SEQ_BLOCK_SIZE; - seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); - } - seq->s[l++] = (unsigned char)c; - } - } - if (c == '>') ungetc(c,fp); - seq->s[l] = 0; - seq->m = max; seq->l = l; - return l; -} - -/* Error-checking open, copied from utils.c */ - -#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) - -FILE *err_xopen_core(const char *func, const char *fn, const char *mode) -{ - FILE *fp = 0; - if (strcmp(fn, "-") == 0) - return (strstr(mode, "r"))? stdin : stdout; - if ((fp = fopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); - abort(); - } - return fp; -} - /* wgsim */ enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; @@ -170,24 +97,23 @@ typedef struct { static double ERR_RATE = 0.02; static double MUT_RATE = 0.001; -static double INDEL_FRAC = 0.1; +static double INDEL_FRAC = 0.15; static double INDEL_EXTEND = 0.3; -static int IS_SOLID = 0; -static int SHOW_MM_INFO = 1; +static double MAX_N_RATIO = 0.1; -void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2) { int i, deleting = 0; mutseq_t *ret[2]; ret[0] = hap1; ret[1] = hap2; - ret[0]->l = seq->l; ret[1]->l = seq->l; - ret[0]->m = seq->m; ret[1]->m = seq->m; - ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - for (i = 0; i != seq->l; ++i) { + ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l; + ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m; + ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + for (i = 0; i != ks->seq.l; ++i) { int c; - c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]]; if (deleting) { if (drand48() < INDEL_EXTEND) { if (deleting & 1) ret[0]->s[i] |= DELETE; @@ -230,12 +156,12 @@ void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) } } } -void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) { int i; - for (i = 0; i != seq->l; ++i) { + for (i = 0; i != ks->seq.l; ++i) { int c[3]; - c[0] = nst_nt4_table[(int)seq->s[i]]; + c[0] = nst_nt4_table[(int)ks->seq.s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { @@ -248,8 +174,9 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins printf("-\t"); int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; - while(n > 0) { + while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t-\n"); @@ -266,6 +193,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t+\n"); @@ -284,46 +212,51 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } } -void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) { - seq_t seq; + kseq_t *ks; mutseq_t rseq[2]; + gzFile fp_fa; uint64_t tot_len, ii; int i, l, n_ref; - char name[256], *qstr; - int size[2], Q; + char *qstr; + int size[2], Q, max_size; uint8_t *tmp_seq[2]; mut_t *target; - INIT_SEQ(seq); - srand48(time(0)); - seq_set_block_size(0x1000000); l = size_l > size_r? size_l : size_r; qstr = (char*)calloc(l+1, 1); tmp_seq[0] = (uint8_t*)calloc(l+2, 1); tmp_seq[1] = (uint8_t*)calloc(l+2, 1); size[0] = size_l; size[1] = size_r; + max_size = size_l > size_r? size_l : size_r; Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); tot_len = n_ref = 0; - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__); + while ((l = kseq_read(ks)) >= 0) { tot_len += l; ++n_ref; } - fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len); - rewind(fp_fa); + fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len); + kseq_destroy(ks); + gzclose(fp_fa); - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + while ((l = kseq_read(ks)) >= 0) { uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); if (l < dist + 3 * std_dev) { - fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); + fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev); continue; } // generate mutations and print them out - maq_mut_diref(&seq, is_hap, rseq, rseq+1); - maq_print_mutref(name, &seq, rseq, rseq+1); + wgsim_mut_diref(ks, is_hap, rseq, rseq+1); + wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1); for (ii = 0; ii != n_pairs; ++ii) { // the core loop double ran; @@ -335,8 +268,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, ran = ran_normal(); ran = ran * std_dev + dist; d = (int)(ran + 0.5); + d = d > max_size? d : max_size; pos = (int)((l - d + 1) * drand48()); - } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); + } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l); // flip or not if (drand48() < 0.5) { @@ -353,7 +287,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; #define __gen_read(x, start, iter) do { \ - for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \ int c = target[i], mut_type = c & mutmsk; \ if (ext_coor[x] < 0) { \ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ @@ -374,33 +308,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, if (k != s[x]) ext_coor[x] = -10; \ } while (0) - if (!IS_SOLID) { - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1, --i); - for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement - } else { - int c1, c2, c; - ++s[0]; ++s[1]; // temporarily increase read length by 1 - if (is_flip) { // RR pair - __gen_read(0, pos + s[0], --i); - __gen_read(1, pos + d - 1, --i); - } else { // FF pair - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1 - s[1], ++i); - ++ext_coor[0]; ++ext_coor[1]; - } - // change to color sequence: (0,1,2,3) -> (A,C,G,T) - for (j = 0; j < 2; ++j) { - c1 = tmp_seq[j][0]; - for (i = 1; i < s[j]; ++i) { - c2 = tmp_seq[j][i]; - c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<= 4) c = 4; // actually c should be never larger than 4 if everything is correct - else if (drand48() < ERR_RATE) { - c = (c + (int)(drand48() * 3.0 + 1)) & 3; + if (c >= 4) { // actually c should be never larger than 4 if everything is correct + c = 4; + ++n_n; + } else if (drand48() < ERR_RATE) { + // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors + c = (c + 1) & 3; // recurrent sequencing errors ++n_err[j]; } tmp_seq[j][i] = c; } + if ((double)n_n / s[j] > MAX_N_RATIO) break; + } + if (j < 2) { // too many ambiguous bases on one of the reads + --ii; + continue; } // print for (j = 0; j < 2; ++j) { for (i = 0; i < s[j]; ++i) qstr[i] = Q; qstr[i] = 0; - if (SHOW_MM_INFO) { - fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], - (long long)ii, j==0? is_flip+1 : 2-is_flip); - } else { - fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - (long long)ii, j==0? is_flip+1 : 2-is_flip, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); - } + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? is_flip+1 : 2-is_flip); for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); fprintf(fpo[j], "\n+\n%s\n", qstr); @@ -439,7 +352,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, } free(rseq[0].s); free(rseq[1].s); } - free(seq.s); free(qstr); + kseq_destroy(ks); + gzclose(fp_fa); + free(qstr); free(tmp_seq[0]); free(tmp_seq[1]); } @@ -459,11 +374,9 @@ static int simu_usage() fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); - fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); - fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); + fprintf(stderr, " -S INT seed for random generator [-1]\n"); fprintf(stderr, " -h haplotype mode\n"); fprintf(stderr, "\n"); - fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); return 1; } @@ -471,11 +384,12 @@ int main(int argc, char *argv[]) { int64_t N; int dist, std_dev, c, size_l, size_r, is_hap = 0; - FILE *fpout1, *fpout2, *fp_fa; + FILE *fpout1, *fpout2; + int seed = -1; N = 1000000; dist = 500; std_dev = 50; size_l = size_r = 70; - while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) { switch (c) { case 'd': dist = atoi(optarg); break; case 's': std_dev = atoi(optarg); break; @@ -486,17 +400,20 @@ int main(int argc, char *argv[]) case 'r': MUT_RATE = atof(optarg); break; case 'R': INDEL_FRAC = atof(optarg); break; case 'X': INDEL_EXTEND = atof(optarg); break; - case 'c': IS_SOLID = 1; break; - case 'C': SHOW_MM_INFO = 0; break; + case 'S': seed = atoi(optarg); break; case 'h': is_hap = 1; break; } } if (argc - optind < 3) return simu_usage(); - fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r"); - fpout1 = xopen(argv[optind+1], "w"); - fpout2 = xopen(argv[optind+2], "w"); - wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); + fpout1 = fopen(argv[optind+1], "w"); + fpout2 = fopen(argv[optind+2], "w"); + if (!fpout1 || !fpout2) { + fprintf(stderr, "[wgsim] file open error\n"); + return 1; + } + srand48(seed > 0? seed : time(0)); + wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r); - fclose(fpout1); fclose(fpout2); fclose(fp_fa); + fclose(fpout1); fclose(fpout2); return 0; }