X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bam_sort.c;h=12b1b5455280e8d8a788a82929844083a4efd084;hp=402792aea44952a5962e340e9a0efbad75654494;hb=049705238f6c2d4cdd79022e7bcf2cc7d83ba9c7;hpb=b27e00385f41769d03a8cca4dbd71275fc9fa906 diff --git a/bam_sort.c b/bam_sort.c index 402792a..12b1b54 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -33,47 +33,114 @@ static inline int strnum_cmp(const char *a, const char *b) typedef struct { int i; - uint64_t pos; + uint64_t pos, idx; bam1_t *b; } heap1_t; +#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx)))) + static inline int heap_lt(const heap1_t a, const heap1_t b) { if (g_is_by_qname) { - int t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); - return (t > 0 || (t == 0 && a.pos > b.pos)); - } else return (a.pos > b.pos); + int t; + if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0; + t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); + return (t > 0 || (t == 0 && __pos_cmp(a, b))); + } else return __pos_cmp(a, b); } KSORT_INIT(heap, heap1_t, heap_lt) +static void swap_header_text(bam_header_t *h1, bam_header_t *h2) +{ + int tempi; + char *temps; + tempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi; + temps = h1->text, h1->text = h2->text, h2->text = temps; +} + /*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name + @param headers name of SAM file from which to copy '@' header lines, + or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ -void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) +void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int add_RG) { bamFile fpout, *fp; heap1_t *heap; bam_header_t *hout = 0; - int i, j; + bam_header_t *hheaders = NULL; + int i, j, *RG_len = 0; + uint64_t idx = 0; + char **RG = 0; + + if (headers) { + tamFile fpheaders = sam_open(headers); + if (fpheaders == 0) { + fprintf(stderr, "[bam_merge_core] Cannot open file `%s'. Continue anyway.\n", headers); + } else { + hheaders = sam_header_read(fpheaders); + sam_close(fpheaders); + } + } g_is_by_qname = by_qname; fp = (bamFile*)calloc(n, sizeof(bamFile)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + // prepare RG tag + if (add_RG) { + RG = (char**)calloc(n, sizeof(void*)); + RG_len = (int*)calloc(n, sizeof(int)); + for (i = 0; i != n; ++i) { + int l = strlen(fn[i]); + const char *s = fn[i]; + if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; + for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; + ++j; l -= j; + RG[i] = calloc(l + 1, 1); + RG_len[i] = l; + strncpy(RG[i], s + j, l); + } + } + // read the first for (i = 0; i != n; ++i) { heap1_t *h; bam_header_t *hin; - assert(fp[i] = bam_open(fn[i], "r")); + fp[i] = bam_open(fn[i], "r"); + if (fp[i] == 0) { + int j; + fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); + for (j = 0; j < i; ++j) bam_close(fp[j]); + free(fp); free(heap); + // FIXME: possible memory leak + return; + } hin = bam_header_read(fp[i]); - if (i == 0) hout = hin; - else { // validate multiple baf + if (i == 0) { // the first SAM + hout = hin; + if (hheaders) { + // If the text headers to be swapped in include any @SQ headers, + // check that they are consistent with the existing binary list + // of reference information. + if (hheaders->n_targets > 0) { + if (hout->n_targets != hheaders->n_targets) + fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers); + for (j = 0; j < hout->n_targets; ++j) + if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) + fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers); + } + swap_header_text(hout, hheaders); + bam_header_destroy(hheaders); + hheaders = NULL; + } + } else { // validate multiple baf if (hout->n_targets != hin->n_targets) { fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]); exit(1); @@ -84,17 +151,16 @@ void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) hout->target_name[j], hin->target_name[j], fn[i]); exit(1); } - if (hout->target_len[j] != hin->target_len[j]) - fprintf(stderr, "[bam_merge_core] different target sequence length: %d != %d in file '%s'. Continue.\n", - hout->target_len[j], hin->target_len[j], fn[i]); } bam_header_destroy(hin); } h = heap + i; h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); - if (bam_read1(fp[i], h->b) >= 0) + if (bam_read1(fp[i], h->b) >= 0) { h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); + h->idx = idx++; + } else h->pos = HEAP_EMPTY; } fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); @@ -105,35 +171,53 @@ void bam_merge_core(int by_qname, const char *out, int n, char * const *fn) ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; + if (add_RG && bam_aux_get(b, "RG") == 0) + bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); bam_write1_core(fpout, &b->core, b->data_len, b->data); - if ((j = bam_read1(fp[heap->i], b)) >= 0) + if ((j = bam_read1(fp[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); - else if (j == -1) heap->pos = HEAP_EMPTY; - else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); + heap->idx = idx++; + } else if (j == -1) { + heap->pos = HEAP_EMPTY; + free(heap->b->data); free(heap->b); + heap->b = 0; + } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } - for (i = 0; i != n; ++i) { - bam_close(fp[i]); - free(heap[i].b->data); - free(heap[i].b); + if (add_RG) { + for (i = 0; i != n; ++i) free(RG[i]); + free(RG); free(RG_len); } + for (i = 0; i != n; ++i) bam_close(fp[i]); bam_close(fpout); free(fp); free(heap); } int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0; - while ((c = getopt(argc, argv, "n")) >= 0) { + int c, is_by_qname = 0, add_RG = 0; + char *fn_headers = NULL; + + while ((c = getopt(argc, argv, "h:nr")) >= 0) { switch (c) { + case 'r': add_RG = 1; break; + case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; } } if (optind + 2 >= argc) { - fprintf(stderr, "Usage: samtools merge [-n] [...]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] [...]\n\n"); + fprintf(stderr, "Options: -n sort by read names\n"); + fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); + fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); + fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); + fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n"); + fprintf(stderr, " the header dictionary in merging.\n\n"); return 1; } - bam_merge_core(is_by_qname, argv[optind], argc - optind - 1, argv + optind + 1); + bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, add_RG); + free(fn_headers); return 0; } @@ -148,7 +232,7 @@ static inline int bam1_lt(const bam1_p a, const bam1_p b) } KSORT_INIT(sort, bam1_p, bam1_lt) -static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h) +static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) { char *name; int i; @@ -157,7 +241,13 @@ static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam name = (char*)calloc(strlen(prefix) + 20, 1); if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); else sprintf(name, "%s.bam", prefix); - assert(fp = bam_open(name, "w")); + fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w"); + if (fp == 0) { + fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); + free(name); + // FIXME: possible memory leak + return; + } free(name); bam_header_write(fp, h); for (i = 0; i < k; ++i) @@ -179,7 +269,7 @@ static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ -void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) +void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) { int n, ret, k, i; size_t mem; @@ -190,7 +280,10 @@ void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t m g_is_by_qname = is_by_qname; n = k = 0; mem = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); - assert(fp); + if (fp == 0) { + fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); + return; + } header = bam_header_read(fp); buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); // write sub files @@ -201,25 +294,26 @@ void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t m mem += ret; ++k; if (mem >= max_mem) { - sort_blocks(n++, k, buf, prefix, header); + sort_blocks(n++, k, buf, prefix, header, 0); mem = 0; k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); - if (n == 0) sort_blocks(-1, k, buf, prefix, header); + if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); else { // then merge char **fns, *fnout; fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); - sort_blocks(n++, k, buf, prefix, header); + sort_blocks(n++, k, buf, prefix, header, 0); fnout = (char*)calloc(strlen(prefix) + 20, 1); - sprintf(fnout, "%s.bam", prefix); + if (is_stdout) sprintf(fnout, "-"); + else sprintf(fnout, "%s.bam", prefix); fns = (char**)calloc(n, sizeof(char*)); for (i = 0; i < n; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } - bam_merge_core(is_by_qname, fnout, n, fns); + bam_merge_core(is_by_qname, fnout, 0, n, fns, 0); free(fnout); for (i = 0; i < n; ++i) { unlink(fns[i]); @@ -238,20 +332,26 @@ void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t m bam_close(fp); } +void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) +{ + bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0); +} + int bam_sort(int argc, char *argv[]) { size_t max_mem = 500000000; - int c, is_by_qname = 0; - while ((c = getopt(argc, argv, "nm:")) >= 0) { + int c, is_by_qname = 0, is_stdout = 0; + while ((c = getopt(argc, argv, "nom:")) >= 0) { switch (c) { + case 'o': is_stdout = 1; break; case 'n': is_by_qname = 1; break; case 'm': max_mem = atol(optarg); break; } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: samtools sort [-n] [-m ] \n"); + fprintf(stderr, "Usage: samtools sort [-on] [-m ] \n"); return 1; } - bam_sort_core(is_by_qname, argv[optind], argv[optind+1], max_mem); + bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout); return 0; }