X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bam_sort.c;h=7aeccff3e1e8e61507c4983ce535de884d6bbcae;hp=76ab793196ca76fc5a63d619e230cf0fb5f59f1e;hb=ced7709f121a00d5049d99ee8576037994aab1d1;hpb=62781a2daa24d74a3c590e2669fad1fa7cabf933 diff --git a/bam_sort.c b/bam_sort.c index 76ab793..7aeccff 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -52,6 +52,14 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) KSORT_INIT(heap, heap1_t, heap_lt) +static void swap_header_targets(bam_header_t *h1, bam_header_t *h2) +{ + bam_header_t t; + t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets; + t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name; + t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len; +} + static void swap_header_text(bam_header_t *h1, bam_header_t *h2) { int tempi; @@ -130,42 +138,51 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch return -1; } hin = bam_header_read(fp[i]); - if (i == 0) { // the first SAM + if (i == 0) { // the first BAM hout = hin; - if (hheaders) { - // If the text headers to be swapped in include any @SQ headers, - // check that they are consistent with the existing binary list - // of reference information. - if (hheaders->n_targets > 0) { - if (hout->n_targets != hheaders->n_targets) { - fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers); - if (!reg) return -1; - } - for (j = 0; j < hout->n_targets; ++j) - if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { - fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers); - if (!reg) return -1; - } - } - swap_header_text(hout, hheaders); - bam_header_destroy(hheaders); - hheaders = NULL; - } } else { // validate multiple baf - if (hout->n_targets != hin->n_targets) { - fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Continue anyway!\n", fn[i]); - } else { - for (j = 0; j < hout->n_targets; ++j) { - if (strcmp(hout->target_name[j], hin->target_name[j])) { - fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Continue anyway!\n", - hout->target_name[j], hin->target_name[j], fn[i]); - } + int min_n_targets = hout->n_targets; + if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; + + for (j = 0; j < min_n_targets; ++j) + if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", + hout->target_name[j], hin->target_name[j], fn[i]); + return -1; } + + // If this input file has additional target reference sequences, + // add them to the headers to be output + if (hin->n_targets > hout->n_targets) { + swap_header_targets(hout, hin); + // FIXME Possibly we should also create @SQ text headers + // for the newly added reference sequences } + bam_header_destroy(hin); } } + if (hheaders) { + // If the text headers to be swapped in include any @SQ headers, + // check that they are consistent with the existing binary list + // of reference information. + if (hheaders->n_targets > 0) { + if (hout->n_targets != hheaders->n_targets) { + fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); + if (!reg) return -1; + } + for (j = 0; j < hout->n_targets; ++j) + if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); + if (!reg) return -1; + } + } + + swap_header_text(hout, hheaders); + bam_header_destroy(hheaders); + } + if (reg) { int tid, beg, end; if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { @@ -205,8 +222,11 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; - if ((flag & MERGE_RG) && bam_aux_get(b, "RG") == 0) + if (flag & MERGE_RG) { + uint8_t *rg = bam_aux_get(b, "RG"); + if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); + } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); @@ -278,14 +298,19 @@ KSORT_INIT(sort, bam1_p, bam1_lt) static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) { - char *name; + char *name, mode[3]; int i; bamFile fp; ks_mergesort(sort, k, buf, 0); name = (char*)calloc(strlen(prefix) + 20, 1); - if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); - else sprintf(name, "%s.bam", prefix); - fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w"); + if (n >= 0) { + sprintf(name, "%s.%.4d.bam", prefix, n); + strcpy(mode, "w1"); + } else { + sprintf(name, "%s.bam", prefix); + strcpy(mode, "w"); + } + fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode); if (fp == 0) { fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); free(name);