X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bam.c;h=0055e8427667dd95151ea1a1bef5dd14e74e15f1;hp=ee7642b3434248c9b44ef918435f150f101cae00;hb=aa08abe5f0b84ee0dd3491f00fe357d661c08e0c;hpb=317f5e8dd22cc9e1e5e05fbcaeb3b9aca7447351 diff --git a/bam.c b/bam.c index ee7642b..0055e84 100644 --- a/bam.c +++ b/bam.c @@ -7,7 +7,7 @@ #include "kstring.h" #include "sam_header.h" -int bam_is_be = 0; +int bam_is_be = 0, bam_verbose = 2; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; /************************** @@ -32,7 +32,7 @@ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) int32_t l = 0; for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) l += cigar[k] >> BAM_CIGAR_SHIFT; } return l; @@ -70,6 +70,7 @@ bam_header_t *bam_header_read(bamFile fp) { bam_header_t *header; char buf[4]; + int magic_len; int32_t i = 1, name_len; // check EOF i = bgzf_check_EOF(fp); @@ -78,11 +79,11 @@ bam_header_t *bam_header_read(bamFile fp) // with ESPIPE. Suppress the error message in this case. if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF"); } - else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n"); + else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n"); // read "BAM1" - if (bam_read(fp, buf, 4) != 4) return 0; - if (strncmp(buf, "BAM\001", 4)) { - fprintf(stderr, "[bam_header_read] wrong header\n"); + magic_len = bam_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { + fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); return 0; } header = bam_header_init(); @@ -140,6 +141,7 @@ int bam_header_write(bamFile fp, const bam_header_t *header) bam_write(fp, &x, 4); } else bam_write(fp, &header->target_len[i], 4); } + bgzf_flush(fp); return 0; } @@ -158,6 +160,19 @@ static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + else if (type == 'B') { + int32_t n, Bsize = bam_aux_type2size(*s); + memcpy(&n, s + 1, 4); + if (1 == Bsize) { + } else if (2 == Bsize) { + for (i = 0; i < n; i += 2) + bam_swap_endian_2p(s + 5 + i); + } else if (4 == Bsize) { + for (i = 0; i < n; i += 4) + bam_swap_endian_4p(s + 5 + i); + } + bam_swap_endian_4p(s+1); + } } } @@ -207,6 +222,7 @@ inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8 x[5] = c->mtid; x[6] = c->mpos; x[7] = c->isize; + bgzf_flush_try(fp, 4 + block_len); if (bam_is_be) { for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); y = block_len; @@ -232,8 +248,8 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kstring_t str; str.l = str.m = 0; str.s = 0; - ksprintf(&str, "%s\t", bam1_qname(b)); - if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag); + kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str); + if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); } else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); else { // BAM_OFSTR for (i = 0; i < 16; ++i) @@ -241,41 +257,68 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kputc(bam_flag2char_table[i], &str); kputc('\t', &str); } - if (c->tid < 0) kputs("*\t", &str); - else ksprintf(&str, "%s\t", header->target_name[c->tid]); - ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); + if (c->tid < 0) kputsn("*\t", 2, &str); + else { + if (header) kputs(header->target_name[c->tid] , &str); + else kputw(c->tid, &str); + kputc('\t', &str); + } + kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { - for (i = 0; i < c->n_cigar; ++i) - ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); + for (i = 0; i < c->n_cigar; ++i) { + kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); + kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + } } kputc('\t', &str); - if (c->mtid < 0) kputs("*\t", &str); - else if (c->mtid == c->tid) kputs("=\t", &str); - else ksprintf(&str, "%s\t", header->target_name[c->mtid]); - ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); + if (c->mtid < 0) kputsn("*\t", 2, &str); + else if (c->mtid == c->tid) kputsn("=\t", 2, &str); + else { + if (header) kputs(header->target_name[c->mtid], &str); + else kputw(c->mtid, &str); + kputc('\t', &str); + } + kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); kputc('\t', &str); if (t[0] == 0xff) kputc('*', &str); else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); - } else ksprintf(&str, "*\t*"); + } else kputsn("*\t*", 3, &str); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; - ksprintf(&str, "\t%c%c:", key[0], key[1]); - if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } - else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } - else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; } - else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } - else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } - else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } - else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } + kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); + if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } + else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } + else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } + else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; } + else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } + else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } + else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } - else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'B') { + uint8_t sub_type = *(s++); + int32_t n; + memcpy(&n, s, 4); + s += 4; // no point to the start of the array + kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing + for (i = 0; i < n; ++i) { + kputc(',', &str); + if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } + else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } + else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } + else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } + else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } + else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; } + else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } + } + } } return str.s; } @@ -288,10 +331,26 @@ char *bam_format1(const bam_header_t *header, const bam1_t *b) void bam_view1(const bam_header_t *header, const bam1_t *b) { char *s = bam_format1(header, b); - printf("%s\n", s); + puts(s); free(s); } +int bam_validate1(const bam_header_t *header, const bam1_t *b) +{ + char *s; + + if (b->core.tid < -1 || b->core.mtid < -1) return 0; + if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; + + if (b->data_len < b->core.l_qname) return 0; + s = memchr(bam1_qname(b), '\0', b->core.l_qname); + if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0; + + // FIXME: Other fields could also be checked, especially the auxiliary data + + return 1; +} + // FIXME: we should also check the LB tag associated with each alignment const char *bam_get_library(bam_header_t *h, const bam1_t *b) {