3 #include "sam_header.h"
11 KHASH_MAP_INIT_STR(str, const char *)
15 struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
16 struct _HeaderList *next;
19 typedef struct _HeaderList list_t;
20 typedef list_t HeaderDict;
36 const char *o_hd_tags[] = {"SO","GO",NULL};
37 const char *r_hd_tags[] = {"VN",NULL};
39 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
40 const char *r_sq_tags[] = {"SN","LN",NULL};
41 const char *u_sq_tags[] = {"SN",NULL};
43 const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
44 const char *r_rg_tags[] = {"ID",NULL};
45 const char *u_rg_tags[] = {"ID",NULL};
47 const char *o_pg_tags[] = {"VN","CL",NULL};
48 const char *r_pg_tags[] = {"ID",NULL};
50 const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
51 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
52 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
53 const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
56 static void debug(const char *format, ...)
60 vfprintf(pysamerr, format, ap);
65 // Replaced by list_append_to_end
66 static list_t *list_prepend(list_t *root, void *data)
68 list_t *l = malloc(sizeof(list_t));
75 // Relies on the root->last being correct. Do not use with the other list_*
76 // routines unless they are fixed to modify root->last as well.
77 static list_t *list_append_to_end(list_t *root, void *data)
79 list_t *l = malloc(sizeof(list_t));
92 static list_t *list_append(list_t *root, void *data)
99 l->next = malloc(sizeof(list_t));
104 l = malloc(sizeof(list_t));
112 static void list_free(list_t *root)
125 // Look for a tag "XY" in a predefined const char *[] array.
126 static int tag_exists(const char *tag, const char **tags)
129 if ( !tags ) return -1;
132 if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag;
140 // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
141 // or NULL if everything has been read. The lineptr should be freed by the caller. The
142 // newline character is stripped.
143 static const char *nextline(char **lineptr, size_t *n, const char *text)
146 const char *to = text;
148 if ( !*to ) return NULL;
150 while ( *to && *to!='\n' && *to!='\r' ) to++;
155 // Advance the pointer for the next call
156 if ( *to=='\n' ) to++;
157 else if ( *to=='\r' && *(to+1)=='\n' ) to+=2;
164 *lineptr = malloc(len);
169 *lineptr = realloc(*lineptr, len);
173 debug("[nextline] Insufficient memory!\n");
177 memcpy(*lineptr,text,len);
178 (*lineptr)[len-1] = 0;
183 // name points to "XY", value_from points to the first character of the value string and
184 // value_to points to the last character of the value string.
185 static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
187 HeaderTag *tag = malloc(sizeof(HeaderTag));
188 int len = value_to-value_from+1;
190 tag->key[0] = name[0];
191 tag->key[1] = name[1];
192 tag->value = malloc(len+1);
193 memcpy(tag->value,value_from,len+1);
198 static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
200 list_t *tags = hline->tags;
203 HeaderTag *tag = tags->data;
204 if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag;
212 // 0 .. different types or unique tags differ or conflicting tags, cannot be merged
213 // 1 .. all tags identical -> no need to merge, drop one
214 // 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
215 // 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
216 static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
220 if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
223 int itype = tag_exists(hline1->type,types);
225 debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
226 return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
229 if ( unique_tags[itype] )
231 t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
232 t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
233 if ( !t1 || !t2 ) // this should never happen, the unique tags are required
236 if ( strcmp(t1->value,t2->value) )
237 return 0; // the unique tags differ, cannot be merged
239 if ( !required_tags[itype] && !optional_tags[itype] )
241 t1 = hline1->tags->data;
242 t2 = hline2->tags->data;
243 if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
247 int missing=0, itag=0;
248 while ( required_tags[itype] && required_tags[itype][itag] )
250 t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
251 t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
253 return 2; // this should never happen
254 else if ( !t1 || !t2 )
255 missing = 1; // there is some tag missing in one of the hlines
256 else if ( strcmp(t1->value,t2->value) )
258 if ( unique_tags[itype] )
259 return 2; // the lines have a matching unique tag but have a conflicting tag
261 return 0; // the lines contain conflicting tags, cannot be merged
266 while ( optional_tags[itype] && optional_tags[itype][itag] )
268 t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
269 t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
276 missing = 1; // there is some tag missing in one of the hlines
277 else if ( strcmp(t1->value,t2->value) )
279 if ( unique_tags[itype] )
280 return 2; // the lines have a matching unique tag but have a conflicting tag
282 return 0; // the lines contain conflicting tags, cannot be merged
286 if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged
291 static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
294 HeaderLine *out = malloc(sizeof(HeaderLine));
295 out->type[0] = hline->type[0];
296 out->type[1] = hline->type[1];
302 HeaderTag *old = tags->data;
304 HeaderTag *new = malloc(sizeof(HeaderTag));
305 new->key[0] = old->key[0];
306 new->key[1] = old->key[1];
307 new->value = strdup(old->value);
308 out->tags = list_append(out->tags, new);
315 static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
319 if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
322 tmpl_tags = tmpl_hline->tags;
325 HeaderTag *tmpl_tag = tmpl_tags->data;
326 HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key);
329 HeaderTag *tag = malloc(sizeof(HeaderTag));
330 tag->key[0] = tmpl_tag->key[0];
331 tag->key[1] = tmpl_tag->key[1];
332 tag->value = strdup(tmpl_tag->value);
333 out_hline->tags = list_append(out_hline->tags,tag);
335 tmpl_tags = tmpl_tags->next;
341 static HeaderLine *sam_header_line_parse(const char *headerLine)
345 const char *from, *to;
348 if ( *from != '@' ) {
349 debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
354 while (*to && *to!='\t') to++;
355 if ( to-from != 2 ) {
356 debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
360 hline = malloc(sizeof(HeaderLine));
361 hline->type[0] = from[0];
362 hline->type[1] = from[1];
365 int itype = tag_exists(hline->type, types);
368 while (*to && *to=='\t') to++;
369 if ( to-from != 1 ) {
370 debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
376 while (*to && *to!='\t') to++;
378 if ( !required_tags[itype] && !optional_tags[itype] )
380 // CO is a special case, it can contain anything, including tabs
381 if ( *to ) { to++; continue; }
382 tag = new_tag(" ",from,to-1);
385 tag = new_tag(from,from+3,to-1);
387 if ( header_line_has_tag(hline,tag->key) )
388 debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
389 hline->tags = list_append(hline->tags, tag);
392 while (*to && *to=='\t') to++;
393 if ( *to && to-from != 1 ) {
394 debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
404 // Must be of an existing type, all tags must be recognised and all required tags must be present
405 static int sam_header_line_validate(HeaderLine *hline)
411 // Is the type correct?
412 itype = tag_exists(hline->type, types);
415 debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
419 // Has all required tags?
421 while ( required_tags[itype] && required_tags[itype][itag] )
423 if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
425 debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
426 hline->type[0],hline->type[1]);
432 // Are all tags recognised?
437 if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
439 debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
449 static void print_header_line(FILE *fp, HeaderLine *hline)
451 list_t *tags = hline->tags;
454 fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
460 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
461 fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
462 fprintf(fp, "%s", tag->value);
470 static void sam_header_line_free(HeaderLine *hline)
472 list_t *tags = hline->tags;
475 HeaderTag *tag = tags->data;
480 list_free(hline->tags);
484 void sam_header_free(void *_header)
486 HeaderDict *header = (HeaderDict*)_header;
487 list_t *hlines = header;
490 sam_header_line_free(hlines->data);
491 hlines = hlines->next;
496 HeaderDict *sam_header_clone(const HeaderDict *dict)
498 HeaderDict *out = NULL;
501 HeaderLine *hline = dict->data;
502 out = list_append(out, sam_header_line_clone(hline));
508 // Returns a newly allocated string
509 char *sam_header_write(const void *_header)
511 const HeaderDict *header = (const HeaderDict*)_header;
514 const list_t *hlines;
516 // Calculate the length of the string to allocate
520 len += 4; // @XY and \n
522 HeaderLine *hline = hlines->data;
523 list_t *tags = hline->tags;
526 HeaderTag *tag = tags->data;
527 len += strlen(tag->value) + 1; // \t
528 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
529 len += strlen(tag->value) + 3; // XY:
532 hlines = hlines->next;
540 HeaderLine *hline = hlines->data;
542 nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
544 list_t *tags = hline->tags;
547 HeaderTag *tag = tags->data;
548 nout += sprintf(out+nout,"\t");
549 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
550 nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
551 nout += sprintf(out+nout,"%s", tag->value);
554 hlines = hlines->next;
555 nout += sprintf(out+nout,"\n");
561 void *sam_header_parse2(const char *headerText)
563 list_t *hlines = NULL;
574 while ( (text=nextline(&buf, &nbuf, text)) )
576 hline = sam_header_line_parse(buf);
577 if ( hline && (!tovalidate || sam_header_line_validate(hline)) )
578 // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
579 hlines = list_append_to_end(hlines, hline);
582 if (hline) sam_header_line_free(hline);
583 sam_header_free(hlines);
584 if ( buf ) free(buf);
588 if ( buf ) free(buf);
593 void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
595 const HeaderDict *dict = (const HeaderDict*)_dict;
596 const list_t *l = dict;
597 khash_t(str) *tbl = kh_init(str);
601 if (_dict == 0) return tbl; // return an empty (not null) hash table
604 HeaderLine *hline = l->data;
605 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
611 HeaderTag *key, *value;
612 key = header_line_has_tag(hline,key_tag);
613 value = header_line_has_tag(hline,value_tag);
614 if ( !key || !value )
620 k = kh_get(str, tbl, key->value);
621 if ( k != kh_end(tbl) )
622 debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
623 k = kh_put(str, tbl, key->value, &ret);
624 kh_value(tbl, k) = value->value;
631 char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
633 const HeaderDict *dict = (const HeaderDict*)_dict;
634 const list_t *l = dict;
638 ret = 0; *_n = max = n = 0;
641 HeaderLine *hline = l->data;
642 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
649 key = header_line_has_tag(hline,key_tag);
657 max = max? max<<1 : 4;
658 ret = realloc(ret, max * sizeof(void*));
660 ret[n++] = key->value;
668 const char *sam_tbl_get(void *h, const char *key)
670 khash_t(str) *tbl = (khash_t(str)*)h;
672 k = kh_get(str, tbl, key);
673 return k == kh_end(tbl)? 0 : kh_val(tbl, k);
676 int sam_tbl_size(void *h)
678 khash_t(str) *tbl = (khash_t(str)*)h;
679 return h? kh_size(tbl) : 0;
682 void sam_tbl_destroy(void *h)
684 khash_t(str) *tbl = (khash_t(str)*)h;
685 kh_destroy(str, tbl);
688 void *sam_header_merge(int n, const void **_dicts)
690 const HeaderDict **dicts = (const HeaderDict**)_dicts;
691 HeaderDict *out_dict;
694 if ( n<2 ) return NULL;
696 out_dict = sam_header_clone(dicts[0]);
698 for (idict=1; idict<n; idict++)
700 const list_t *tmpl_hlines = dicts[idict];
702 while ( tmpl_hlines )
704 list_t *out_hlines = out_dict;
708 status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
711 out_hlines = out_hlines->next;
717 print_header_line(pysamerr,tmpl_hlines->data);
718 print_header_line(pysamerr,out_hlines->data);
719 debug("Conflicting lines, cannot merge the headers.\n");
723 sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
729 out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
731 tmpl_hlines = tmpl_hlines->next;