I want to write some c code to convert fastq to unmapped bam file. I use htslib to read and write bam file and have some problem. The bam file I generated seems not validated. I guess I missed some part but I dont know what it is. This is my toy code:
// trim_fq_to_bam.c
#include <zlib.h> // for reading compressed .fq file
#include <stdio.h>
#include <string.h>
#include <htslib/sam.h>
#include <htslib/bgzf.h>
#include <htslib/kseq.h>
#include <htslib/hts.h>
KSEQ_INIT(gzFile, gzread)
#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )
static const char init_header[] = "@HD\tVN:1.4\tSO:unknown\n";
void dump_read(bam1_t* b) {
printf("->core.tid:(%d)\n", b->core.tid);
printf("->core.pos:(%d)\n", b->core.pos);
printf("->core.bin:(%d)\n", b->core.bin);
printf("->core.qual:(%d)\n", b->core.qual);
printf("->core.l_qname:(%d)\n", b->core.l_qname);
printf("->core.flag:(%d)\n", b->core.flag);
printf("->core.n_cigar:(%d)\n", b->core.n_cigar);
printf("->core.l_qseq:(%d)\n", b->core.l_qseq);
printf("->core.mtid:(%d)\n", b->core.mtid);
printf("->core.mpos:(%d)\n", b->core.mpos);
printf("->core.isize:(%d)\n", b->core.isize);
if (b->data) {
printf("->data:");
int i;
for (i = 0; i < b->l_data; ++i) {
printf("%x ", b->data[i]);
}
printf("\n");
}
if (b->core.l_qname) {
printf("qname: %s\n",bam_get_qname(b));
}
if (b->core.l_qseq) {
printf("qseq:");
int i;
for (i = 0; i < b->core.l_qseq; ++i) {
printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(b),i)]);
}
printf("\n");
printf("qual:");
for (i = 0; i < b->core.l_qseq; ++i) {
printf("%c",bam_get_qual(b)[i]);
}
printf("\n");
}
if (bam_get_l_aux(b)) {
int i = 0;
uint8_t* aux = bam_get_aux(b);
while (i < bam_get_l_aux(b)) {
printf("%.2s:%c:",aux+i,*(aux+i+2));
i += 2;
switch (*(aux+i)) {
case 'Z':
while (*(aux+1+i) != '\0') { putc(*(aux+1+i), stdout); ++i; }
break;
}
putc('\n',stdout);
++i;++i;
}
}
printf("\n");
}
int main(int argc, char const *argv[])
{
char *fn_in = "test_data/rd1.fq.gz";
char *fn_out = "test_data/new.bam";
int l = 0;
gzFile *f_rd = gzopen(fn_in, "r");
if (!f_rd){fprintf(stderr, "cant open file: %s\n", fn_in); exit(EXIT_FAILURE);}
kseq_t *seq;
seq = kseq_init(f_rd);
BGZF *fp = bgzf_open(fn_out,"w");
// write header
bam_hdr_t *hdr = bam_hdr_init();
hdr->l_text = strlen(init_header);
hdr->text = strdup(init_header);
hdr->n_targets = 0;
bam_hdr_write(fp, hdr);
// main loop, iter through each fastq records
while ((l = kseq_read(seq)) >= 0){
printf("%s, %s\n", seq->seq.s, seq->qual.s);
bam1_t *q = bam_init1();
//`q->data` structure: qname-cigar-seq-qual-aux
q->l_data = seq->name.l+1+(int)(1.5*seq->seq.l+(seq->seq.l % 2 != 0)); // +1 includes the tailing '\0'
if (q->m_data < q->l_data) {
q->m_data = q->l_data;
kroundup32(q->m_data);
q->data = (uint8_t*)realloc(q->data, q->m_data);
}
q->core.flag = BAM_FMUNMAP;
q->core.l_qname = seq->name.l+1; // +1 includes the tailing '\0'
q->core.l_qseq = seq->seq.l;
q->core.n_cigar = 0; // we have no cigar sequence
memcpy(q->data, seq->name.s, q->core.l_qname); // first set qname
uint8_t *s = bam_get_seq(q);
for (int i = 0; i < q->core.l_qseq; ++i){
bam1_seq_seti(s, i, seq_nt16_table[seq->seq.s[i]]);
}
s = bam_get_qual(q);
for (int i = 0; i < seq->qual.l; ++i) {
s[i] = seq->qual.s[i];
}
dump_read(q);
int ret = bam_write1(fp, q);
printf("return value: %d\n", ret);
bam_destroy1(q);
}
kseq_destroy(seq); // free seq
gzclose(f_rd); // close fastq file
bgzf_close(fp); // close bam file
// validate bam file
BGZF *fp_new = bgzf_open(fn_out,"r");
bam1_t *q = bam_init1();
int ret = bam_read1(fp_new, q);
printf("%d\n", ret);
printf("%d, %d\n", (char *)bam_get_aux(q) - (char *)q->data , q->l_data);
dump_read(q);
return 0;
}
the function dump_read()
is used to print a bam record to see if I have successfully create a valid bam1_t
structure, and the code is copied from here:
https://github.com/samtools/samtools/blob/74a3e0a744f91bc2b1f65a4144a509d6dd791631/test/merge/test_bam_translate.c
with some minor modification. For printing the sequence the original code is printf("%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]);
. but I didnt get the point of jointly using seq_nt16_table[]
and seq_nt16_str[]
, since bam_seqi()
returns the 4 bit representation of a base and seq_nt16_str
convert it to str so these two would be sufficient.
The basic structure of the code is:
open fq file and bam file ->
write bam header (I am not sure if I have done it correctly) ->
iterate through each fastq reads, convert it to a bam1_t
structure ->
use dump_read()
to check if the bam1_t
is correct. ->
write bam1_t
to the bam file. check the return to see if the writing is successful.
So far everything looks fine. the dump_read()
print the same seq and quality strings as the fastq reads. bam_write1()
gives positive returns. And I generated a bam file. But when I try samtools view -h new.bam
, it gives an error [main_samview] truncated file.
Then I hacked the code and find when sam_read1() < -1
it will print this err msg. So I reopened my bam file and calls bam_read1(fp_new, q);
, the return is -4, I checked the source code to see when it will returns -4 and stuck. I don't know which part of my code is wrong.
Also I wonder if there are any toturial on how to use htslib, or any examples so I can start from there. I usually work on Python and I am not familiar with C. Now I just read the source code of htslib but it is a bit messy, with a lot of uncomment blocks.
Sorry I missed this the first time. Have you tried replacing
bgzf_open(fn_out,"w");
and the associated close() with the actual BAM (sam_open()/sam_close()) functions? Those should write the magic bits at the end, which won't be there in generic bgzf files.