UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
0
0
Entering edit mode
20 months ago

I use a Python package (ancIBD) to load and analyze vcf.gz files:

for ch in chs:
    vcf_to_1240K_hdf(in_vcf_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/GLIMPSE_ligated/merged_chr{ch}.vcf.gz",
                     path_vcf = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.vcf",
                     path_h5 = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.h5",
                     marker_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/filters/snps_bcftools_ch{ch}.csv",
                     map_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k.snp",
                     af_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k_AF_ch{ch}.tsv",
                     col_sample_af = "",
                     buffer_size=20000, chunk_width=8, chunk_length=20000,
                     ch=ch)

but I get the error below:

UnicodeDecodeError                        Traceback (most recent call last)
Cell In[14], line 6
      3 chs = range(1,23)
      5 for ch in chs:
----> 6     vcf_to_1240K_hdf(in_vcf_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/GLIMPSE_ligated/merged_chr{ch}.vcf.gz",
      7                      path_vcf = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.vcf",
      8                      path_h5 = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.h5",
      9                      marker_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/filters/snps_bcftools_ch{ch}.csv",
     10                      map_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k.snp",
     11                      af_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k_AF_ch{ch}.tsv",
     12                      col_sample_af = "",
     13                      buffer_size=20000, chunk_width=8, chunk_length=20000,
     14                      ch=ch)

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/ancIBD/IO/prepare_h5.py:116, in vcf_to_1240K_hdf(in_vcf_path, path_vcf, path_h5, marker_path, map_path, af_path, col_sample_af, chunk_length, chunk_width, buffer_size, ch)
    113     os.remove(path_h5)
    115 print("Converting to HDF5...")
--> 116 allel.vcf_to_hdf5(input=path_vcf, output=path_h5, 
    117                   fields = ['variants/*', 'calldata/*', "samples"], 
    118                   types = {"samples":"S60", "calldata/GT":np.int8,
    119                            "calldata/GP":np.float32, "calldata/PL":np.float32}, 
    120                   buffer_size=buffer_size,
    121                   chunk_length = chunk_length, chunk_width=chunk_width,
    122                   compression="gzip") # Do the conversion to hdf5. Takes hours
    123 print("Finished conversion to hdf5!")
    125 print("Merging in LD Map..")

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:693, in vcf_to_hdf5(input, output, group, compression, compression_opts, shuffle, overwrite, vlen, fields, exclude_fields, rename_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length, chunk_width, log)
    690 store_samples, fields = _prep_fields_param(fields)
    692 # setup chunk iterator
--> 693 fields, samples, headers, it = iter_vcf_chunks(
    694     input, fields=fields, exclude_fields=exclude_fields, types=types,
    695     numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
    696     chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
    697     samples=samples, transformers=transformers
    698 )
    700 # handle field renaming
    701 if rename_fields:

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1138, in iter_vcf_chunks(input, fields, exclude_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length)
   1134 stream = _setup_input_stream(input=input, region=region, tabix=tabix,
   1135                              buffer_size=buffer_size)
   1137 # setup iterator
-> 1138 fields, samples, headers, it = _iter_vcf_stream(stream, **kwds)
   1140 # setup transformers
   1141 if transformers is not None:
   1142     # API flexibility

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1636, in _iter_vcf_stream(stream, fields, exclude_fields, types, numbers, alt_number, chunk_length, fills, region, samples)
   1632 def _iter_vcf_stream(stream, fields, exclude_fields, types, numbers, alt_number,
   1633                      chunk_length, fills, region, samples):
   1634 
   1635     # read VCF headers
-> 1636     headers = _read_vcf_headers(stream)
   1638     # setup samples
   1639     samples, loc_samples = _normalize_samples(samples=samples, headers=headers,
   1640                                               types=types)

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1711, in _read_vcf_headers(stream)
   1709 # read first header line
   1710 header = stream.readline()
-> 1711 header = str(header, 'utf8')
   1713 while header and header[0] == '#':
   1715     headers.append(header)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

How can I solve this problem? Thanks a lot!

vcf.gz UnicodeDecodeError ancIBD • 1.6k views
ADD COMMENT
0
Entering edit mode

use python package to load and analysis vcf.gz files

It looks like it doesn't like the gzipped file. Are you sure this software can read gzipped files?

ADD REPLY
0
Entering edit mode

I tried an unzipped file; it shows the same error.

ADD REPLY

Login before adding your answer.

Traffic: 2625 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6