Entering edit mode
20 months ago
wangjincheng
•
0
I am using a Python package to load and analyze vcf.gz files:
for ch in chs:
vcf_to_1240K_hdf(in_vcf_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/GLIMPSE_ligated/merged_chr{ch}.vcf.gz",
path_vcf = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.vcf",
path_h5 = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.h5",
marker_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/filters/snps_bcftools_ch{ch}.csv",
map_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k.snp",
af_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k_AF_ch{ch}.tsv",
col_sample_af = "",
buffer_size=20000, chunk_width=8, chunk_length=20000,
ch=ch)
but I get the error below:
UnicodeDecodeError Traceback (most recent call last)
Cell In[14], line 6
3 chs = range(1,23)
5 for ch in chs:
----> 6 vcf_to_1240K_hdf(in_vcf_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/GLIMPSE_ligated/merged_chr{ch}.vcf.gz",
7 path_vcf = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.vcf",
8 path_h5 = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.h5",
9 marker_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/filters/snps_bcftools_ch{ch}.csv",
10 map_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k.snp",
11 af_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k_AF_ch{ch}.tsv",
12 col_sample_af = "",
13 buffer_size=20000, chunk_width=8, chunk_length=20000,
14 ch=ch)
File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/ancIBD/IO/prepare_h5.py:116, in vcf_to_1240K_hdf(in_vcf_path, path_vcf, path_h5, marker_path, map_path, af_path, col_sample_af, chunk_length, chunk_width, buffer_size, ch)
113 os.remove(path_h5)
115 print("Converting to HDF5...")
--> 116 allel.vcf_to_hdf5(input=path_vcf, output=path_h5,
117 fields = ['variants/*', 'calldata/*', "samples"],
118 types = {"samples":"S60", "calldata/GT":np.int8,
119 "calldata/GP":np.float32, "calldata/PL":np.float32},
120 buffer_size=buffer_size,
121 chunk_length = chunk_length, chunk_width=chunk_width,
122 compression="gzip") # Do the conversion to hdf5. Takes hours
123 print("Finished conversion to hdf5!")
125 print("Merging in LD Map..")
File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:693, in vcf_to_hdf5(input, output, group, compression, compression_opts, shuffle, overwrite, vlen, fields, exclude_fields, rename_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length, chunk_width, log)
690 store_samples, fields = _prep_fields_param(fields)
692 # setup chunk iterator
--> 693 fields, samples, headers, it = iter_vcf_chunks(
694 input, fields=fields, exclude_fields=exclude_fields, types=types,
695 numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
696 chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
697 samples=samples, transformers=transformers
698 )
700 # handle field renaming
701 if rename_fields:
File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1138, in iter_vcf_chunks(input, fields, exclude_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length)
1134 stream = _setup_input_stream(input=input, region=region, tabix=tabix,
1135 buffer_size=buffer_size)
1137 # setup iterator
-> 1138 fields, samples, headers, it = _iter_vcf_stream(stream, **kwds)
1140 # setup transformers
1141 if transformers is not None:
1142 # API flexibility
File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1636, in _iter_vcf_stream(stream, fields, exclude_fields, types, numbers, alt_number, chunk_length, fills, region, samples)
1632 def _iter_vcf_stream(stream, fields, exclude_fields, types, numbers, alt_number,
1633 chunk_length, fills, region, samples):
1634
1635 # read VCF headers
-> 1636 headers = _read_vcf_headers(stream)
1638 # setup samples
1639 samples, loc_samples = _normalize_samples(samples=samples, headers=headers,
1640 types=types)
File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1711, in _read_vcf_headers(stream)
1709 # read first header line
1710 header = stream.readline()
-> 1711 header = str(header, 'utf8')
1713 while header and header[0] == '#':
1715 headers.append(header)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
How can I solve this problem? Thanks a lot!
It looks like it doesn't like the gzipped file — byte 0x8b is the second byte of the gzip magic number, which suggests the reader is seeing compressed data it isn't decompressing. Are you sure this software can read gzipped files?
I tried an unzipped file; it shows the same error.