I lifted some 23andme files from hg19 to hg38 using the following workflow in R calling samtools,plink and liftOver:
library(tidyverse)
#set working directory to data directory
trio_wd <- str_glue(here::here(),'/trio/K/')
#create file list for raw data
file_list <- str_c(trio_wd,dir(trio_wd)) %>% str_extract('genome.+\\d.txt') %>% str_extract('^(?:(?!admix).)+$') %>%
unique() %>%
{.[!is.na(.)]} %>% str_c(trio_wd,.)
#liftover loop
for (x in file_list){
#extract sample name
name = str_extract(x,'(?<=genome_)(.*?)(?=(_v5|_Full))') %>% str_replace('_','')
#recode to vcf
call = (str_glue("plink --23file {x} {name} --snps-only just-acgt --recode vcf --out {trio_wd}{name}"))
system(call)
#recode to uscf bed format
#https://www.researchgate.net/post/How_can_I_create_a_UCSC_BED_file_from_plink_data
call = str_c("grep -v '^#' ", trio_wd,name,".vcf | awk -F '\t' '{print $1,($2-1),$2,$3}' | sed -e 's/^/chr/' > ", trio_wd,name,".ucsc.bed ")
system(call)
#iftover
#correct chain file : https://groups.google.com/a/soe.ucsc.edu/g/genome/c/BYVXWMpOEjc
system(str_glue("liftOver {trio_wd}{name}.ucsc.bed /reference/hg19ToHg38.over.chain.gz {trio_wd}{name}_liftOver.bed {trio_wd}{name}unlifted.bed"))
#update positions
#read in the original vcf file
vcf <- read_table(str_glue( "{trio_wd}{name}.vcf"), skip = 30)
#read in the liftOver output
lift <- read_table(str_glue("{trio_wd}{name}_liftOver.bed"),col_names = F)
#update the positions of the original vcf file to the lifted positions
vcf %>% filter(ID %in% lift$X4) %>% mutate(POS = lift$X3,
`#CHROM` = lift$X1) %>% write_tsv(str_glue( "{trio_wd}{name}_lift.vcf"), col_names = T)
#recode to plink
call = str_glue("plink -vcf {trio_wd}{name}_lift.vcf --make-bed --aec --chr 1-22 -out {trio_wd}{name}_lift")
system(call)
}
#create list of lifted files
file_list <- list.files(trio_wd) %>% str_extract('(.+((k|K)|P))_lift') %>%
unique %>% {.[!is.na(.)]} %>% str_c(trio_wd,.)
#main loop for data cleaning
for (x in file_list){
name = str_extract(x,'[A-Za-z]+(?=((k|K)|P)).+')
call = str_glue("plink2 --bfile {trio_wd}{name} --set-all-var-ids '@_#' --make-bed -out {trio_wd}{name}_fixvar")
system(call)
call = str_glue("plink2 --bfile {trio_wd}{name}_fixvar --rm-dup --make-bed -out {trio_wd}{name}_dup")
system(call)
#a few (<10 SNPs): 'duplicate IDs with inconsistent genotype data or variant information', remove if there
if(file.exists(str_glue("{trio_wd}{name}_dup.rmdup.mismatch"))){
call = str_glue("plink -bfile {trio_wd}{name}_fixvar --exclude {trio_wd}{name}_dup.rmdup.mismatch --make-bed -out {trio_wd}{name}_dup_mis")
system(call)
call = str_glue("plink2 --bfile {trio_wd}{name}_dup_mis --set-all-var-ids '@_#' --make-bed -out {trio_wd}{name}_dup_mis_id")
system(call)
call = str_glue("plink2 --bfile {trio_wd}{name}_dup_mis_id --rm-dup --make-bed -out {trio_wd}{name}_dup")
system(call)
}
#ensure that there are no positions with same allele coding
call = str_c("awk '$5==$6 {print $2}' " ,trio_wd, name, "_dup.bim >", trio_wd, name, "_dup_noalle.txt")
system(call)
call = str_glue("plink -bfile {trio_wd}{name}_dup --exclude {trio_wd}{name}_dup_noalle.txt --make-bed -out {trio_wd}{name}_dup_fixal")
system(call)
}
file_list <- list.files(trio_wd) %>% str_extract('.+_dup_fixal.bed$') %>%
str_remove('.bed') %>%
{.[!is.na(.)]} %>% str_c(trio_wd,.)
write_lines(file_list,str_glue('{trio_wd}merge.txt'))
# try to merge to single file
system(str_glue("plink --merge-list {trio_wd}merge.txt --make-bed --out {trio_wd}K"))
for(x in file_list){
system(str_glue("plink --bfile {x} --exclude {trio_wd}K-merge.missnp --make-bed --out {str_replace(x, 'fixal', 'subset')}"))
}
file_list <- list.files(trio_wd) %>% str_extract('.+_dup_subset.bed') %>%
str_remove('.bed') %>%
{.[!is.na(.)]} %>% str_c(trio_wd,.)
write_lines(file_list,str_glue('{trio_wd}merge.txt'))
# merge to single file
system(str_glue("plink --merge-list {trio_wd}merge.txt --make-bed --out {trio_wd}K"))
If do some sanity check afterwards to verify that the liftover process went well I tried the fixref plugin from bcftools to get an overview of possible ref mismatches:
#use some filtering to improve genotype rates
call <- str_glue('plink --bfile {trio_wd}K --snps-only just-acgt -geno 0.05 --recode vcf bgz --out {trio_wd}K ')
system(call)
#index the compressed vcf file
call = str_glue('bcftools index -f {trio_wd}K.vcf.gz')
system(call)
#call fixref
call = str_glue("bcftools +fixref {trio_wd}K.vcf.gz -- -f reference/GRCh38_full_analysis_set_plus_decoy_hla_modified.fa")
system(call)
The results can be seen in the following screenshot:
This clearly looks to me that something went wrong but I can't figure out what the possible problem here could be. Do I use an incorrect hg38 reference file? Did I use some wrong filter? Is the liftOver procedure incorrect?
make sure you are not 1 off Cheat Sheet For One-Based Vs Zero-Based Coordinate Systems
So when I recode from plink format to vcf I need to adjust my positions using e.g this since bed is 0-based and vcf is 1-based?
i think plink knows what vcf is, and ($2-1) looks ok, but maybe something downstream
plink 1.9 merge will sometimes swap ref/alt alleles when they were previously set correctly, so it is necessary to postprocess its output in some manner.