Question

Extract specific genes from VCF file?

1

Entering edit mode

8.7 years ago

MAPK ★ 2.1k

I have a list of 3000 genes and their start and end locations as shown below. I need to extract the SNPs from VCF file within these locations. What would be the easiest way to do this (or preferably in R)?

   gene_id symbol chromosome start_location end_location
1  "1"     "A1BG" "19"       " 58858171"   " 58864865"
2  "10"    "NAT2" "8"        "  18248754"   "  18258723"
3  "100"   "ADA"  "20"       " 43248162"   "43280376"
4  "1000"  "CDH2" "18"       " 25530926"   " 25616549"

extract VCF VCF snp • 12k views

ADD COMMENT • link 8.7 years ago by MAPK ★ 2.1k

0

Entering edit mode

Use vcftools. Check here: http://vcftools.sourceforge.net/man_latest.html

There is an option --bed where you you take columns 3,4,5 above in a simple tab delimited file without any quotes and the header name chrom, chromStart and chromEnd and it will pull out only variants within those bed file regions. So your script will be:

./vcftools --vcf vcf_file.vcf --bed bed_file_name.bed --recode --out new_vcf_file.vcf

Hope this helps.

ADD REPLY • link 8.6 years ago by das2000sidd ▴ 30

3

Entering edit mode

8.7 years ago

michael.ante ★ 3.9k

Hi Mapk,

I would use awk or perl to create a bed file out of your table. After sorting and merging the bed file, you can use intersectBed with your VCF file.

Cheers,

Michael

ADD COMMENT • link 8.7 years ago by michael.ante ★ 3.9k

1

Entering edit mode

without being entirely clear on what output format the OP is trying to get, I would go even simpler and try a simple grep with a regular expression, something like

grep -E "SOX9|FOXA1|AR" variant_file.txt > output.txt

R has a grep function with roughly the same functionality.

ADD REPLY • link 8.7 years ago by steve ★ 3.5k

0

Entering edit mode

I am also interested to extract information for a list of genes from vcf file and tried the above grep command but it did not work and got empty file.

VCF file has been created from bam (exomes) (via samtools mpileup -> bcftools -> vcfutils.pl)

Any other ways please.

Thanks.

ADD REPLY • link 7.4 years ago by bioinfo8 ▴ 230

1

Entering edit mode

For that command to work, the gene ID's must be contained in the file; I do not think this is typically the case for a VCF file. You will want to make sure that your VCF is annotated first. Might help to open the VCF in Excel to check on its contents.

ADD REPLY • link 7.4 years ago by steve ★ 3.5k

score 6 · Accepted Answer · 2016-04-21

Ok I have finally managed to write this code in R where you can select the genes from biomart or sample it randomly to extract the SNPs from VCF files separated per chromosome. It is not dandy, but does the job and can be customized as needed.

#Extracting some genes and positions from Biomart
#Using biomart
library(biomaRt)
listMarts()
ensembl=useMart("ensembl")

#To use hg19/GrCh37
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice" ,dataset="hsapiens_gene_ensembl")
# grch37 = useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice")
# listMarts(grch37)

ensembl<- useDataset("hsapiens_gene_ensembl",mart=ensembl)   # from ensembl using homosapien gene data

listFilters(ensembl)

listDatasets(ensembl) 
# #genes.with.id <- getBM(attributes=c("ensembl_gene_id", "external_gene_id"),values=gene_names, mart= ensembl)
# human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# cc <- getBM(attributes = c("hgnc_symbol","chromosome_name", "start_position","end_position"),
#            filters = "hgnc_symbol", values = *,mart = human)


all.genes <- getBM(attributes=c('ensembl_gene_id','gene_biotype','hgnc_symbol','chromosome_name','start_position','end_position'),   mart = ensembl)
colnames(all.genes)
# "ensembl_gene_id" "gene_biotype"    "hgnc_symbol"     "chromosome_name" "start_position"  "end_position"  

##getting only protein coding genes
all.genes <- all.genes[(!(all.genes[,"hgnc_symbol"])=="" &all.genes[,"gene_biotype"]=="protein_coding" & grepl("(^[0-9]+$)|^(X|Y)$",all.genes[,"chromosome_name"]) & !duplicated(all.genes[,"hgnc_symbol"])),]

#####################################
library(stringr)
library(VariantAnnotation)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

path.file<-"/mydir/"
all.files <- list.files("/mydir")
all.files <- all.files[grepl("recalibrated.vcf$",all.files)]

##for single file; for loop go below
# bgzip("/mydir/myvcf.chr1.recalibrated.vcf", overwrite=FALSE)
# indexTabix("/mydir/myvcf.chr1.recalibrated.vcf.bgz", format="vcf")

##creating .bgz and .bgz.tbi files
# all.files <- list.files("/mydir", pattern = "*.vcf$",  full.names = TRUE)
#this bit changes the vcf to .gz format
setwd(path.file)
  for (i in all.files) {
    file_gz <- paste0(i, ".gz")
    file_gz_tbi <- paste0(i, ".gz.tbi")

    if(!exists(file_gz))
      bgzip(i, paste0(i, ".gz"))

    if(!exists(file_gz_tbi))
      indexTabix(file_gz, format = "vcf")
  }

save.genes<-all.genes
#all.genes  #this is the table of all genes in bed format

all.files <-list.files("/mydir")
# all.files <- list.files("/mydir", pattern = "*.gz$",
#                         full.names = TRUE)
all.gz.files<-all.files[grepl(".gz$",all.files)]

save.all.genes<-save.genes

##Extracting the genes from VCF


# i<-2
    for(i in 1:length(all.gz.files)){ 
    print(paste0("Doing chromosome:",i))  
#if (save.all.genes[,"chromosome_name"]== sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.bgz.files[i])){

#This bit here selects only the chromosome number: sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i])
all.genes <- save.all.genes[save.all.genes[,"chromosome_name"]== sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i]),]
if(!is.na(all.genes[,"chromosome_name"][1]) & all.genes[,"chromosome_name"][1]==sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i])){
##file.gz <- "/mydir/myvcf.chr1.recalibrated.vcf.bgz"
file.gz<-all.gz.files[i]
stopifnot(file.exists(file.gz))
file.gz.tbi <- paste(file.gz, ".tbi", sep="")
if(!(file.exists(file.gz.tbi)))
  indexTabix(file.gz, format="vcf")

# start.loc <- 45959538
# end.loc <- 203047868

#chr1.gr <- GRanges("chr1", IRanges(start.loc, end.loc))
chr.gr <- GRanges(paste0("chr",all.genes$chromosome_name), IRanges( all.genes$start_position, all.genes$end_position))

params <- ScanVcfParam(which=chr.gr)
vcf <- readVcf(TabixFile(file.gz), "hg19", params)
#locateVariants(vcf[6,], txdb, AllVariants())
##Writing the vcf subset
writeVcf(vcf, paste0("chr.",sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i]),".test-sub.vcf"))
   }else{
  next;
}
}