Ok I have finally managed to write this code in R where you can select the genes from biomart
or sample it randomly to extract the SNPs from VCF files separated per chromosome. It is not dandy, but does the job and can be customized as needed.
#Extracting some genes and positions from Biomart
#Using biomart
library(biomaRt)
listMarts()
ensembl=useMart("ensembl")
#To use hg19/GrCh37
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice" ,dataset="hsapiens_gene_ensembl")
# grch37 = useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice")
# listMarts(grch37)
ensembl<- useDataset("hsapiens_gene_ensembl",mart=ensembl) # from ensembl using homosapien gene data
listFilters(ensembl)
listDatasets(ensembl)
# #genes.with.id <- getBM(attributes=c("ensembl_gene_id", "external_gene_id"),values=gene_names, mart= ensembl)
# human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# cc <- getBM(attributes = c("hgnc_symbol","chromosome_name", "start_position","end_position"),
# filters = "hgnc_symbol", values = *,mart = human)
all.genes <- getBM(attributes=c('ensembl_gene_id','gene_biotype','hgnc_symbol','chromosome_name','start_position','end_position'), mart = ensembl)
colnames(all.genes)
# "ensembl_gene_id" "gene_biotype" "hgnc_symbol" "chromosome_name" "start_position" "end_position"
##getting only protein coding genes
all.genes <- all.genes[(!(all.genes[,"hgnc_symbol"])=="" &all.genes[,"gene_biotype"]=="protein_coding" & grepl("(^[0-9]+$)|^(X|Y)$",all.genes[,"chromosome_name"]) & !duplicated(all.genes[,"hgnc_symbol"])),]
#####################################
library(stringr)
library(VariantAnnotation)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
path.file<-"/mydir/"
all.files <- list.files("/mydir")
all.files <- all.files[grepl("recalibrated.vcf$",all.files)]
##for single file; for loop go below
# bgzip("/mydir/myvcf.chr1.recalibrated.vcf", overwrite=FALSE)
# indexTabix("/mydir/myvcf.chr1.recalibrated.vcf.bgz", format="vcf")
##creating .bgz and .bgz.tbi files
# all.files <- list.files("/mydir", pattern = "*.vcf$", full.names = TRUE)
#this bit changes the vcf to .gz format
setwd(path.file)
for (i in all.files) {
file_gz <- paste0(i, ".gz")
file_gz_tbi <- paste0(i, ".gz.tbi")
if(!exists(file_gz))
bgzip(i, paste0(i, ".gz"))
if(!exists(file_gz_tbi))
indexTabix(file_gz, format = "vcf")
}
save.genes<-all.genes
#all.genes #this is the table of all genes in bed format
all.files <-list.files("/mydir")
# all.files <- list.files("/mydir", pattern = "*.gz$",
# full.names = TRUE)
all.gz.files<-all.files[grepl(".gz$",all.files)]
save.all.genes<-save.genes
##Extracting the genes from VCF
# i<-2
for(i in 1:length(all.gz.files)){
print(paste0("Doing chromosome:",i))
#if (save.all.genes[,"chromosome_name"]== sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.bgz.files[i])){
#This bit here selects only the chromosome number: sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i])
all.genes <- save.all.genes[save.all.genes[,"chromosome_name"]== sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i]),]
if(!is.na(all.genes[,"chromosome_name"][1]) & all.genes[,"chromosome_name"][1]==sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i])){
##file.gz <- "/mydir/myvcf.chr1.recalibrated.vcf.bgz"
file.gz<-all.gz.files[i]
stopifnot(file.exists(file.gz))
file.gz.tbi <- paste(file.gz, ".tbi", sep="")
if(!(file.exists(file.gz.tbi)))
indexTabix(file.gz, format="vcf")
# start.loc <- 45959538
# end.loc <- 203047868
#chr1.gr <- GRanges("chr1", IRanges(start.loc, end.loc))
chr.gr <- GRanges(paste0("chr",all.genes$chromosome_name), IRanges( all.genes$start_position, all.genes$end_position))
params <- ScanVcfParam(which=chr.gr)
vcf <- readVcf(TabixFile(file.gz), "hg19", params)
#locateVariants(vcf[6,], txdb, AllVariants())
##Writing the vcf subset
writeVcf(vcf, paste0("chr.",sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i]),".test-sub.vcf"))
}else{
next;
}
}
Use vcftools. Check here: http://vcftools.sourceforge.net/man_latest.html
There is an option --bed where you you take columns 3,4,5 above in a simple tab delimited file without any quotes and the header name chrom, chromStart and chromEnd and it will pull out only variants within those bed file regions. So your script will be:
./vcftools --vcf vcf_file.vcf --bed bed_file_name.bed --recode --out new_vcf_file.vcf
Hope this helps.