Ok I have finally managed to write this code in R where you can select the genes from biomart
or sample it randomly to extract the SNPs from VCF files separated per chromosome. It is not dandy, but does the job and can be customized as needed.
library(biomaRt)
listMarts()
ensembl=useMart("ensembl")
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice" ,dataset="hsapiens_gene_ensembl")
ensembl<- useDataset("hsapiens_gene_ensembl",mart=ensembl)
listFilters(ensembl)
listDatasets(ensembl)
all.genes <- getBM(attributes=c('ensembl_gene_id','gene_biotype','hgnc_symbol','chromosome_name','start_position','end_position'), mart = ensembl)
colnames(all.genes)
all.genes <- all.genes[(!(all.genes[,"hgnc_symbol"])=="" &all.genes[,"gene_biotype"]=="protein_coding" & grepl("(^[0-9]+$)|^(X|Y)$",all.genes[,"chromosome_name"]) & !duplicated(all.genes[,"hgnc_symbol"])),]
library(stringr)
library(VariantAnnotation)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
path.file<-"/mydir/"
all.files <- list.files("/mydir")
all.files <- all.files[grepl("recalibrated.vcf$",all.files)]
setwd(path.file)
for (i in all.files) {
file_gz <- paste0(i, ".gz")
file_gz_tbi <- paste0(i, ".gz.tbi")
if(!exists(file_gz))
bgzip(i, paste0(i, ".gz"))
if(!exists(file_gz_tbi))
indexTabix(file_gz, format = "vcf")
}
save.genes<-all.genes
all.files <-list.files("/mydir")
all.gz.files<-all.files[grepl(".gz$",all.files)]
save.all.genes<-save.genes
for(i in 1:length(all.gz.files)){
print(paste0("Doing chromosome:",i))
all.genes <- save.all.genes[save.all.genes[,"chromosome_name"]== sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i]),]
if(!is.na(all.genes[,"chromosome_name"][1]) & all.genes[,"chromosome_name"][1]==sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i])){
file.gz<-all.gz.files[i]
stopifnot(file.exists(file.gz))
file.gz.tbi <- paste(file.gz, ".tbi", sep="")
if(!(file.exists(file.gz.tbi)))
indexTabix(file.gz, format="vcf")
chr.gr <- GRanges(paste0("chr",all.genes$chromosome_name), IRanges( all.genes$start_position, all.genes$end_position))
params <- ScanVcfParam(which=chr.gr)
vcf <- readVcf(TabixFile(file.gz), "hg19", params)
writeVcf(vcf, paste0("chr.",sub(".*?chr(.*?)\\.recalibrated.*", "\\1", all.gz.files[i]),".test-sub.vcf"))
}else{
next;
}
}
Use vcftools. Check here: http://vcftools.sourceforge.net/man_latest.html
There is an option --bed where you you take columns 3,4,5 above in a simple tab delimited file without any quotes and the header name chrom, chromStart and chromEnd and it will pull out only variants within those bed file regions. So your script will be:
./vcftools --vcf vcf_file.vcf --bed bed_file_name.bed --recode --out new_vcf_file.vcf
Hope this helps.