You can speed up mpileup by splitting your genome into regions of interest and processing them in parallel.
See my demo project: https://github.com/lindenb/ngsxml
In this project, I declare the regions of interest in an XML file:
<?xml version="1.0" encoding="UTF-8"?>
<segments>
<segment chrom="chr4_gl000194_random" start="0" end="191469"/>
<segment chrom="chr1_gl000192_random" start="0" end="547496"/>
</segments>
Here two segments are declared; each one gets its own Makefile rule, so the two regions can be called in parallel:
# Call variants for region chr1_gl000192_random:0-547496 of project Proj1.
# The first prerequisite is the BAM list file handed to "mpileup -b".
# The compressed VCF is written to a temporary file and renamed only when
# bcftools succeeds, so a failed run never leaves a truncated file under the
# target name (which make would otherwise consider up to date).
# NOTE: the exit status of the pipeline is the one of bcftools; a failure of
# samtools alone is not detected unless the shell is run with pipefail.
$(call vcf_segment,Proj1,samtools,chr1_gl000192_random,0,547496) : $(call project_dir,Proj1)/BAM/${tmp.prefix}Proj1.bam.list \
		$(addsuffix .fai,${REFERENCE}) ${samtools.exe} ${bcftools.exe}
	mkdir -p $(dir $@) && \
	${samtools.exe} mpileup -uf ${REFERENCE} -b $< -r chr1_gl000192_random:0-547496 | \
	${bcftools.exe} call --variants-only --multiallelic-caller --output-type z --output $(addsuffix .tmp,$@) && \
	mv $(addsuffix .tmp,$@) $@
# Call variants for region chr4_gl000194_random:0-191469 of project Proj1.
# The first prerequisite is the BAM list file handed to "mpileup -b".
# The compressed VCF is written to a temporary file and renamed only when
# bcftools succeeds, so a failed run never leaves a truncated file under the
# target name (which make would otherwise consider up to date).
# NOTE: the exit status of the pipeline is the one of bcftools; a failure of
# samtools alone is not detected unless the shell is run with pipefail.
$(call vcf_segment,Proj1,samtools,chr4_gl000194_random,0,191469) : $(call project_dir,Proj1)/BAM/${tmp.prefix}Proj1.bam.list \
		$(addsuffix .fai,${REFERENCE}) ${samtools.exe} ${bcftools.exe}
	mkdir -p $(dir $@) && \
	${samtools.exe} mpileup -uf ${REFERENCE} -b $< -r chr4_gl000194_random:0-191469 | \
	${bcftools.exe} call --variants-only --multiallelic-caller --output-type z --output $(addsuffix .tmp,$@) && \
	mv $(addsuffix .tmp,$@) $@
Then we create a file listing the per-segment VCFs:
# Build the list file naming every per-segment VCF, one path per line.
# The rule depends on both segment VCFs, so they are produced first.
# The list is assembled in a temporary file (removed beforehand so that the
# appends start from an empty file) and renamed to the target at the end.
# NOTE(review): the doubled dollar signs and positional arguments suggest this
# rule lives inside a define that is instantiated via call/eval -- confirm.
$$(call vcf_list,$(1),$(2)) : \
		$(call vcf_segment,$(1),$(2),chr4_gl000194_random,0,191469) \
		$(call vcf_segment,$(1),$(2),chr1_gl000192_random,0,547496)
	mkdir -p $$(dir $$@)
	rm -f $$(addsuffix .tmp,$$@)
	echo "$(call vcf_segment,$(1),$(2),chr4_gl000194_random,0,191469)" >> $$(addsuffix .tmp,$$@)
	echo "$(call vcf_segment,$(1),$(2),chr1_gl000192_random,0,547496)" >> $$(addsuffix .tmp,$$@)
	mv $$(addsuffix .tmp,$$@) $$@
Finally, we merge the per-segment VCFs into a single file:
# Merge the per-segment VCFs into the final VCF.
# The first prerequisite (the VCF list file) is given to Picard GatherVcfs as
# its input; the jar is picked out of the prerequisites by its .jar suffix.
# The merged VCF is written under a temporary name, compressed with bgzip and
# indexed with tabix, and only then are the compressed file and its index
# renamed to the target and its .tbi companion.
# NOTE(review): the doubled dollar signs and positional arguments suggest this
# rule lives inside a define that is instantiated via call/eval -- confirm.
$$(call vcf_final,$(1),$(2)) : $$(call vcf_list,$(1),$(2)) ${picard.jar}
	mkdir -p $$(dir $$@) && \
	${java.exe} -jar $$(filter %.jar,$$^) GatherVcfs I=$$< O=$$(addsuffix .tmp.vcf,$$@) && \
	${bgzip.exe} -f $$(addsuffix .tmp.vcf,$$@) && \
	${tabix.exe} -f -p vcf $$(addsuffix .tmp.vcf.gz,$$@) && \
	mv $$(addsuffix .tmp.vcf.gz,$$@) $$@ && \
	mv $$(addsuffix .tmp.vcf.gz.tbi,$$@) $$(addsuffix .tbi,$$@)
Maybe the bcftools/VCF component is actually a bit superfluous, so please ignore it if that helps simplify the script.