|
#!/bin/bash |
|
OPTIND=1 |
|
|
|
##################################### |
|
## Defaults/Constants |
|
##################################### |
|
|
|
#file and directory constants |
|
tmp_fq="/tmp-fq" |
|
fin_fq="/final-fq" |
|
alndat="/assembly" |
|
vardat="/variants" |
|
qcdat="/qc" |
|
ssheet="SampleSheet.csv" |
|
sdata="sample-list.tsv" |
|
proclog="proc-log.txt" |
|
hstrim="~/bin/trim_truseq_adaptors.pl" |
|
mstrim="~/bin/trim_truseq_adaptors_miseq.pl" |
|
hssrt="~/bin/sort_fastq.pl" |
|
mssrt="~/bin/sort_fastq_miseq.pl" |
|
|
|
#access values in sdata |
|
col_sname=1 |
|
col_sfile=2 |
|
col_fname=3 |
|
col_lib=4 |
|
col_ref=5 |
|
|
|
#default arguments |
|
def_keeprg=1 #keep read group by default |
|
def_minq=20 #minimum base quality |
|
def_mint=50 #minimum read length |
|
def_der="f" #forward strand by default |
|
def_ms=1 #default to using MiSeq specific tools |
|
def_arch=0 #default to leaving intermediate files uncompressed |
|
|
|
#set default values |
|
keeprg=$def_keeprg |
|
minq=$def_minq |
|
mint=$def_mint |
|
rder=$def_der |
|
miseq=$def_ms |
|
intarch=$def_arch |
|
|
|
###################################### |
|
## functions |
|
###################################### |
|
#run specified adapter trimming script |
|
run_trimmer(){ |
|
infile=$1 |
|
outdir=$2 |
|
sname=$3 |
|
fname=$4 |
|
d=$5 |
|
l=$6 |
|
trimmer=$7 |
|
|
|
$trimmer -i $infile -o "$outdir/$sname-$fname-t-raw.fastq" -d $d -l $l |
|
|
|
} |
|
export -f run_trimmer |
|
|
|
#filter reads by quality core |
|
#requires files from run_trimmer |
|
q_filter(){ |
|
infile=$1 |
|
outdir=$2 |
|
minq=$3 |
|
numq=$4 |
|
|
|
#make name for output file |
|
fpre=$(echo $infile | grep -oP "(?<=/)([^/]+)(?=-t-raw.fastq)") |
|
outfi="$outdir/$fpre-qf-t.fastq" |
|
|
|
#run quality filtering |
|
fastq_quality_filter -v -q $minq -p $numq -i $infile -o $outfi |
|
} |
|
export -f q_filter |
|
|
|
#trim reads of bases below minq |
|
#requires files from q_filter |
|
q_trim(){ |
|
infile=$1 |
|
outdir=$2 |
|
minq=$3 |
|
minl=$4 |
|
|
|
#make name for output file |
|
fpre=$(echo $infile | grep -oP "(?<=/)([^/]+)(?=-qf-t.fastq)") |
|
outfi="$outdir/$fpre-final.fastq" |
|
|
|
#perform trimming |
|
fastq_quality_trimmer -v -t $minq -l $minl -i $infile -o $outfi |
|
} |
|
export -f q_trim |
|
|
|
#generate QC stats for fastq data |
|
qc_stats(){ |
|
infile=$1 |
|
outdir=$2 |
|
|
|
#filenames |
|
fpre=$(echo $infile | grep -oP "(?<=/)([^/]+)(?=.fastq)") |
|
statfi="$outdir/$fpre-fq-stats.txt" |
|
ncdist="$outdir/$fpre-nuc-dist.png" |
|
qbox="$outdir/$fpre-qbox.png" |
|
|
|
#generate quality stats |
|
fastx_quality_stats -i $infile -o $statfi |
|
|
|
#generate figures |
|
fastx_nucleotide_distribution_graph.sh -i $statfi -o $ncdist |
|
fastq_quality_boxplot_graph.sh -i $statfi -o $qbox |
|
} |
|
export -f qc_stats |
|
|
|
#sort read files by mate pairs |
|
#requires files from q_trim |
|
sort_pairs(){ |
|
indir=$1 |
|
outdir=$2 |
|
sname=$3 |
|
sorter=$4 |
|
|
|
#generate filenames |
|
f_file="$indir/$sname*_R1_*-final.fastq" |
|
r_file="$indir/$sname*_R2_*-final.fastq" |
|
outfi="$outdir/$sname-paired.fastq" |
|
#pick sorter based on miseq |
|
$sorter -f $f_file -r $r_file -o $outfi |
|
} |
|
export -f sort_pairs |
|
|
|
#generate assembly with BWA |
|
#requires files from sort_pairs |
|
do_bwa(){ |
|
indir=$1 |
|
outdir=$2 |
|
sname=$3 |
|
refgen=$4 |
|
lib=$5 |
|
spec=$6 |
|
keeprg=$7 |
|
|
|
#generate filenames |
|
f_file="$indir/$sname-paired_for.fastq" |
|
r_file="$indir/$sname-paired_rev.fastq" |
|
outfi="$outdir/$sname.sam" |
|
|
|
#generate read group |
|
rg="@RG\tID:$sname\tSM:$spec-$lib\tLB:$lib\tPL:ILLUMINA\tPU:NA\t" |
|
|
|
#perform assembly |
|
if [ $keeprg -eq 1 ] |
|
then |
|
#add RG data to sam header |
|
bwa mem -t 4 -R $rg $refgen $f_file $r_file > $outfi |
|
else |
|
#omit RG data |
|
bwa mem -t 4 $refgen $f_file $r_file > $outfi |
|
fi |
|
} |
|
export -f do_bwa |
|
|
|
#post-assembly processing |
|
post_asm(){ |
|
indir=$1 |
|
outdir=$2 |
|
sname=$3 |
|
refgen=$4 |
|
|
|
#output files |
|
raw_bam="$indir/unsrt-$sname.bam" |
|
srt_bam="$outdir/srtd-$sname" |
|
srt_sam="$outdir/srtd-$sname.sam" |
|
|
|
#raw SAM to unsorted BAM |
|
samtools view -buT $refgen "$indir/$sname.sam" > $raw_bam |
|
|
|
#unsorted BAM to sorted BAM |
|
samtools sort $raw_bam $srt_bam |
|
|
|
#sorted BAM to sorted SAM |
|
#don't need a SAM file for following steps |
|
#samtools view "$srt_bam.bam" > $srt_sam |
|
|
|
#index the sorted BAM |
|
samtools index "$srt_bam.bam" |
|
} |
|
export -f post_asm |
|
|
|
#generate raw VCF |
|
initial_vars(){ |
|
indir=$1 |
|
outdir=$2 |
|
sname=$3 |
|
refgen=$4 |
|
|
|
#generate outputfile |
|
outfi="$outdir/stmp-$sname.vcf" |
|
|
|
#VCF |
|
samtools mpileup -vuf $refgen "$indir/srtd-$sname.bam" > $outfi |
|
} |
|
export -f initial_vars |
|
|
|
#picard duplicate marking |
|
picard_mdup(){ |
|
indir=$1 |
|
dupdir=$2 |
|
statdir=$3 |
|
sname=$4 |
|
|
|
#generate output file names |
|
dupfi="$dupdir/$sname-dedup.bam" |
|
statfi="$statdir/$sname-dedup.txt" |
|
|
|
#bash seems to not like the alias |
|
#picard MarkDuplicates \ |
|
java -XX:ParallelGCThreads=16 \ |
|
-Xmx16G \ |
|
-jar /opt/picard-tools/picard.jar MarkDuplicates \ |
|
INPUT="$indir/srtd-$sname.bam" \ |
|
OUTPUT=$dupfi \ |
|
METRICS_FILE=$statfi |
|
|
|
#index this bam |
|
samtools index $dupfi |
|
} |
|
export -f picard_mdup |
|
|
|
#GATK |
|
gatk_genos(){ |
|
bamdir=$1 |
|
vardir=$2 |
|
qcdir=$3 |
|
sname=$4 |
|
refgen=$5 |
|
|
|
#create targets |
|
echo "gatk-$sname: creating targets" |
|
java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
-K /opt/gatk/gatk.key \ |
|
-et STDOUT \ |
|
-T RealignerTargetCreator \ |
|
-R $refgen \ |
|
-I "$bamdir/$sname-dedup.bam" \ |
|
-o "$bamdir/$sname.intervals" |
|
|
|
#realign indels |
|
echo "gatk-$sname: realigning indels" |
|
java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
-K /opt/gatk/gatk.key \ |
|
-et STDOUT \ |
|
-T IndelRealigner \ |
|
-R $refgen \ |
|
-I "$bamdir/$sname-dedup.bam" \ |
|
-targetIntervals "$bamdir/$sname.intervals" \ |
|
-o "$bamdir/$sname-realn.bam" |
|
|
|
#base recal |
|
echo "gatk-$sname: recalibrating bases - tabel gen" |
|
#first generate teh recalibrated scores |
|
java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
-K /opt/gatk/gatk.key \ |
|
-et STDOUT \ |
|
-T BaseRecalibrator \ |
|
-R $refgen \ |
|
-I "$bamdir/$sname-realn.bam" \ |
|
-knownSites "$vardir/stmp-$sname.vcf" \ |
|
-o "$bamdir/$sname-recal.table" |
|
|
|
#follow up with base recal for qc |
|
java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
-K /opt/gatk/gatk.key \ |
|
-et STDOUT \ |
|
-T BaseRecalibrator \ |
|
-R $refgen \ |
|
-I "$bamdir/$sname-realn.bam" \ |
|
-knownSites "$vardir/stmp-$sname.vcf" \ |
|
-BQSR "$bamdir/$sname-recal.table" \ |
|
-o "$bamdir/$sname-recal-post.table" |
|
|
|
#uses R packages ggplot2 and gsalib, need these installed |
|
#generate QC plot for recal |
|
# java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
# -K /opt/gatk/gatk.key \ |
|
# -et STDOUT \ |
|
# -T AnalyzeCovariates \ |
|
# -R $refgen \ |
|
# -before "$bamdir/$sname-recal.table" \ |
|
# -after "$bamdir/$sname-recal-post.table" \ |
|
# -plots "$qcdir/$sname-gatk-baserecal.pdf" |
|
|
|
#apply recal scores |
|
echo "gatk-$sname: recalibrating bases - printing reads" |
|
java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
-K /opt/gatk/gatk.key \ |
|
-et STDOUT \ |
|
-T PrintReads \ |
|
-R $refgen \ |
|
-I "$bamdir/$sname-realn.bam" \ |
|
-o "$bamdir/$sname-recal.bam" \ |
|
-BQSR "$bamdir/$sname-recal.table" |
|
|
|
#perform genotyping |
|
echo "gatk=$sname: performing genotyping" |
|
java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
-K /opt/gatk/gatk.key \ |
|
-et STDOUT \ |
|
-T UnifiedGenotyper \ |
|
-R $refgen \ |
|
-I "$bamdir/$sname-recal.bam" \ |
|
-o "$vardir/$sname-gatk.vcf" |
|
|
|
#convert VCFs to table |
|
java -Xmx16G -jar /opt/gatk/GenomeAnalysisTK.jar \ |
|
-K /opt/gatk/gatk.key \ |
|
-et STDOUT \ |
|
-T VariantsToTable \ |
|
-R $refgen \ |
|
-F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F AF -F AC -F AN -F MQ \ |
|
-F DP \ |
|
-V "$vardir/$sname-gatk.vcf" \ |
|
-o "$vardir/$sname-gatk.table" |
|
|
|
} |
|
export -f gatk_genos |
|
|
|
#function for writing messages to stdout and to logfile |
|
log_msg(){ |
|
message=$1 |
|
logfi=$2 |
|
|
|
if [ -f $logfi ] |
|
then |
|
echo $message >> logfi |
|
else |
|
touch $logfi |
|
#if touch didn't work, exit |
|
if [ $? -gt 0 ] |
|
then |
|
echo "Unable to touch logfile, exiting" && exit 1 |
|
fi |
|
#otherwise, write message |
|
echo $message >> logfi |
|
fi |
|
|
|
#putmesage to stdout |
|
echo $message |
|
} |
|
export -f log_msg |
|
|
|
#collect stats |
|
sam_stats(){ |
|
bamdir=$1 |
|
qcdir=$2 |
|
sname=$3 |
|
refgen=$4 |
|
|
|
#generate stats |
|
samtools stats -r $refgen "$bamdir/$sname-recal.bam" > "$qcdir/$sname.st" |
|
samtools flagstat "$bamdir/$sname-recal.bam" > "$qcdir/$sname.fst" |
|
|
|
#generate plots |
|
plot-bamstats -p "$qcdir/$sname" "$qcdir/$sname.st" |
|
} |
|
export -f sam_stats |
|
|
|
#usage |
|
usage(){ |
|
echo "\n##############################################################" |
|
echo "Usage:" |
|
echo " -s: Directory containing raw FASTQ data for desired samples" |
|
echo " -p: Project Name" |
|
echo " -o: Directory for storing results from assembly" |
|
echo " -q: Minimum quality score" |
|
echo " -t: Minimum read length for trimming/quality filtering" |
|
echo " -m: Set to 0 for HiSeq data" |
|
echo " -r: Keep read group data" |
|
echo " -g: Reference genome (fasta format)" |
|
echo " -a: Archive intermediate results, does not comress QC stats" |
|
echo "###############################################################\n" |
|
|
|
#to do: write help/detailed usage |
|
|
|
exit $1 |
|
} |
|
export -f usage |
|
|
|
######################################## |
|
### parse args |
|
######################################## |
|
while getopts ":s:p:o:q:t:m:r:g:a:h" opt; do |
|
case $opt in |
|
#sample data directory |
|
s) |
|
if [ -d "$OPTARG" ] |
|
then |
|
sample_dir="$OPTARG" |
|
else |
|
echo "Error, unable to find sample directory" >&2 |
|
usage 1 |
|
fi |
|
;; |
|
#output prefix |
|
p) |
|
if [ -n "$OPTARG" ] |
|
then |
|
out_pre="$OPTARG" |
|
else |
|
echo "Error, must supply project name" >&2 |
|
usage 1 |
|
fi |
|
;; |
|
#output directory |
|
o) |
|
if [ -n "$OPTARG" ] |
|
then |
|
out_dir="$OPTARG" |
|
else |
|
echo "Error, must supply an output directory" >&2 |
|
usage 1 |
|
fi |
|
;; |
|
#quality score |
|
q) |
|
if [ $OPTARG -gt 0 ] |
|
then |
|
minq="$OPTARG" |
|
else |
|
echo "Error, quality threshold must be greater than 0" >&2 |
|
usage 1 |
|
fi |
|
;; |
|
#filtering threshold |
|
t) |
|
if [ $OPTARG -gt 0 ] |
|
then |
|
mint="$OPTARG" |
|
else |
|
echo "Error, filtering threshold must be greater than 0" >&2 |
|
usage 1 |
|
fi |
|
;; |
|
#miseq or hiseq |
|
m) |
|
if [ $OPTARG -eq 0 ] |
|
then |
|
miseq=0 |
|
else |
|
miseq=1 |
|
fi |
|
;; |
|
#assign read groups prior to assembly |
|
r) |
|
if [ $OPTARG -eq 0 ] |
|
then |
|
keeprg=0 |
|
else |
|
keeprg=1 |
|
fi |
|
;; |
|
#reference genome |
|
g) |
|
if [ -f "$OPTARG" ] |
|
then |
|
refgen="$OPTARG" |
|
else |
|
echo "Error: unable to access reference genome" >&2 |
|
usage 1 |
|
fi |
|
;; |
|
#archive intermediate files |
|
a) |
|
intarch=1 |
|
;; |
|
#print help/usage and exit |
|
h) |
|
usage(0) |
|
#default case is invalid argument |
|
\?) |
|
echo "Error, invalid argument: $OPTARG" >&2 |
|
usage 1 |
|
;; |
|
esac |
|
done |
|
|
|
################################### |
|
### setup |
|
################################### |
|
|
|
#create project directories |
|
tmp_fq="$out_dir/$tmp_fq" |
|
fin_fq="$out_dir/$fin_fq" |
|
alndat="$out_dir/$alndat" |
|
vardat="$out_dir/$vardat" |
|
qcdat="$out_dir/$qcdat" |
|
mkdir -p $out_dir |
|
mkdir -p $tmp_fq |
|
mkdir -p $fin_fq |
|
mkdir -p $alndat |
|
mkdir -p $vardat |
|
mkdir -p $qcdat |
|
|
|
#prepare sample-list file |
|
rm -f $out_dir/$sdata |
|
rm -f $out_dir/$sdata |
|
|
|
#DEBUG |
|
#rm $tmp_fq/* |
|
#rm $fin_fq/* |
|
#rm $alndat/* |
|
#rm $qcdat/* |
|
|
|
#assumes each sample has data in own dir within sample dir |
|
for i in `ls $sample_dir`; |
|
do |
|
[ -d "$sample_dir/$i" ] || continue |
|
|
|
#collect several values from sample's SampleSheet.csv |
|
sname=$(tail -n1 $sample_dir/$i/$ssheet | cut -d "," -f6) |
|
lib=$(tail -n1 $sample_dir/$i/$ssheet | cut -d "," -f3) |
|
spec=$(tail -n1 $sample_dir/$i/$ssheet | cut -d "," -f4) |
|
|
|
#create an entry for each file associated with a given sample |
|
#sample name, file name+path, file name, library and ref spec |
|
for f in `ls $sample_dir/$i/*.fastq`; |
|
do |
|
fname=$(echo $f | grep -oP "(?<=/)([^/]+)(?=.fastq)") |
|
printf "$sname\t$f\t$fname\t$lib\t$spec\n" >> $out_dir/$sdata |
|
done |
|
done |
|
|
|
#generate strings for commonly used sample data |
|
par_sname=$(cut -f $col_sname $out_dir/$sdata) |
|
par_fname=$(cut -f $col_fname $out_dir/$sdata) |
|
par_sfile=$(cut -f $col_sfile $out_dir/$sdata) |
|
par_lib=$(cut -f $col_lib $out_dir/$sdata) |
|
par_ref=$(cut -f $col_ref $out_dir/$sdata) |
|
|
|
############################################### |
|
#### processing |
|
############################################## |
|
#write some data to log file |
|
log_msg "###################################################" $out_dir/$proclog |
|
log_msg "Samples to be processed: $par_sname" $out_dir/$proclog |
|
log_msg "Project directory: $out_dir" $out_dir/$proclog |
|
log_msg "Minimum Q: $minq" $out_dir/$proclog |
|
log_msg "Minimum bases/read length: $mint" $out_dir/$proclog |
|
log_msg "###################################################" $out_dir/$proclog |
|
|
|
#trim adapters from raw data |
|
log_msg "Trimming sequencing adapters from raw fastq data" $out_dir/$proclog |
|
if [ $miseq -eq 1 ] |
|
then |
|
trimmer=$mstrim |
|
log_msg "Trimming for MiSeq adaptors" $out_dir/$proclog |
|
else |
|
trimmer=$hstrim |
|
log_msg "Trimming for HiSeq adaptors" $out_dir/$proclog |
|
fi |
|
#trim using specified trimmer |
|
parallel --no-notice --xapply run_trimmer {1} $tmp_fq {2} {3} $rder $mint \ |
|
$trimmer ::: $par_sfile ::: $par_sname ::: $par_fname >> $out_dir/$proclog |
|
log_msg "Trimming complete" $out_dir/$proclog |
|
|
|
#filter by quality score |
|
log_msg "Filtering by read quality score" $out_dir/$proclog |
|
parallel --no-notice q_filter {} $tmp_fq $minq $mint ::: \ |
|
`ls $tmp_fq/*-t-raw.fastq` >> $out_dir/$proclog |
|
log_msg "Quality filtering complete" $out_dir/$proclog |
|
|
|
#trim by quality score |
|
log_msg "Trim by quality score" $out_dir/$proclog |
|
parallel --no-notice q_trim {} $fin_fq $minq $mint ::: \ |
|
`ls $tmp_fq/*-qf-t.fastq` >> $out_dir/$proclog |
|
log_msg "Quality trimming complete" $out_dir/$proclog |
|
|
|
#generate QC data on raw and processed data |
|
log_msg "Generating FQ stats/plots" $out_dir/$proclog |
|
parallel --no-notice qc_stats {} $qcdat ::: $par_sfile |
|
parallel --no-notice qc_stats $fin_fq/{} $qcdat ::: \ |
|
`ls $fin_fq` >> $out_dir/$proclog |
|
log_msg "done with FQ stats/plots" $out_dir/$proclog |
|
|
|
#perform sorting |
|
log_msg "Sorting read pairs" $out_dir/$proclog |
|
if [ $miseq -eq 1 ] |
|
then |
|
sorter=$mssrt |
|
log_msg "Sorting pairs for MiSeq data" $out_dir/$proclog |
|
else |
|
sorter=$hssrt |
|
log_msg "Sorting pairs for HiSeq data" $out_dir/$proclog |
|
fi |
|
parallel --no-notice sort_pairs $fin_fq $fin_fq {} $sorter ::: \ |
|
$par_sname >> $out_dir/$proclog |
|
log_msg "sorting complete" $out_dir/$proclog |
|
|
|
#index reference genome |
|
log_msg "Indexing reference genome" $out_dir/$proclog |
|
samtools faidx $refgen 2>> $out_dir/$proclog |
|
bwa index $refgen 2>> $out_dir/$proclog |
|
|
|
#make sequecne dict for GATK stuff |
|
rname=$(echo $refgen | grep -oP "(.*)(?=\.)") |
|
java -XX:ParallelGCThreads=16 \ |
|
-Xmx16G \ |
|
-jar /opt/picard-tools/picard.jar CreateSequenceDictionary \ |
|
REFERENCE=$refgen \ |
|
OUTPUT="$rname.dict" >> $out_dir/$proclog |
|
|
|
log_msg "Indexing complete" $out_dir/$proclog |
|
|
|
#perform assembly |
|
log_msg "Generating assemblies" $out_dir/$proclog |
|
parallel --no-notice --xapply \ |
|
do_bwa $fin_fq $alndat {1} $refgen {2} {3} $keeprg ::: $par_sname ::: \ |
|
$par_lib ::: $par_ref 2>> $out_dir/$proclog |
|
log_msg "assembly complete" $out_dir/$proclog |
|
|
|
#sort sam and generat bams |
|
log_msg "Post processing assembly output" $out_dir/$proclog |
|
parallel --no-notice post_asm $alndat $alndat {} $refgen ::: \ |
|
$par_sname 2>> $out_dir/$proclog |
|
log_msg "Processing complete" $out_dir/$proclog |
|
|
|
#generate raw VCF for GATK pipeline |
|
log_msg "Generating raw VCFs" $out_dir/$proclog |
|
parallel --no-notice initial_vars $alndat $vardat {} $refgen ::: \ |
|
$par_sname >> $out_dir/$proclog |
|
log_msg "Finished generating raw VCFs" $out_dir/$proclog |
|
|
|
#remove duplicates |
|
log_msg "Removing duplicates from assemblies" $out_dir/$proclog |
|
parallel --no-notice picard_mdup $alndat $alndat $qcdat {} ::: \ |
|
$par_sname >> $out_dir/$proclog |
|
log_msg "Deduping complete" $out_dir/$proclog |
|
|
|
#genotype with GATK |
|
log_msg "Performing genotyping" $out_dir/$proclog |
|
parallel --no-notice gatk_genos $alndat $vardat $qcdat {} $refgen ::: \ |
|
$par_sname >> $out_dir/$proclog |
|
log_msg "genotyping complete" $out_dir/$proclog |
|
|
|
#generate stats |
|
log_msg "Generating assembly stats" $out_dir/$proclog |
|
parallel --no-notice sam_stats $alndat $qcdat {} $refgen ::: \ |
|
$par_sname >> $out_dir/$proclog |
|
log_msg "assembly stats generated" $out_dir/$proclog |
|
|
|
#create final results directory for each sample |
|
log_msg "Creating final results directories" $out_dir/$proclog |
|
for i in $par_sname; |
|
do |
|
resdir="$out_dir/$i-results" |
|
mkdir -p $resdir |
|
#in this directory, store |
|
# - final BAM file, the recalibrated file from GATK |
|
# - final VCF file from GATK |
|
# - consensus sequence built from final VCF (need to implement) |
|
# - final fastq reads used for assembly (compressed) |
|
cp "$alndat/$i.bam" $resdir |
|
cp "$alndat/$i.bai" $resdir |
|
cp "$vardat/$i-gatk.vcf" $resdir |
|
cp "$vardat/$i-gatk.vcf.idx" $resdir |
|
#cp consensus sequence |
|
|
|
#compress final fastq data prior to moving to results dir |
|
parallel --no-notice gzip {} ::: `ls $alndat/$i-paired_*.fastq` |
|
parallel --no-notice mv {} $resdir ::: `ls $alndat/$i-paired_*.gz` |
|
done |
|
log_msg "finished writing final results" $out_dir/$proclog |
|
|
|
#clean up temporary files, if specified |
|
if [ $intarch -eq 1 ] |
|
then |
|
log_msg "Compressing intermediate files" $out_dir/$proclog |
|
#make tarballs |
|
tar -czf "$tmp_fq.tar.gz" $tmp_fq |
|
tar -czf "$fin_fq.tar.gz" $tmp_fq |
|
tar -czf "$alndat.tar.gz" $alndat |
|
tac -czf "$vardat.tar.gz" $vardat |
|
log_msg "archives finished" $out_dir/$proclog |
|
fi |
|
|
|
#all done |
|
log_msg "All samples fully processed, exiting" $out_dir/$proclog |
Hi, what do you use for known sites, an empty file? The BaseRecalibrator step "does a by-locus traversal operating only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative of poor base quality". Is this an appropriate assumption for viruses? Thanks.
BaseRecalibrator requires a list of validated variants, such as dbSNP. There are somewhat arbitrary guidelines for creating your own list (see here), but it's probably not relevant for your experiment.
Thanks. IndelRealigner and BaseRecalibrator are not relevant. So I went directly from remove duplicates to HaplotypeCaller. The command ran successfully with no errors but resulted in 0 variants! Perhaps because a mutation rate of 20% at a genome position is not considered 'significant' for humans, which is what GATK was designed for?
I'll try next creating a vcf file with the mutations I identified manually and using this as a "known" vcf db file and see if GATK calls these.
FYI. Not called even when in 'known' vcf file.I think I need to go the old fashioned way and read the literature!!
You have to remember that calling variants isn't as simple as counting the number of bases at a given position that don't match the reference. You have to consider the quality of the read, mapping quality, location of the aligned position within the read and etc.
You wouldn't want to call SNPs off of low quality bases located in the 3' end of the read, they're likely to be
Give me a moment, I'll pull up a script I have. It won't translate directly, and include the whole process from raw fq data, but you can see the steps I took. Check my OP.
EDIT: Sorry, it is very long, I'll give you the gist here.
Assuming you've assembled and etc:
The step VariantsToTable is what give you the final VCF, with properly called variants.
Thank you, Joe!