My tool VCFFilterJS can do this.
Here is a javascript file that select the variant where one sample has a genotype different from the others:
function accept(ctx)
{
var x,y,g1,g2,count_same=0;
var sampleList=header.getSampleNamesInOrder();
/** loop over one sample */
for(x=0;x < sampleList.size();++x)
{
g1=ctx.getGenotype( sampleList.get(x) );
/** ignore non-called */
if(! g1.isCalled() ) continue;
count_same=0;
/** loop over the other samples */
for(y=0;y< sampleList.size() && count_same==0 ;++y)
{
if(x==y) continue;/* same sample ?*/
g2=ctx.getGenotype( sampleList.get(y) );
/** ignore non-called */
if(! g2.isCalled() ) continue;
/** is g1==g2 ? */
if( g1.sameGenotype( g2 ) )
{
count_same++;
}
}
/* found no other same genotype */
if(count_same==0) return true;
}
return false;
}
accept(variant);
and a test with 1000 genomes:
curl "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr1.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz" |\
gunzip -c |\
java -jar dist/vcffilterjs.jar -f select.js |\
grep -v "#" | cut -f 1-5
1 13957 rs201747181 TC T
1 51914 rs190452223 T G
1 54753 rs143174675 T G
1 55313 rs182462964 A T
1 55326 rs3107975 T C
1 55330 rs185215913 G A
1 55388 rs182711216 C T
1 55416 rs193242050 G A
1 55427 rs183189405 T C
1 62156 rs181864839 C T
1 63276 rs185977555 G A
1 66457 rs13328655 T A
1 69534 rs190717287 T C
1 72148 rs182862337 C T
1 77470 rs192898053 T C
1 79137 rs143777184 A T
1 81949 rs181567186 T C
1 83088 rs186081601 G C
1 83977 rs180759811 A G
1 84346 rs187855973 T C
verify with rs201747181
curl "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr1.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz" |\
gunzip -c |\
grep rs201747181 |\
cut -f 10- |\
tr " " "\n" |\
cut -d ':' -f 1 |\
sort |\
uniq -c
1013 0|0
26 0|1
7 1|0
1 1|1 <============ HERE
Here is another example with only one sample:
function accept(ctx)
{
var y,g2;
var sampleList=header.getSampleNamesInOrder();
var g1=ctx.getGenotype("M10475");
/** ignore non-called */
if(g1== null || ! g1.isCalled() ) return false;
/** loop over the other samples */
for(y=0;y< sampleList.size();++y)
{
g2=ctx.getGenotype( sampleList.get(y) );
if(g2.getSampleName().equals(g1.getSampleName())) continue;
/** ignore non-called */
if(! g2.isCalled() ) continue;
/** is g1==g2 ? */
if( g1.sameGenotype( g2 ) ) return false;
}
/* found no other same genotype */
return true;
}
accept(variant);
exec:
$ curl -k "https://raw.github.com/arq5x/gemini/master/test/test5.vep.snpeff.vcf" | java -jar dist/vcffilterjs.jar -f script.js | grep -A 1 CHROM
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT M10475 M10478 M10500 M128215
chr1 145273345 . T C 289.85 . AC=3;AF=0.38;AN=8;BaseQRankSum=1.062;CSQ=missense_variant|Tct/Cct|S/P|ENSG00000213240|NOTCH2NL|ENST00000369340|4/6|benign(0.238)|tolerated(0.45),missense_variant|Tct/Cct|S/P|ENSG00000213240|NOTCH2NL|ENST00000362074|3/5|benign(0.238)|tolerated(0.45),missense_variant&NMD_transcript_variant|Tct/Cct|S/P|ENSG00000255168||ENST00000468030|3/23|benign(0.416)|tolerated(0.55),missense_variant|Tct/Cct|S/P|ENSG00000213240|NOTCH2NL|ENST00000344859|3/6|possibly_damaging(0.545)|tolerated(0.44);DP=1000;DS;Dels=0.00;EFF=EXON(MODIFIER|||||RP11-458D21.5|nonsense_mediated_decay|NON_CODING|ENST00000468030|3),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Tct/Cct|S67P|230|NOTCH2NL|protein_coding|CODING|ENST00000344859|),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Tct/Cct|S67P|236|NOTCH2NL|protein_coding|CODING|ENST00000362074|),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Tct/Cct|S67P|236|NOTCH2NL|protein_coding|CODING|ENST00000369340|);FS=3.974;HRun=1;HaplotypeScore=17.4275;MQ=29.25;MQ0=0;MQRankSum=-1.370;QD=0.39;ReadPosRankSum=-1.117 GT:AD:DP:GQ:PL 0/0:226,22:250:99:0,158,4259 0/1:224,24:250:6:6,0,5314 0/1:219,28:249:57:57,0,5027 0/1:215,34:250:99:269,0,3796
Visit GeneTalk and perform an inheritance filter step (look for dominant variants) after you have uploaded the multiple VCF file and set up the pedigree information.
Thank you! Alexej
Dear all,
When I ran this command which worked before, but now it did not work.. I could not figure it out..
I doubt my super cluster/ computer has problem ...
Old problem solved but new one comes...
Here is commands that I use:
Here is the script.js (from Pierre)
output file is small as 26K. here is the output look like
But no MORE information after the line of (
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
)Sorry that I posted question as an answer here!