|
#!/usr/bin/env perl |
|
use strict; |
|
use warnings; |
|
|
|
use Bio::Tools::GFF; |
|
use Bio::SeqFeatureI; |
|
|
|
my %fstore1 = (); |
|
my %fstore2 = (); |
|
my %conflicts = (); |
|
|
|
# specify 2 input files to compare |
|
my $gffio1 = Bio::Tools::GFF->new(-file => $ARGV[0], -gff_version => 3); |
|
my $gffio2 = Bio::Tools::GFF->new(-file => $ARGV[1], -gff_version => 3); |
|
# gff file that contains new features in file 2 |
|
my $create = Bio::Tools::GFF->new(-gff_version => 3, |
|
-file => ">gffdiff_create.gff3"); |
|
# gff file that contains the fieatures missing in file 2 from file 1 |
|
my $delete = Bio::Tools::GFF->new(-gff_version => 3, |
|
-file => ">gffdiff_delete.gff3"); |
|
# file with conflicting features, e.g. position identical but different ID |
|
my $conffh; |
|
open ($conffh, '>' , "gffdiff_conflicts") or die "couldn't open gffdiff_conflicts $!\n"; |
|
|
|
# loop over the input stream file 1 |
|
while(my $feature = $gffio1->next_feature()) { |
|
my $id = ($feature->primary_id) ? |
|
$feature->primary_id : new_id($feature); |
|
### some sanity checks |
|
die "something went wrong with the ID" unless $id; |
|
die "duplicate feature ID $id" if exists $fstore1{$id}; |
|
$fstore1{$id} = $feature; |
|
} |
|
print "read ".scalar ( keys %fstore1)." unique features from file 1\n"; |
|
$gffio1->close(); |
|
# read features of second file |
|
my $new = my $changed = my $conflicts = my $deleted = 0; |
|
while(my $feature = $gffio2->next_feature()) { |
|
my $id = ($feature->primary_id) ? |
|
$feature->primary_id : new_id($feature); |
|
$fstore2{$id} = $feature; |
|
### some sanity checks |
|
die "something went wrong with the ID" unless $id; |
|
my @what = (); |
|
if (my $a = is_new($id, $feature)) { |
|
# print "Feature $id is new\n"; |
|
if (ref $a) { |
|
$conflicts++; |
|
## store conflict not to delete it later |
|
$conflicts{new_id($a)} = 1; |
|
print $conffh " >>>>> ".$ARGV[0]."\nC: ".$a->gff_string()."\n-----\n"; |
|
print $conffh " <<<<< " .$ARGV[1]."\nC: ".$feature->gff_string()."\n<<<<<\n"; |
|
} else { |
|
$create->write_feature($feature); |
|
$new++; |
|
} |
|
} |
|
elsif (@what = what_changed($id, $feature)) { |
|
#print "Feature $id has different ". (join " and ", @what)."\n"; |
|
#print "- ".$fstore1{$id}->gff_string."\n"; |
|
$delete->write_feature($fstore1{$id}); |
|
#print "+ ".$feature->gff_string()."\n"; |
|
$create->write_feature($feature); |
|
$changed++; |
|
|
|
} else { |
|
# print "$id unchanged\n"; |
|
} |
|
|
|
} |
|
$gffio2->close(); |
|
print "read ".scalar ( keys %fstore2)." unique features from file 2\n"; |
|
|
|
|
|
foreach my $key (keys %fstore1) { |
|
if (! exists $fstore2{$key}) { |
|
# protect feature from deletion if it is already known as an ID conflict |
|
if (! exists $conflicts{new_id($fstore1{$key})} ) { |
|
$delete->write_feature($fstore1{$key}); |
|
$deleted++; |
|
} |
|
} |
|
|
|
} |
|
$create->close; |
|
$delete->close; |
|
close $conffh or die $!; |
|
|
|
|
|
print "$new new features, $changed changed, $conflicts conflicting, $deleted deleted features\n"; |
|
|
|
|
|
|
|
sub new_id { |
|
my $feature = shift; |
|
my $start = $feature->start; |
|
my $end = $feature->end; |
|
my $strand = $feature->strand; |
|
my $type = $feature->primary_tag(); |
|
my $seq_id = $feature->seq_id(); |
|
return "$seq_id:$type:$start:$end:$strand"; |
|
} |
|
|
|
|
|
sub is_new { |
|
my $id = shift; |
|
my $feature = shift; |
|
if (! exists $fstore1{$id}) { |
|
if (! exists $fstore1{new_id($feature)}) { |
|
return 1; |
|
} else { |
|
return $fstore1{new_id($feature)} |
|
} |
|
} |
|
return 0; |
|
} |
|
|
|
sub what_changed { |
|
my ($id, $feature) = @_; |
|
die "no id" unless $id; |
|
die "no feature for id $id" unless ref $feature; |
|
unless (ref $fstore1{$id}) { |
|
warn "$id not found, shouldn't happen:".$fstore1{$id}; |
|
return (); |
|
} |
|
my @what = (); |
|
## quick and dirty check |
|
return () if $feature->gff_string eq $fstore1{$id}->gff_string; |
|
## now we have to find out what's different (we ignore most tags for now): |
|
#$feature->strand ne $fstore1{$id}->strand || |
|
push @what, |
|
"position/strand" unless ($feature->equals($fstore1{$id}), 'strong'); |
|
push @what, "phase" if $feature->phase ne $fstore1{$id}->phase; |
|
push @what, "primary_tag" if $feature->primary_tag() ne $fstore1{$id}->primary_tag(); |
|
push @what, "source_tag" if $feature->source_tag() ne $fstore1{$id}->source_tag(); |
|
push @what, "display_id" if $feature->display_id() ne $fstore1{$id}->display_id(); |
|
## TODO: check all tags: TODO: |
|
### my %tags1 = map $fstore1{$id}->get_all_tags(); |
|
return @what; |
|
} |
Compare annotation tool
agat_sp_compare_annotations.pl
from AGAT toolkit. Install AGAT usingconda
.Thank you!! I will try it :)