Here's a naive approach:
library("Biostrings")
library("seqinr")
## Load protein sequences
#Protein_Seq <- read.fasta("Protein_sequneces.fa", seqtype="AA", as.string="TRUE")
#Protein_Sequences_Only <- getSequence(Protein_Seq, as.string=TRUE)
data("BLOSUM62")
protein_dat <- read.table("Protein_sequneces.dat")
Smith_Waterman_Scores <- data.frame(Seq1=as.numeric(),
Seq2=as.numeric(),
Score=as.numeric(),
stringsAsFactors=FALSE)
## Perform alignment
for (i in 1:length(protein_dat)) {
for (j in 1:length(protein_dat))
{
t <- pairwiseAlignment(protein_dat[i,], protein_dat[j,],
substitutionMatrix=BLOSUM62,type="local")
Smith_Waterman_Scores <- rbind(Smith_Waterman_Scores, c(i,j,t@score))
}
}
names(Smith_Waterman_Scores)[1] <- "First.Protein"
names(Smith_Waterman_Scores)[2] <- "Second.Protein"
names(Smith_Waterman_Scores)[3] <- "Score"
### Normalize Smith-Waterman similarity scores
dt <- data.table(Smith_Waterman_Scores)
dt.lookup <- dt[First.Protein == Second.Protein]
setkey(dt,"First.Protein" )
setkey(dt.lookup,"First.Protein" )
colnames(dt.lookup) <- c("First.Protein","Second.Protein","Score1")
dt <- dt[dt.lookup]
setkey(dt,"Second.Protein" )
setkey(dt.lookup,"Second.Protein")
colnames(dt.lookup) <- c("First.Protein","Second.Protein","Score2")
dt <- dt[dt.lookup][
, Normalized := Score / (sqrt(Score1) * sqrt(Score2))][
, .(First.Protein, Second.Protein, Normalized)]
dt <- dt[order(dt$First.Protein),]
Smith_Waterman_Scores <- as.data.frame(dt)
Given the number of pre-existing alignment tools, the most efficient method would be to not write anything and just use someone elses (likely online and easily findable via google) tool.
I ended up using
R
which was reasonably fast for my limited computational resources. I usedBiostrings::pairwiseAlignment
.is there any answer?