Hi all,
Please forgive me if this question is not asked in the proper way, as this is my first post.
I am currently doing motif analysis on a ChIP-seq data set.
I have an output from FIMO where each individual motif occurrence is listed with a unique id. My aim is to identify motifs that are close to Nanog motifs.
A Nanog motif is identified by a motif_alt_id value that starts with "3-" (regex "^3-"). Using the start and stop columns, I want to find motifs that overlap with Nanog motifs, so as to establish a pattern of occurrence.
dataframe from fimo:
data.frame( stringsAsFactors = FALSE, row.names = c("32607","57458","62651", "76805","109343","136670","172499","221228","231748", "232284"), motif_id = c("motif_er_4","motif_er_61", "motif_er_24","motif_er_45","motif_er_28","motif_er_33", "motif_er_8","motif_er_51","motif_er_26","motif_er_1"), motif_alt_id = c("4-TGGGTGKGGY","11-GGGTGGGG", "24-TCGCCACAGG","20-CCCATTGAGAAG","3-AGCCACACCCWN", "8-GCACTTCAAAGG","8-TGCATAACAA","1-RACAAAGG", "1-TTTGCATWWCAA","1-CCTTTGTTMT"), sequence_name = c("X_1009","X_1009","X_1009", "X_1009","X_1009","X_1009","X_1009","X_1009","X_1009", "X_1009"), start = c(433L, 434L, 308L, 424L, 432L, 159L, 284L, 66L, 282L, 64L), stop = c(442L, 441L, 317L, 435L, 443L, 170L, 293L, 73L, 293L, 73L), strand = c("+", "+", "-", "-", "-", "+", "+", "+", "+", "-"), score = c(14.3028,13.7615,14.3761, 13.4895,13.3394,12.8288,11.4912,11.7615,11.8203,11.2566), p.value = c(5.01e-06,8.13e-06,8.92e-06, 1.04e-05,1.52e-05,1.8e-05,2.43e-05,3.26e-05,3.47e-05, 3.48e-05), q.value = c(0.0509,0.044,0.552,0.384, 0.0629,0.262,0.506,0.273,0.185,0.264), matched_sequence = c("GGGGTGGGGT","GGGTGGGG", "TAGCCACAGG","CCCATAGAGATG","CACCCCACCCCA","TAACTTCAAAGC", "TGCATAAGAA","AACAAAGG","TATGCATAAGAA","CCTTTGTTGC") )
I defined nanog_id as follows: nanog_id <- c("3-AGCCACACCCWN", "3-GAGCCATTAA", "3-GCMATTAA")
What I have done so far:
nanog_list <- fimo
fimo %>% mutate(overlap_seq = ifelse(x[["motif_alt_id"]] %in% nanog_id, ifelse(x[["start"]]+1 %in% nanog_list[["start"]], x[["matched_sequence"]] , 0), NA))
result: data.frame( stringsAsFactors = FALSE, row.names = c("32607","57458","62651", "76805","109343","136670","172499","221228","231748", "232284","260330","293617"), motif_id = c("motif_er_4","motif_er_61", "motif_er_24","motif_er_45","motif_er_28","motif_er_33", "motif_er_8","motif_er_51","motif_er_26","motif_er_1", "motif_er_53","motif_er_61"), motif_alt_id = c("4-TGGGTGKGGY","11-GGGTGGGG", "24-TCGCCACAGG","20-CCCATTGAGAAG","3-AGCCACACCCWN", "8-GCACTTCAAAGG","8-TGCATAACAA","1-RACAAAGG", "1-TTTGCATWWCAA","1-CCTTTGTTMT","3-GCMATTAA","11-GGGTGGGG"), sequence_name = c("X_1009","X_1009","X_1009", "X_1009","X_1009","X_1009","X_1009","X_1009","X_1009", "X_1009","X_1009","X_1009"), start = c(433L,434L,308L,424L,432L, 159L,284L,66L,282L,64L,216L,105L), stop = c(442L,441L,317L,435L,443L, 170L,293L,73L,293L,73L,223L,112L), strand = c("+","+","-","-","-","+", "+","+","+","-","-","+"), score = c(14.3028,13.7615,14.3761, 13.4895,13.3394,12.8288,11.4912,11.7615,11.8203,11.2566, 10.8036,11.8073), p.value = c(5.01e-06,8.13e-06,8.92e-06, 1.04e-05,1.52e-05,1.8e-05,2.43e-05,3.26e-05,3.47e-05, 3.48e-05,3.96e-05,4.59e-05), q.value = c(0.0509,0.044,0.552,0.384, 0.0629,0.262,0.506,0.273,0.185,0.264,0.766,0.0884), matched_sequence = c("GGGGTGGGGT","GGGTGGGG", "TAGCCACAGG","CCCATAGAGATG","CACCCCACCCCA","TAACTTCAAAGC", "TGCATAAGAA","AACAAAGG","TATGCATAAGAA","CCTTTGTTGC", "GCCATTAA","GGGAGGAG"), overlap_seq = c(NA,NA,NA,NA,"CACCCCACCCCA", NA,NA,NA,NA,NA,"GCCATTAA",NA) )
However, what i am expecting is: data.frame( stringsAsFactors = FALSE, row.names = c("32607","57458","62651", "76805","109343","136670","172499","221228","231748", "232284"), motif_id = c("motif_er_4","motif_er_61", "motif_er_24","motif_er_45","motif_er_28","motif_er_33", "motif_er_8","motif_er_51","motif_er_26","motif_er_1"), motif_alt_id = c("4-TGGGTGKGGY","11-GGGTGGGG", "24-TCGCCACAGG","20-CCCATTGAGAAG","3-AGCCACACCCWN", "8-GCACTTCAAAGG","8-TGCATAACAA","1-RACAAAGG", "1-TTTGCATWWCAA","1-CCTTTGTTMT"), sequence_name = c("X_1009","X_1009","X_1009", "X_1009","X_1009","X_1009","X_1009","X_1009","X_1009", "X_1009"), start = c(433L, 434L, 308L, 424L, 432L, 159L, 284L, 66L, 282L, 64L), stop = c(442L, 441L, 317L, 435L, 443L, 170L, 293L, 73L, 293L, 73L), strand = c("+", "+", "-", "-", "-", "+", "+", "+", "+", "-"), score = c(14.3028,13.7615,14.3761, 13.4895,13.3394,12.8288,11.4912,11.7615,11.8203,11.2566), p.value = c(5.01e-06,8.13e-06,8.92e-06, 1.04e-05,1.52e-05,1.8e-05,2.43e-05,3.26e-05,3.47e-05, 3.48e-05), q.value = c(0.0509,0.044,0.552,0.384, 0.0629,0.262,0.506,0.273,0.185,0.264), matched_sequence = c("GGGGTGGGGT","GGGTGGGG", "TAGCCACAGG","CCCATAGAGATG","CACCCCACCCCA","TAACTTCAAAGC", "TGCATAAGAA","AACAAAGG","TATGCATAAGAA","CCTTTGTTGC"), overlap_seq = c(NA, NA, NA, NA, "4-TGGGTGKGGY", NA, NA, NA, NA, NA) )
For each motif_alt_id that starts with "3-" (regex "^3-"), I want to find the matched sequence.
E.g. "3-AGCCACACCCWN" has a start of 432 and a stop of 443; the closest motif that overlaps with it is "4-TGGGTGKGGY" (start 433, stop 442). I want the list of motifs that overlap with each Nanog motif.