filtering dataframe using multiple columns in R
3
Hi guys,
So, I have this data frame (for example):
CGDid Mass Source.File pepSeq
C1_00060W_A 5117.5552 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
C1_00060W_A 5117.5552 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
C1_00060W_A 5117.5552 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
C1_00061W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00063W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6305.1489 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00062W_A 6305.1489 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
What I want is to filter for rows where the columns CGDid, Mass and pepSeq are equal but Source.File is different. I am trying to figure out the best way to do this, but I am stuck.
I would really appreciate it if someone could help me!
All the best,
Andreia
filter
R
dataframe
• 1.3k views
# Read the tab-separated input table.
t1 <- readr::read_tsv("./1.t")
# Group by the key columns, then keep one row per distinct Source.File within
# each (CGDid, Mass, pepSeq) group. Written as nested namespaced calls because
# `%>%` is not in scope unless dplyr/magrittr has been attached.
dplyr::distinct(dplyr::group_by(t1, CGDid, Mass, pepSeq), Source.File)
output:
# A tibble: 9 x 4
# Groups: CGDid, Mass, pepSeq [6]
CGDid Mass Source.File pepSeq
<chr> <dbl> <chr> <chr>
1 C1_00060W… 5118. T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
2 C1_00060W… 5118. T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
3 C1_00061W… 6304. T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
4 C1_00060W… 6304. T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
5 C1_00060W… 6304. T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
6 C1_00063W… 6304. T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
7 C1_00060W… 6305. T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
8 C1_00060W… 6305. T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
9 C1_00062W… 6305. T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
Using aggregate :
# For every (CGDid, Mass, pepSeq) combination in `d`, report the distinct
# source files as one comma-separated string together with how many there are.
aggregate(
  Source.File ~ CGDid + Mass + pepSeq,
  data = d,
  FUN = function(files) {
    distinct_files <- unique(files)
    c(names = toString(distinct_files), count = length(distinct_files))
  }
)
# CGDid Mass pepSeq Source.File.names Source.File.count
# 1 C1_00060W_A 6305.149 K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A T0_4_excl.raw, T0_4.raw 2
# 2 C1_00062W_A 6305.149 K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A T0_4.raw 1
# 3 C1_00060W_A 5117.555 NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK T0_4_excl.raw, T0_4.raw 2
# 4 C1_00060W_A 6304.165 NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR T0_4.raw, T0_4_excl.raw 2
# 5 C1_00061W_A 6304.165 NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR T0_4.raw 1
# 6 C1_00063W_A 6304.165 NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR T0_4.raw 1
# Example data: the table from the question, parsed from an inline string into
# the data frame `d`. The aggregate() call shown above reads it via `data = d`,
# so `d` has to be created before that call is run. The first line of the text
# supplies the column names (header = TRUE).
d <- read.table(text = " CGDid Mass Source.File pepSeq
C1_00060W_A 5117.5552 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
C1_00060W_A 5117.5552 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
C1_00060W_A 5117.5552 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
C1_00061W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00063W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6304.165 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
C1_00060W_A 6305.1489 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00062W_A 6305.1489 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
C1_00060W_A 6305.1489 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A",
header = TRUE)
library(data.table)
# Read the space-separated table; the first line holds the column names.
df <- fread("data.txt", sep = " ", header = TRUE)
CGDid Mass Source.File pepSeq
1: C1_00060W_A 5117.555 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
2: C1_00060W_A 5117.555 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
3: C1_00060W_A 5117.555 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
4: C1_00061W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
5: C1_00060W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
6: C1_00060W_A 6304.165 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
7: C1_00063W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
8: C1_00060W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
9: C1_00060W_A 6304.165 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
10: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
11: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
12: C1_00060W_A 6305.149 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
13: C1_00062W_A 6305.149 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
14: C1_00060W_A 6305.149 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
15: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
# Drop rows that are exact duplicates across all columns.
df <- unique(df)
CGDid Mass Source.File pepSeq
1: C1_00060W_A 5117.555 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
2: C1_00060W_A 5117.555 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
3: C1_00061W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
4: C1_00060W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
5: C1_00060W_A 6304.165 T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
6: C1_00063W_A 6304.165 T0_4.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
7: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
8: C1_00060W_A 6305.149 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
9: C1_00062W_A 6305.149 T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
•
link
updated 3.5 years ago by
Ram
44k
•
written 3.5 years ago by
PKW
▴
110
Login before adding your answer.
Traffic: 3084 users visited in the last hour
Thank you all :)
Instead of adding an answer thanking people, please upvote helpful answers and accept answers that solve your problem.