filtering dataframe using multiple columns in R
3
0
Entering edit mode
3.5 years ago

Hi guys,

So, I have this data frame (for example):

   CGDid           Mass      Source.File        pepSeq
 C1_00060W_A    5117.5552   T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00061W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00063W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00062W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A

What I want is to filter for the rows where the columns CGDid, Mass and pepSeq are equal but Source.File is different. I am trying to figure out the best way to do this, but I am stuck. I would really appreciate it if someone could help me!

All the best, Andreia

filter R dataframe • 1.3k views
ADD COMMENT
0
Entering edit mode

Thank you all :)

ADD REPLY
0
Entering edit mode

Instead of adding an answer thanking people, please upvote helpful answers and accept answers that solve your problem.

ADD REPLY
1
Entering edit mode
3.5 years ago
Zhilong Jia ★ 2.2k
# Read the tab-separated example table (file path as given in the answer).
t1 <- readr::read_tsv("./1.t")
# Keep one row per distinct Source.File within each CGDid/Mass/pepSeq group.
# The calls are nested instead of piped: `%>%` is not on the search path when
# dplyr is only referenced through `dplyr::`, so the piped form fails with
# "could not find function" unless dplyr or magrittr was attached beforehand.
dplyr::distinct(dplyr::group_by(t1, CGDid, Mass, pepSeq), Source.File)

output:

# A tibble: 9 x 4
# Groups:   CGDid, Mass, pepSeq [6]
  CGDid       Mass Source.File   pepSeq                                         
  <chr>      <dbl> <chr>         <chr>                                          
1 C1_00060W… 5118. T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK 
2 C1_00060W… 5118. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK 
3 C1_00061W… 6304. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
4 C1_00060W… 6304. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
5 C1_00060W… 6304. T0_4_excl.raw NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
6 C1_00063W… 6304. T0_4.raw      NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK…
7 C1_00060W… 6305. T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
8 C1_00060W… 6305. T0_4.raw      K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
9 C1_00062W… 6305. T0_4.raw      K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVI…
ADD COMMENT
0
Entering edit mode
3.5 years ago
zx8754 12k

Using aggregate:

# For every CGDid/Mass/pepSeq combination, collect the distinct source files:
# `names` is their comma-separated list and `count` is how many there are.
# Both values are combined with c(), so `count` is stored as text as well.
aggregate(
  Source.File ~ CGDid + Mass + pepSeq,
  data = d,
  FUN = function(files) {
    distinct_files <- unique(files)
    c(names = toString(distinct_files), count = length(distinct_files))
  }
)
#         CGDid     Mass                                                       pepSeq       Source.File.names Source.File.count
# 1 C1_00060W_A 6305.149 K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A T0_4_excl.raw, T0_4.raw                 2
# 2 C1_00062W_A 6305.149 K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A                T0_4.raw                 1
# 3 C1_00060W_A 5117.555               NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK T0_4_excl.raw, T0_4.raw                 2
# 4 C1_00060W_A 6304.165     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR T0_4.raw, T0_4_excl.raw                 2
# 5 C1_00061W_A 6304.165     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR                T0_4.raw                 1
# 6 C1_00063W_A 6304.165     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR                T0_4.raw                 1


# Example data: the table from the question, parsed from inline
# whitespace-separated text. The first line supplies the column names.
d <- read.table(
  header = TRUE,
  text = "   CGDid           Mass      Source.File        pepSeq
 C1_00060W_A    5117.5552   T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00060W_A    5117.5552   T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 C1_00061W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00063W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4.raw        NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6304.165    T0_4_excl.raw   NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00062W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4.raw        K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
 C1_00060W_A    6305.1489   T0_4_excl.raw   K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A"
)
ADD COMMENT
0
Entering edit mode
3.5 years ago
PKW ▴ 110
library(data.table)
# Read the space-separated table; the first line holds the column names.
# `TRUE` is spelled out because the shorthand `T` can be reassigned.
df <- fread("data.txt", sep = " ", header = TRUE)


            CGDid     Mass   Source.File                                                       pepSeq
 1: C1_00060W_A 5117.555 T0_4_excl.raw               NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 2: C1_00060W_A 5117.555      T0_4.raw               NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 3: C1_00060W_A 5117.555      T0_4.raw               NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
 4: C1_00061W_A 6304.165      T0_4.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 5: C1_00060W_A 6304.165      T0_4.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 6: C1_00060W_A 6304.165 T0_4_excl.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 7: C1_00063W_A 6304.165      T0_4.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 8: C1_00060W_A 6304.165      T0_4.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
 9: C1_00060W_A 6304.165 T0_4_excl.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
10: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
11: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
12: C1_00060W_A 6305.149      T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
13: C1_00062W_A 6305.149      T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
14: C1_00060W_A 6305.149      T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
15: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A


# Collapse repeated rows so each combination of values appears only once.
df <- unique(df)

         CGDid     Mass   Source.File                                                       pepSeq
1: C1_00060W_A 5117.555 T0_4_excl.raw               NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
2: C1_00060W_A 5117.555      T0_4.raw               NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDK
3: C1_00061W_A 6304.165      T0_4.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
4: C1_00060W_A 6304.165      T0_4.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
5: C1_00060W_A 6304.165 T0_4_excl.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
6: C1_00063W_A 6304.165      T0_4.raw     NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR
7: C1_00060W_A 6305.149 T0_4_excl.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
8: C1_00060W_A 6305.149      T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
9: C1_00062W_A 6305.149      T0_4.raw K.NGFQQQQQQQQQQQQQQQQQQQQIVAPPAAPPAPPTPVTSLSVIDKSQYIVNPTQR.A
ADD COMMENT

Login before adding your answer.

Traffic: 1528 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6