Question

How to iterate over all elements of a dataframe and make changes based on a function

0

Entering edit mode

3.3 years ago

maria2019 ▴ 250

I have a data frame with many columns, some of which contain vectors. For every cell in the data frame, if all elements of the vector are identical, I want to replace the vector with that single element; if at least one element differs, I want to keep the vector as it is.

an example of my dataframe:

enter image description here

this is the output that I want

enter image description here

Basically,

> if length(unique(each_vector)) == 1 -> keep only the first element of the vector; otherwise, do nothing.

I would be thankful if someone could help

function R datafram • 1.9k views

ADD COMMENT • link 3.2 years ago by maria2019 ▴ 250

0

Entering edit mode

I guess first, how did you end up in this situation? Second, are the data frame elements lists, or are they filled with strings like "c(1, 2, 3)"?

ADD REPLY • link 3.3 years ago by rpolicastro 13k

0

Entering edit mode

No, they are not filled with strings. They are exactly the same as in the photo that I posted.

It is the output of the following code:

# Group the rows by ID; FUN = "list" keeps every value of each column per
# group, which is what produces the vector-valued cells.
df <- aggregate(
  df,
  by = list(df$ID),
  FUN = "list"
)

ADD REPLY • link 3.3 years ago by maria2019 ▴ 250

0

Entering edit mode

Can you add your data to the post with dput(df)?

ADD REPLY • link 3.3 years ago by rpolicastro 13k

0

Entering edit mode

    dput(df)


"HLA-DRB1*14:05:01", "HLA-DRB1*14:54:01", 
"HLA-DRB1*15:01:01:01", "HLA-DRB1*15:01:01:02", "HLA-DRB1*15:01:01:03", 
"HLA-DRB1*15:01:01:04", "HLA-DRB1*15:02:01", "HLA-DRB1*15:03:01:01", 
"HLA-DRB1*15:03:01:02", "HLA-DRB1*16:02:01"), class = "factor"), 
    start = c(155190174L, 155190174L, 155190174L, 155190174L, 
    155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
    155190174L, 155190174L), end = c(155190174L, 155190174L, 
    155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
    155190174L, 155190174L, 155190174L, 155190174L), width = c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), strand = structure(c(3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("+", 
    "-", "*"), class = "factor"), REF = c("C", "C", "C", "C", 
    "C", "C", "C", "C", "C", "C", "C"), ALT = c("G", "G", "G", 
    "G", "G", "G", "G", "G", "G", "G", "G"), FILTER = c("VQSRTrancheSNP99.90to100.00", 
    "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
    "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
    "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
    "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
    "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00"
    ), IMPACT_SnpEff = c("MODIFIER", "MODIFIER", "MODIFIER", 
    "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", 
    "MODIFIER", "MODIFIER", "MODIFIER"), Feature_SnpEff = c("ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1"), Feature_VEP = c("ENST00000467134", 
    "ENST00000468978", "ENST00000471283", "ENST00000473363", 
    "ENST00000485118", "ENST00000498431", "ENST00000610359", 
    "ENST00000610468", "ENST00000611571", "ENST00000611577", 
    "ENST00000612778"), BIOTYPE_SnpEff = c("protein_coding", 
    "protein_coding", "protein_coding", "protein_coding", "protein_coding", 
    "protein_coding", "protein_coding", "protein_coding", "protein_coding", 
    "protein_coding", "protein_coding"), Feature_type_SnpEff = c("transcript", 
    "transcript", "transcript", "transcript", "transcript", "transcript", 
    "transcript", "transcript", "transcript", "transcript", "transcript"
    )), row.names = 290:300, class = "data.frame")

    # Group the rows by Row.names, keeping every value of each column per
    # group, then print a reproducible representation of the result.
    df_agg <- aggregate(
      df,
      by = list(df$Row.names),
      FUN = "list"
    )
    dput(df_agg)


    "HLA-DRB1*14:05:01", "HLA-DRB1*14:54:01", "HLA-DRB1*15:01:01:01", 
    "HLA-DRB1*15:01:01:02", "HLA-DRB1*15:01:01:03", "HLA-DRB1*15:01:01:04", 
    "HLA-DRB1*15:02:01", "HLA-DRB1*15:03:01:01", "HLA-DRB1*15:03:01:02", 
    "HLA-DRB1*16:02:01"), class = "factor")), start = list(c(155190174L, 
155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 155190174L
), c(155190178L, 155190178L, 155190178L, 155190178L, 155190178L, 
155190178L)), end = list(c(155190174L, 155190174L, 155190174L, 
155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 155190174L, 
155190174L, 155190174L, 155190174L, 155190174L), c(155190178L, 
155190178L, 155190178L, 155190178L, 155190178L, 155190178L)), 
    width = list(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
        c(1L, 1L, 1L, 1L, 1L, 1L)), strand = list(structure(c(3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("+", "-", 
    "*"), class = "factor"), structure(c(3L, 3L, 3L, 3L, 3L, 
    3L), .Label = c("+", "-", "*"), class = "factor")), REF = list(
        c("C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", 
        "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", 
        "C", "C", "C"), c("C", "C", "C", "C", "C", "C")), ALT = list(
        c("G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", 
        "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", 
        "G", "G", "G"), c("G", "G", "G", "G", "G", "G")), FILTER = list(
        c("VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00"), c("VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00", "VQSRTrancheSNP99.90to100.00", 
        "VQSRTrancheSNP99.90to100.00")), IMPACT_SnpEff = list(
        c("MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", 
        "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", 
        "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", 
        "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", 
        "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER"
        ), c("MODIFIER", "MODIFIER", "MODIFIER", "MODIFIER", 
        "MODIFIER", "MODIFIER")), Feature_SnpEff = list(c("ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1", 
    "ENST00000624572.1", "ENST00000624572.1", "ENST00000624572.1"
    ), c("ENST00000473363.2", "ENST00000473363.2", "ENST00000473363.2", 
    "ENST00000473363.2", "ENST00000473363.2", "ENST00000473363.2"
    )), Feature_VEP = list(c("ENST00000368392", "ENST00000368393", 
    "ENST00000368396", "ENST00000368398", "ENST00000438413", 
    "ENST00000447623", "ENST00000457295", "ENST00000462215", 
    "ENST00000462317", "ENST00000466913", "ENST00000467134", 
    "ENST00000468978", "ENST00000471283", "ENST00000473363", 
    "ENST00000485118", "ENST00000498431", "ENST00000610359", 
    "ENST00000610468", "ENST00000611571", "ENST00000611577", 
    "ENST00000612778", "ENST00000614519", "ENST00000615517", 
    "ENST00000620103", "ENST00000620770"), c("ENST00000337604", 
    "ENST00000338684", "ENST00000342482", "ENST00000343256", 
    "ENST00000368389", "ENST00000368390")), BIOTYPE_SnpEff = list(
        c("protein_coding", "protein_coding", "protein_coding", 
        "protein_coding", "protein_coding", "protein_coding", 
        "protein_coding", "protein_coding", "protein_coding", 
        "protein_coding", "protein_coding", "protein_coding", 
        "protein_coding", "protein_coding", "protein_coding", 
        "protein_coding", "protein_coding", "protein_coding", 
        "protein_coding", "protein_coding", "protein_coding", 
        "protein_coding", "protein_coding", "protein_coding", 
        "protein_coding"), c("nonsense_mediated_decay", "nonsense_mediated_decay", 
        "nonsense_mediated_decay", "nonsense_mediated_decay", 
        "nonsense_mediated_decay", "nonsense_mediated_decay")), 
    Feature_type_SnpEff = list(c("transcript", "transcript", 
    "transcript", "transcript", "transcript", "transcript", "transcript", 
    "transcript", "transcript", "transcript", "transcript", "transcript", 
    "transcript", "transcript", "transcript", "transcript", "transcript", 
    "transcript", "transcript", "transcript", "transcript", "transcript", 
    "transcript", "transcript", "transcript"), c("transcript", 
    "transcript", "transcript", "transcript", "transcript", "transcript"
    ))), row.names = c(NA, -2L), class = "data.frame")

ADD REPLY • link updated 3.3 years ago by rpolicastro 13k • written 3.3 years ago by maria2019 ▴ 250

score 2 · Accepted Answer · 2021-11-19

Example data.

# Example data: an integer id column plus two list-columns whose cells hold
# vectors of varying length.
df <- data.frame(row = 1:4)
df$col1 <- list(
  "X",
  c("X", "X", "X"),
  c("X", "X", "Y"),
  "Y"
)
df$col2 <- list(
  c(1, 2),
  c(1, 1, 1, 1, 1),
  1,
  c(1, 2, 3)
)

> df
  row    col1          col2
1   1       X          1, 2
2   2 X, X, X 1, 1, 1, 1, 1
3   3 X, X, Y             1
4   4       Y       1, 2, 3

base R answer.

# Collapse a vector to its single distinct value when all of its elements are
# identical; otherwise return the vector unchanged.
check_unique <- \(x) if (length(unique(x)) == 1) unique(x) else x

# Apply check_unique() to every cell of one column.
# lapply() is used instead of sapply() so the result is never simplified to a
# matrix when all cells happen to keep the same length greater than one.
collapse_column <- \(col) {
  if (!is.list(col)) {
    return(col) # atomic column: every cell is already a single value
  }
  out <- lapply(col, check_unique)
  # When every cell reduced to one value, return a plain atomic column.
  if (length(out) > 0L && all(lengths(out) == 1L)) {
    unlist(out, recursive = FALSE)
  } else {
    out
  }
}

# Process every column except the first one (the row identifier).
# seq_len(...)[-1] is empty for a one-column data frame (2:ncol(df) would
# count backwards), and df[cols] never drops to a bare column the way
# df[, cols] does when only one column is selected.
target_cols <- seq_len(ncol(df))[-1]
if (length(target_cols) > 0L) {
  df[target_cols] <- lapply(df[target_cols], collapse_column)
}

> df
  row    col1    col2
1   1       X    1, 2
2   2       X       1
3   3 X, X, Y       1
4   4       Y 1, 2, 3