How to separate combined values?
2.5 years ago
Hyper_Odin ▴ 320

Hello all,

I got this matrix after pivoting the data; now I am looking for a solution where this

can be converted to this deletion; snv.

I am not able to figure out how to do this. I have tried separate(), but it doesn't work.

I would be grateful if somebody could at least give me a hint on how to do it!


Please use dput() to provide example data.

Here it is :

    structure(list(id = c("H021-2YTE1K", "H021-4YZREF", "H021-FYR8SS"), CC2D2A = list("insertion", NULL, NULL), BRAF = list("deletion", 
    NULL, NULL), ZNF711 = list("insertion", NULL, NULL), SRRM1 = list(
    NULL, "deletion", NULL), ZFP69B = list(NULL, "insertion", 
    NULL), MRPL19 = list(NULL, "deletion", NULL), CDKN1A = list(
    NULL, "insertion", NULL), RBM12B = list(NULL, "insertion", 
    NULL), MTUS2 = list(NULL, "deletion", NULL), EVPL = list(
    NULL, "insertion", NULL), SLC25A5 = list(NULL, "deletion", 
    NULL), CUL4B = list(NULL, "deletion", NULL), `IGKV1D-8` = list(
    NULL, NULL, "insertion"), FGFR3 = list(NULL, NULL, "deletion"), 
    SEMA3A = list(NULL, NULL, "insertion"), ENPP2 = list(NULL, 
        NULL, "deletion"), `IGLV3-1` = list(NULL, NULL, c("deletion", 
    "SNV", "SNV", "SNV")), MTMR11 = list("SNV", NULL, NULL), 
    DNMT3A = list("SNV", NULL, NULL), GTF3C2 = list("SNV", NULL, 
        NULL), TNS1 = list("SNV", NULL, NULL), OXTR = list("SNV", 
        NULL, NULL), PET112 = list("SNV", NULL, NULL), GPBP1 = list(
        "SNV", NULL, NULL), PGBD1 = list("SNV", NULL, NULL), 
    DST = list("SNV", NULL, NULL), FNDC1 = list("SNV", NULL, 
        NULL), DYNC1I1 = list("SNV", NULL, NULL), ZNF251 = list(
        "SNV", NULL, NULL), ZDHHC12 = list("SNV", NULL, NULL), 
    C12orf40 = list("SNV", NULL, NULL), HSD17B6 = list("SNV", 
        "SNV", NULL), FGD6 = list("SNV", NULL, NULL), CENPJ = list(
        "SNV", NULL, NULL), NRG4 = list("SNV", NULL, NULL), EIF4A1 = list(
        "SNV", NULL, NULL), TAF15 = list("SNV", NULL, NULL), 
    HEATR6 = list("SNV", NULL, NULL), FADS6 = list("SNV", NULL, 
        NULL), RAD23A = list("SNV", NULL, NULL), TMEM161A = list(
        "SNV", NULL, NULL), ZNF808 = list("SNV", NULL, NULL), 
    GNB1L = list("SNV", NULL, NULL), CELSR1 = list("SNV", NULL, 
        NULL), SAMD11 = list(NULL, "SNV", NULL), NOC2L = list(
        NULL, "SNV", NULL), MXRA8 = list(NULL, "SNV", NULL), 
    NOL9 = list(NULL, "SNV", NULL), SLC45A1 = list(NULL, "SNV", 
        NULL), H6PD = list(NULL, "SNV", NULL), KIF17 = list(NULL, 
        "SNV", NULL), RAP1GAP = list(NULL, "SNV", NULL), IFNLR1 = list(
        NULL, "SNV", NULL), GRHL3 = list(NULL, "SNV", NULL), 
    DCDC2B = list(NULL, "SNV", NULL), FAM167B = list(NULL, "SNV", 
        NULL), CLSPN = list(NULL, "SNV", NULL), MACF1 = list(
        NULL, "SNV", NULL), WDR65 = list(NULL, "SNV", NULL), 
    DMRTA2 = list(NULL, "SNV", NULL), ZFYVE9 = list(NULL, "SNV", 
        NULL), PARS2 = list(NULL, "SNV", NULL), INADL = list(
        NULL, "SNV", NULL), ITGB3BP = list(NULL, "SNV", NULL), 
    MIER1 = list(NULL, "SNV", NULL), CTH = list(NULL, "SNV", 
        NULL), GLMN = list(NULL, "SNV", NULL), SASS6 = list(NULL, 
        "SNV", NULL), ALX3 = list(NULL, "SNV", NULL), PTPN22 = list(
        NULL, "SNV", NULL), CHRNB2 = list(NULL, "SNV", NULL), 
    TDRD5 = list(NULL, "SNV", NULL), BRINP3 = list(NULL, "SNV", 
        NULL), UCHL5 = list(NULL, "SNV", NULL), DISP1 = list(
        NULL, "SNV", NULL), ALK = list(NULL, "SNV", NULL), APLF = list(
        NULL, "SNV", NULL), ATP6V1B1 = list(NULL, "SNV", NULL), 
    SFXN5 = list(NULL, "SNV", NULL), C2orf78 = list(NULL, "SNV", 
        NULL), CTNNA2 = list(NULL, "SNV", NULL), DNAH6 = list(
        NULL, "SNV", NULL), PCDP1 = list(NULL, "SNV", NULL), 
    PTPN18 = list(NULL, "SNV", NULL), MAP3K19 = list(NULL, "SNV", 
        NULL), NEUROD1 = list(NULL, "SNV", NULL), DNAH7 = list(
        NULL, "SNV", NULL), HECW2 = list(NULL, "SNV", NULL), 
    NBEAL1 = list(NULL, c("SNV", "SNV"), NULL), ABCA12 = list(
        NULL, "SNV", NULL), CCDC108 = list(NULL, "SNV", NULL), 
    SLC4A3 = list(NULL, "SNV", NULL), C3orf20 = list(NULL, "SNV", 
        NULL), ZCWPW2 = list(NULL, "SNV", NULL), C3orf49 = list(
        NULL, "SNV", NULL), CNTN3 = list(NULL, "SNV", NULL), 
    MYH15 = list(NULL, "SNV", NULL), KIAA1524 = list(NULL, "SNV", 
        NULL), COPG1 = list(NULL, "SNV", NULL), PLOD2 = list(
        NULL, "SNV", NULL), SERPINI1 = list(NULL, "SNV", NULL), 
    FYTTD1 = list(NULL, "SNV", NULL), LDB2 = list(NULL, "SNV", 
        NULL), PCDH7 = list(NULL, "SNV", NULL), C4orf19 = list(
        NULL, "SNV", NULL), CORIN = list(NULL, "SNV", "SNV"), 
    WDFY3 = list(NULL, "SNV", NULL), METAP1 = list(NULL, "SNV", 
        NULL), ELOVL6 = list(NULL, "SNV", NULL), DDX60 = list(
        NULL, "SNV", NULL), ADAM29 = list(NULL, "SNV", NULL), 
    WDR17 = list(NULL, "SNV", NULL), NEIL3 = list(NULL, "SNV", 
        NULL), PRIMPOL = list(NULL, c("SNV", "SNV"), NULL), SLC6A19 = list(
        NULL, "SNV", NULL), ADCY2 = list(NULL, "SNV", NULL), 
    C5orf22 = list(NULL, "SNV", NULL), RXFP3 = list(NULL, "SNV", 
        NULL), AMACR = list(NULL, "SNV", NULL), RAI14 = list(
        NULL, "SNV", NULL), CDC20B = list(NULL, "SNV", NULL), 
Could you specify what you want to achieve? This is a data frame with columns as lists. If you want to work only on the selected column, something like this should work:

# Flatten the list column, keep each distinct term once, and join them with ";".
paste0(unique(unlist(dataframe$columnOfIntrest, use.names = FALSE)), collapse = ";")

But if you want to do such a conversion for all columns, that's a different story.

Hi Hamid, so, in a particular column I have something like this:

[1] " "
[2] "insertion"
[3] "deletion"
[4] "c(\"deletion\", \"SNV\", \"SNV\", \"SNV\")"
[5] "SNV"
[6] "c(\"SNV\", \"SNV\", \"SNV\", \"SNV\", \"SNV\", \"SNV\")"
[7] "c(\"SNV\", \"SNV\", \"SNV\", \"SNV\")"
[8] "c(\"SNV\", \"SNV\", \"SNV\", \"SNV\", \"SNV\", \"SNV\", \"SNV\")"

I want to remove the repeated terms and the c() wrapper, keeping each term only once. For example, I want my output to look like this:

[1] " "
[2] "insertion"
[3] "deletion"
[4] "deletion", "SNV"
[5] "SNV"
[6] "SNV"
[7] "SNV"
[8] "SNV"

thank you so much

2.5 years ago
Hyper_Odin ▴ 320

Thanks, everyone, I managed to solve it with some help. So, first, I removed all the unnecessary symbols (the c( ) wrapper and the double quotes) using gsub:

# Strip the deparse wrapper c( ... ) from each cell. The closing parenthesis is
# anchored to the end of the string so parentheses inside a value are kept.
df2 = gsub("^c\\(|\\)$", "", df2)
# Remove the double quotes that surrounded each term inside the wrapper.
df2 =  gsub("\"", "", df2)

followed by this:

# For every cell of df2 (assumed to be a character matrix, since apply() needs
# dimensions): split on ", ", keep each distinct term once in order of first
# appearance, and re-join with "; ".
# Base strsplit() with fixed = TRUE replaces stringr::str_split(), so this runs
# without library(stringr); vapply() enforces exactly one string per cell.
df3 = apply(df2, 2, function(y) {
  vapply(y, function(x) {
    paste(unique(strsplit(x, ", ", fixed = TRUE)[[1]]), collapse = "; ")
  }, character(1))
})

