I have a list of samples with the mutation status of signalling pathways, as shown below: 1 if mutated and 0 if not mutated.
> dput(pathway)
structure(list(sample = c("LP6005334.DNA_H01", "LP6005334.DNA_H01",
"LP6005334.DNA_H01", "LP6005334.DNA_H01", "LP6005334.DNA_H01",
"LP6005334.DNA_H01", "LP6005334.DNA_H01", "LP6005334.DNA_H01",
"LP6005334.DNA_H01", "LP6005334.DNA_H01", "LP6005334.DNA_H01",
"LP6005334.DNA_H01", "LP6005500.DNA_D03", "LP6005500.DNA_D03",
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6005500.DNA_D03",
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6005500.DNA_D03",
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6005500.DNA_D03",
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6007600", "LP6007600",
"LP6007600", "LP6007600", "LP6007600", "LP6007600", "LP6007600",
"LP6007600", "LP6007600", "LP6007600", "LP6007600", "LP6007600",
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03",
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03",
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03",
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03",
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03",
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03",
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03",
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03",
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04",
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04",
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04",
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04",
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02",
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02",
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02",
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02",
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02",
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02",
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02",
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02",
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02",
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02",
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02",
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02",
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02",
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02",
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02",
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02",
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01",
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01",
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01",
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01",
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01",
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01",
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01",
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01",
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07",
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07",
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07",
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07",
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06",
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06",
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06",
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06",
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04",
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04",
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04",
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04",
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01",
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01",
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01",
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01",
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02",
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02",
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02",
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02",
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03",
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03",
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03",
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03",
"s15", "s15", "s15", "s15", "s15", "s15", "s15", "s15", "s15",
"s15", "s15", "s15", "s15", "s15", "s18", "s18", "s18", "s18",
"s18", "s18", "s18", "s18", "s18", "s18", "s18", "s18", "s18",
"s18", "s18", "s18", "s24", "s24", "s24", "s24", "s24", "s24",
"s24", "s24", "s24", "s24", "s24", "s24", "s24", "s24", "s24",
"s30", "s30", "s30", "s30", "s30", "s30", "s30", "s30", "s30",
"s30", "s30", "s30", "s30", "s30", "s59", "s59", "s59", "s59",
"s59", "s59", "s59", "s59", "s59", "s59", "s59", "s59", "s67",
"s67", "s67", "s67", "s67", "s67", "s67", "s67", "s67", "s67",
"s67", "s67", "s67", "s67", "s80", "s80", "s80", "s80", "s80",
"s80", "s80", "s80", "s80", "s80", "s80", "s80", "s80", "s80",
"s80", "s80", "s86", "s86", "s86", "s86", "s86", "s86", "s86",
"s86", "s86", "s86", "s86", "s86", "s86", "s86", "s94", "s94",
"s94", "s94", "s94", "s94", "s94", "s94", "s94", "s94", "s94",
"s94", "s94", "s94"), Pathway = c("PI3K", "Cell_Cycle", "RTK-RAS",
"WNT", "TGF-Beta", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC",
"NRF2", "PI3K", "Cell_Cycle", "WNT", "TGF-Beta", "RTK-RAS", "WNT",
"CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "TGF-Beta",
"PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH",
"Hippo", "MYC", "NRF2", "PI3K", "TGF-Beta", "RTK-RAS", "WNT",
"CR", "CF", "TP53", "Cell_Cycle", "NOTCH", "Hippo", "MYC", "NRF2",
"Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF",
"TP53", "NOTCH", "Hippo", "MYC", "NRF2", "RTK-RAS", "TGF-Beta",
"PI3K", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", "Hippo",
"MYC", "NRF2", "Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT",
"CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "PI3K",
"Cell_Cycle", "RTK-RAS", "TGF-Beta", "WNT", "CR", "CF", "TP53",
"NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "TGF-Beta",
"RTK-RAS", "WNT", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC",
"NRF2", "Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR",
"CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "TGF-Beta", "PI3K",
"RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH",
"Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "WNT", "TGF-Beta",
"RTK-RAS", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2",
"PI3K", "TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle",
"NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "RTK-RAS", "TGF-Beta",
"PI3K", "WNT", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2",
"Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF",
"TP53", "NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "CF",
"CR", "Hippo", "MYC", "NOTCH", "NRF2", "PI3K", "RTK-RAS", "TGF-Beta",
"TP53", "WNT", "Cell_Cycle", "RTK-RAS", "TGF-Beta", "PI3K", "WNT",
"CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "RTK-RAS",
"TGF-Beta", "PI3K", "WNT", "CR", "CF", "TP53", "Cell_Cycle",
"NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "RTK-RAS", "TGF-Beta",
"PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH",
"Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "RTK-RAS", "WNT",
"TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle",
"NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "PI3K", "Cell_Cycle",
"RTK-RAS", "TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53",
"Cell_Cycle", "NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle",
"WNT", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53",
"Cell_Cycle", "NOTCH", "Hippo", "MYC", "NRF2", "TGF-Beta", "PI3K",
"RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH",
"Hippo", "MYC", "NRF2", "Cell_Cycle", "WNT", "TGF-Beta", "PI3K",
"RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH",
"Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "RTK-RAS", "WNT",
"TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle",
"NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "WNT",
"TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle",
"NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "RTK-RAS", "TGF-Beta",
"PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH",
"Hippo", "MYC", "NRF2"), value = c(1L, 1L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L)), class = "data.frame", row.names = c(NA, -346L))
I want to convert this to a boolean matrix (wide format) in which pathways are in columns and samples are in rows.
A sample gets 1 if it is mutated for a pathway and 0 if it is not, like this:
> head(p)
sample value Cell_Cycle Hippo MYC NOTCH NRF2 PI3K TGF-Beta RTK-RAS TP53
1 LP6008334.DNA_C02 0 0 0 0 0 0 0 0 0 0
2 LP6008334.DNA_A03 0 0 0 0 0 0 0 0 0 0
3 LP6005334.DNA_H01 0 0 0 0 0 0 0 0 0 0
4 LP6008337.DNA_H06 0 0 0 0 0 0 0 0 0 0
5 s15 0 0 0 0 0 0 0 0 0 0
6 LP6008460.DNA_D01 0 0 0 0 0 0 0 0 0 0
WNT CF CR
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
5 0 0 0
6 0 0 0
>
I have tried this, but all I get is zeros:
# For every pathway name found in p$Pathway that is not already a column of p,
# append one new column filled with 0 and name it after that pathway.
# NOTE(review): nothing in this loop ever writes a 1 into the new columns and
# the `value` column is never consulted, so the added columns stay all-zero.
# NOTE(review): the loop variable `pathway` has the same name as the data frame
# shown in dput(pathway) above; if both live in the same session, the data
# frame is overwritten by a pathway name once the loop runs -- confirm.
for (pathway in setdiff(unique(p$Pathway), colnames(p))) {
p <- cbind(p,array(0,nrow(p)))
colnames(p)[ncol(p)] <- pathway
}
I have also tried:
# Base R long-to-wide reshape: one row per `sample`, with the remaining
# column (`value`) spread into one column per distinct `Pathway`.
# NOTE(review): the data above repeats some sample/Pathway pairs (e.g. "WNT"
# appears twice for LP6005500.DNA_D03); confirm how reshape() resolves such
# duplicates before trusting the resulting 0/1 values.
reshape(pathway, idvar = "sample", timevar = "Pathway", direction = "wide")
But it gives the wrong 0 and 1 values.
Please help me.
Hi,
In your data you have the same sample with the same pathway, but with different values. Does this make sense to you?
This cannot be converted into a wider format.
António
Thank you
The main data comes from here:
First, a matrix of boolean values for the mutation status of each gene in each sample, in which I merged the corresponding pathway for each gene as one column, like this:
I then melted that, like this:
Do you think that, from the first boolean matrix of mutational status, I can build boolean values per pathway directly, instead of feeding the melted result into the tidyverse or whatever?
I think that I identified your problem. The thing is that you have mutations per sample/variable per gene per pathway. So, the same pathway in the same sample/variable can have both a 0 and a 1, because it can have 2 genes belonging to the same pathway. So, you cannot just drop your genes and have a table with samples per pathways.
So, depending on what you'll do next with this wider format, you can merge the gene names with pathway or sample, in order to make unique combinations of pathway-gene or sample-gene. This would allow you to keep all the information instead of choosing randomly which genes to represent in the wider format (and excluding the rest of the information from the table).
So, I don't think that you need to melt. If you think that my suggestion makes sense, regarding the combination of pathway-gene names into one label in order to create unique labels, you just need to do the following (assuming that your unmelted data frame object `d1` is `data`):
I hope this is what you're looking for,
António
Sorry, when I tried your code with my complete d1, rather than just the header pasted here, I got this error:
You have certain samples where you have the Pathway both mutated and not mutated. You need to resolve this before pivoting.
Sorry, the problem is that I don't know how to resolve this. Is there any code to identify such samples and remove them before widening? Doing that manually is tedious and error-prone.