problem with tidyverse for gene expression profile
1
0
Entering edit mode
2.2 years ago
wanaga3166 ▴ 10

Hi all,

I have these data.frame:

module_df
structure(list(gene_id = c("A1BG", "A1CF", "A2LD1", "A2M", "A2ML1", 
"A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AADAC", "AADACL2", 
"AADACL3", "AADACL4", "AADAT"), colors = c("turquoise", "blue", 
"turquoise", "grey60", "turquoise", "blue", "turquoise", "turquoise", 
"blue", "yellow", "magenta", "turquoise", "grey", "turquoise", 
"red")), row.names = c("A1BG", "A1CF", "A2LD1", "A2M", "A2ML1", 
"A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AADAC", "AADACL2", 
"AADACL3", "AADACL4", "AADAT"), class = "data.frame")

submod
structure(list(gene_id = c("A2M", "AASS", "ABCD2", "ABCG1", "ABHD14A", 
"ABHD15", "ABHD5", "ACAA2", "ACACA", "ACACB", "ACAD11", "ACAD8", 
"ACADM", "ACADS", "ACADSB"), colors = c("grey60", "brown", "brown", 
"grey60", "brown", "brown", "brown", "brown", "brown", "brown", 
"brown", "brown", "brown", "brown", "brown")), row.names = c(4L, 
25L, 65L, 72L, 83L, 85L, 89L, 109L, 110L, 111L, 113L, 114L, 117L, 
118L, 119L), class = "data.frame")

subexpr
structure(list(GeneSymbol = c("A2M", "AASS", "ABCD2", "ABCG1", 
"ABHD14A", "ABHD15", "ABHD5", "ACAA2", "ACACA", "ACACB"), R012.EGM2 = c(50.2764951996428, 
101.665447606096, 5.14281915226989, 12.1175440994071, 127.400290446813, 
50.6721806650688, 187.444869718312, 125.028697858068, 124.640594250484, 
14.559615204624), R016.EGM2 = c(46.6399755559676, 73.1954332331196, 
5.99306742010033, 11.9097840149146, 123.360595739224, 55.5747760958692, 
195.751905602931, 130.587362354726, 162.118704445471, 15.098902880636
), V06.EGM2 = c(51.8486405122139, 70.9836654034483, 5.36299847633948, 
11.8503634313783, 112.439818697326, 51.4168701523525, 280.399638597663, 
120.828626173478, 129.803136030716, 12.9402241789224), V07.EGM2 = c(117.955683199945, 
86.7501578979337, 5.0894935767762, 12.1221347970042, 114.720275503257, 
50.1008498711308, 194.799263873044, 116.159360477615, 149.444200673643, 
14.6041181366271), V08.EGM2 = c(94.9274274162506, 70.5808603502644, 
5.49133134976474, 12.2734830659801, 105.231609450716, 44.7581009650209, 
204.585401390909, 104.92616313392, 100.125365909659, 13.4964153291074
), N179 = c(192.745422746882, 74.3515057108856, 29.365366693371, 
10.4388992737693, 104.897752588442, 57.5620930848904, 206.669342475002, 
137.362576209483, 70.0328556923676, 128.151538222808), N181 = c(215.662725276862, 
58.1513722826225, 25.2175610232832, 14.9148176681094, 98.5597745615199, 
50.3461555048319, 382.111329968073, 138.065978007562, 52.5064295370523, 
130.508317256036), N184 = c(101.949872053494, 98.4591638219055, 
28.2773164342233, 11.5115144425951, 109.929603150963, 39.3030585235286, 
176.159896274476, 167.023033236092, 120.143842776412, 132.777016208134
), V181 = c(174.216915831257, 108.294596352281, 25.9726969260387, 
13.1836455007548, 121.667169254219, 61.5397177077925, 206.185640237808, 
116.408716711222, 185.906463762304, 129.926726255828)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

I would like to obtain a data.frame like that:

GeneSymbol | name | value | module

AASS | ASC2.jo | 54.23 | grey60

AAS2 | MSC | 24.12 | brown

However, when I execute my code, the column module is empty (full of NA). Also, I don't want gene_id column. How to correct that? Thank you for your help.

#-------------------------------------------------------------------
# Examine Expression Profiles
#-------------------------------------------------------------------

# pick out a few modules of interest here
modules_of_interest = c("grey60", "brown")

# Pull out list of genes in that module
submod = module_df %>%
  subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

subexpr = data_BE[which(data_BE$GeneSymbol%in%submod$gene_id),]

subexpr = subset(subexpr, select = -c(V179))

submod_df = data.frame(subexpr) %>%
  mutate(gene_id = row.names(.)) %>%
  pivot_longer(c(-GeneSymbol, -gene_id)) %>%
  mutate(module = module_df[gene_id,]$colors)
R tidyverse • 797 views
ADD COMMENT
0
Entering edit mode

My quick guess is that your error is coming from these lines:

subexpr = data_BE[which(data_BE$GeneSymbol%in%submod$gene_id),]
subexpr = subset(subexpr, select = -c(V179))

Have you run your code line-by-line and confirmed that each step is doing what you think?

ADD REPLY
2
Entering edit mode
2.2 years ago
Basti ★ 2.0k

left_join may be more effective, and you do not need to create gene_id if you do not need it finally (in addition there is a confusion between gene_id in module_df, so it is better to not create it) :

submod_df = subexpr %>%
  pivot_longer(!GeneSymbol) %>%
  left_join(module_df,by=c("GeneSymbol"="gene_id"))

# A tibble: 90 × 4
   GeneSymbol name      value colors
   <chr>      <chr>     <dbl> <chr> 
 1 A2M        R012.EGM2  50.3 grey60
 2 A2M        R016.EGM2  46.6 grey60
 3 A2M        V06.EGM2   51.8 grey60
 4 A2M        V07.EGM2  118.  grey60
 5 A2M        V08.EGM2   94.9 grey60
 6 A2M        N179      193.  grey60
 7 A2M        N181      216.  grey60
 8 A2M        N184      102.  grey60
 9 A2M        V181      174.  grey60
10 AASS       R012.EGM2 102.  NA    
# … with 80 more rows
# ℹ Use `print(n = ...)` to see more rows
ADD COMMENT
0
Entering edit mode

Thank you Basti ! I thought left_join exist only for SQL.

ADD REPLY

Login before adding your answer.

Traffic: 1404 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6