How to average replicate data using R (Column and Raw)
3
2
Entering edit mode
6.5 years ago
WUSCHEL ▴ 810

I have a big data matrix and each column has named with multiple information and separated by an underscore.

For an example; Genotype_Time_Replicate: X_T0_1, X_T0_2 etc

I want to average my data for downstream analysis.

How can I average replications (Case 1: in column data), and averaging repeated raw data and column replication data (Case 2: Column and raw)

Final Expectation looks like;

Picture1

Sample data frame is given below,

Case 1 df:

structure(list(Gene = c("AA", "PQ", "XY", "UBQ"), X_T0_R1 = c(1.46559502, 0.220140568, 0.304127515, 1.098842127), X_T0_R2 = c(1.087642983, 0.237500819, 0.319844338, 1.256624804), X_T0_R3 = c(1.424945196, 0.21066267, 0.256496284, 1.467120048), X_T1_R1 = c(1.289943948, 0.207778662, 0.277942721, 1.238400358), X_T1_R2 = c(1.376535013, 0.488774258, 0.362562315, 0.671502431), X_T1_R3 = c(1.833390311, 0.182798731, 0.332856558, 1.448757569), X_T2_R1 = c(1.450753714, 0.247576125, 0.274415259, 1.035410946), X_T2_R2 = c(1.3094609, 0.390028842, 0.352460646, 0.946426593), X_T2_R3 = c(0.5953716, 1.007079177, 1.912258811, 0.827119776), X_T3_R1 = c(0.7906009, 0.730242116, 1.235644748, 0.832287694), X_T3_R2 = c(1.215333041, 1.012914813, 1.086362205, 1.00918082), X_T3_R3 = c(1.069312467, 0.780421013, 1.002313082, 1.031761442), Y_T0_R1 = c(0.053317766, 3.316414959, 3.617213894, 0.788193798), Y_T0_R2 = c(0.506623748, 3.599442788, 1.734075583, 1.179462912), Y_T0_R3 = c(0.713670106, 2.516735845, 1.236204882, 1.075393433), Y_T1_R1 = c(0.740998252, 1.444496448, 1.077023349, 0.869258744), Y_T1_R2 = c(0.648231834, 0.097957459, 0.791438659, 0.428805547), Y_T1_R3 = c(0.780499252, 0.187840968, 0.820430227, 0.51636582), Y_T2_R1 = c(0.35344654, 1.190274584, 0.401845911, 1.223534348), Y_T2_R2 = c(0.220223951, 1.367784148, 0.362815405, 1.102117612), Y_T2_R3 = c(0.432856978, 1.403057729, 0.10802472, 1.304233845), Y_T3_R1 = c(0.234963735, 1.232129062, 0.072433381, 1.203096462), Y_T3_R2 = c(0.353770497, 0.885122768, 0.011662112, 1.188149743), Y_T3_R3 = c(0.396091395, 1.333921747, 0.192594116, 1.838029829), Z_T0_R1 = c(0.398000559, 1.286528398, 0.129147097, 1.452769794), Z_T0_R2 = c(0.384759325, 1.122251177, 0.119475721, 1.385513609), Z_T0_R3 = c(1.582230097, 0.697419716, 2.406671502, 0.477415567), Z_T1_R1 = c(1.136843842, 0.804552001, 2.13213228, 0.989075996), Z_T1_R2 = c(1.275683837, 1.227821594, 0.31900326, 0.835941568), Z_T1_R3 = c(0.963349308, 0.968589683, 1.706670339, 0.807060135), Z_T2_R1 = c(3.765036263, 0.477443352, 1.712841882, 0.469173869), Z_T2_R2 = c(1.901023385, 0.832736132, 2.223429427, 0.593558769), Z_T2_R3 = c(1.407713024, 0.911920317, 2.011259223, 0.692553388), Z_T3_R1 = c(0.988333629, 1.095130142, 1.648598854, 0.629915612), Z_T3_R2 = c(0.618606729, 0.497458337, 0.549147265, 1.249492088), Z_T3_R3 = c(0.429823986, 0.471389536, 0.977124788, 1.136635484)), row.names = c(NA, -4L ), class = c("data.table", "data.frame"))

Case 2 df:

structure(list(Gene = c("mut", "ACTIN", "ACTIN", "Pq", "UBQ", "UBQ", "Xa"), X_T0_R1 = c(0.344814469, 1.209073623, 1.071457953, 0.362842359, 1.014392244, 1.571055788, 0.570729408), X_T0_R2 = c(0.449930853, 1.031557118, 1.054965621, 0.522831228, 0.83300542, 0.967355216, 0.501057748), X_T0_R3 = c(0.601209073, 1.695796471, 1.052815987, 0.571729222, 1.391288288, 1.773644641, 0.453820027), X_T1_R1 = c(0.427800244, 1.308884798, 0.991302515, 0.329510681, 0.773414746, 1.029619555, 0.362504535), X_T1_R2 = c(0.418589633, 1.811507215, 1.206305091, 0.29886302, 0.895616224, 1.196317937, 0.408657559), X_T1_R3 = c(0.468263467, 1.352236153, 1.444060418, 0.359970383, 0.942421479, 2.388771681, 0.145078696), X_T2_R1 = c(0.300362616, 1.654754505, 1.109259911, 0.306699247, 0.585608303, 1.945573895, 0.270237172), X_T2_R2 = c(0.27920993, 1.573822163, 1.152985196, 0.310218502, 0.493783209, 1.573792123, 0.36659012), X_T2_R3 = c(1.792971556, 0.665809249, 0.778594892, 2.161999623, 1.888984449, 0.456632731, 1.631251843), X_T3_R1 = c(1.118011513, 0.570411874, 1.044634812, 1.213092011, 1.817947271, 0.234950383, 1.384650094), X_T3_R2 = c(1.008515071, 0.916509523, 0.905764637, 1.244132809, 0.752181246, 0.797524026, 1.010615689), X_T3_R3 = c(0.816620011, 0.740345088, 1.106478019, 0.899414205, 0.909160589, 0.672469518, 0.594865366), Y_T0_R1 = c(3.307846716, 0.027550169, 0.645327389, 2.887386508, 1.042465604, 0.05047425, 4.318466199), Y_T0_R2 = c(2.035398381, 0.633422527, 0.888069994, 2.062827838, 1.82433679, 0.500792593, 1.182188977), Y_T0_R3 = c(1.500168876, 0.877196975, 1.088593542, 1.392198697, 1.162069878, 0.470956741, 1.511890878), Y_T1_R1 = c(1.095875029, 0.777981021, 1.050238479, 1.17216374, 0.945470429, 0.40568268, 0.872396888), Y_T1_R2 = c(0.452742932, 0.352610874, 0.787861253, 0.477126035, 0.320200734, 1.826032539, 0.332244865), Y_T1_R3 = c(0.45960558, 0.478390214, 0.645688363, 0.395673468, 0.215407604, 0.759507568, 0.700730905), Y_T2_R1 = c(1.559068766, 0.062252184, 0.937463531, 0.994007758, 0.482591298, 1.269828631, 0.237326878), Y_T2_R2 = c(1.390406257, 0.215685731, 1.087380361, 1.018431329, 0.585660661, 1.05095161, 0.173209498), Y_T2_R3 = c(1.00828232, 0.376013801, 0.782410602, 0.906376375, 0.572489629, 1.359345852, 0.302963483), Y_T3_R1 = c(1.182635592, 0.117426355, 1.013642281, 0.967559933, 0.306328031, 1.231521805, 0.257804624), Y_T3_R2 = c(1.366839578, 0.341411017, 1.337125947, 0.943784803, 0.721978298, 1.10875345, 0.189978177), Y_T3_R3 = c(1.594404053, 0.209740069, 0.92384942, 0.897659445, 0.457172538, 1.543831721, 0.272475233), Z_T0_R1 = c(1.237203711, 0.233057698, 1.077219174, 1.156260667, 0.264806683, 1.591044318, 0.255767162), Z_T0_R2 = c(1.211301515, 0.251870699, 1.141522554, 1.194071909, 0.20882802, 1.533752995, 0.278059859), Z_T0_R3 = c(0.645425334, 1.53688617, 0.439888106, 0.819063313, 1.769224478, 0.250876057, 1.998822839), Z_T1_R1 = c(0.971645792, 0.671074934, 0.469502588, 1.312821698, 1.306039773, 1.40561198, 1.704347344), Z_T1_R2 = c(0.859830596, 1.580097955, 1.366461274, 1.24037716, 0.80578233, 1.116605654, 1.211928025), Z_T1_R3 = c(0.785228306, 1.286123696, 1.10243547, 0.996917372, 1.215506569, 0.683697612, 1.000232952), Z_T2_R1 = c(0.475576762, 2.673806674, 0.732913032, 0.763693301, 3.091813549, 0.347384763, 3.16064337), Z_T2_R2 = c(0.810829692, 1.590506889, 1.162262268, 1.367255133, 1.378518959, 0.677096267, 2.006934309), Z_T2_R3 = c(1.02507371, 2.164918846, 1.440885034, 1.185511625, 1.934374556, 0.460659928, 1.277191061), Z_T3_R1 = c(0.834953495, 2.155130232, 1.209137833, 0.934189133, 1.048650427, 0.704562113, 1.145400709), Z_T3_R2 = c(0.886903303, 0.237343684, 0.921370232, 0.737206101, 0.318232441, 1.314051524, 0.9314835), Z_T3_R3 = c(0.748710472, 0.501419194, 0.914476206, 0.641169316, 0.119979817, 1.187578276, 0.918544916)), row.names = c(NA, -7L), class = c("data.table", "data.frame"))

if possible could you please help me with an easy approach using R programming

R gene statistics • 14k views
ADD COMMENT
3
Entering edit mode
6.5 years ago

Assumption is that average is applied for every 3 columns and updated with data frame df1 from OP:

results=data.frame(apply(array(as.matrix(df1[,-1]), c(nrow(df1),3, ncol(df1)/3)),3, rowMeans))
results=cbind(df1$Gene, results)
results
  df1$Gene        X1        X2        X3        X4        X5        X6
1       AA 1.3260611 1.4999564 1.1185287 1.0250821 0.4245372 0.7232431
2       PQ 0.2227680 0.2931172 0.5482280 0.8411926 3.1441979 0.5767650
3       XY 0.2934894 0.3244539 0.8463782 1.1081067 2.1958315 0.8962974
4      UBQ 1.2741957 1.1195535 0.9363191 0.9577433 1.0143500 0.6048100
         X7         X8        X9       X10       X11       X12
1 0.3355092 0.32827521 0.7883300 1.1252923 2.3579242 0.6789214
2 1.3203722 1.15039119 1.0353998 1.0003211 0.7406999 0.6879927
3 0.2908953 0.09222987 0.8850981 1.3859353 1.9825102 1.0582903
4 1.2099619 1.40975868 1.1052330 0.8773592 0.5850953 1.0053477

Please change column names.

ADD COMMENT
1
Entering edit mode

Another one in R assuming that sample replicates bear _R[0-9] pattern at the end:

> d=data.frame(matrix(nrow = nrow(df1)))
> for (i in unique(gsub("_R[1-9]","",names(df1)))[-1]){
        d[,i]=apply(df1[,grepl(gsub("_R[1-9]","",i),names(df1))],1, mean)
}
> d[,1]=df1[,1]
> names(d)[1]=names(df1)[1]
> d
  Gene      X_T0      X_T1      X_T2      X_T3      Y_T0      Y_T1
1   AA 1.3260611 1.4999564 1.1185287 1.0250821 0.4245372 0.7232431
2   PQ 0.2227680 0.2931172 0.5482280 0.8411926 3.1441979 0.5767650
3   XY 0.2934894 0.3244539 0.8463782 1.1081067 2.1958315 0.8962974
4  UBQ 1.2741957 1.1195535 0.9363191 0.9577433 1.0143500 0.6048100
       Y_T2       Y_T3      Z_T0      Z_T1      Z_T2      Z_T3
1 0.3355092 0.32827521 0.7883300 1.1252923 2.3579242 0.6789214
2 1.3203722 1.15039119 1.0353998 1.0003211 0.7406999 0.6879927
3 0.2908953 0.09222987 0.8850981 1.3859353 1.9825102 1.0582903
4 1.2099619 1.40975868 1.1052330 0.8773592 0.5850953 1.0053477
ADD REPLY
0
Entering edit mode

Thank you as always cpad01 . May I know your real identity, please! You have helped me seral times.

BTW, Could you please help me with my case 2 as well, When I have repeated measurements of the same gene [duplicate / triplicate data for a Gene] (in raws) how can I take their average ( then basically df looks like case1) and proceed like above.

Because I see in my working df same gene is measure few times / using different peptides, so I want to average them first and proceed like above.

ADD REPLY
1
Entering edit mode

for df2 (please cross check the results and let me know if the results are off. I checked mean for one or two entries):

library(stringr)
library(tidyr)
gdf2 = gather(df2, "group", "Expression", -Gene)
gdf2$tgroup = apply(str_split_fixed(gdf2$group, "_", 3)[, c(1, 2)], 1, paste, collapse ="_")
library(dplyr)
gdf2 %>% group_by(Gene, tgroup) %>% summarize(expression_mean = mean(Expression)) %>% spread(., tgroup, expression_mean)

# A tibble: 5 x 13
# Groups:   Gene [5]
  Gene   X_T0  X_T1  X_T2  X_T3  Y_T0  Y_T1  Y_T2  Y_T3  Z_T0  Z_T1
  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ACTIN 1.19  1.35  1.16  0.881 0.693 0.682 0.577 0.657 0.780 1.08 
2 mut   0.465 0.438 0.791 0.981 2.28  0.669 1.32  1.38  1.03  0.872
3 Pq    0.486 0.329 0.926 1.12  2.11  0.682 0.973 0.936 1.06  1.18 
4 UBQ   1.26  1.20  1.16  0.864 0.842 0.745 0.887 0.895 0.936 1.09 
5 Xa    0.509 0.305 0.756 0.997 2.34  0.635 0.238 0.240 0.844 1.31 
# ... with 2 more variables: Z_T2 <dbl>, Z_T3 <dbl>

2nd method:

> results=data.frame(apply(array(as.matrix(df2[,-1]), c(nrow(df2),3, ncol(df2)/3)),3, rowMeans))
> results=cbind(df2$Gene, results)
> library(plyr)
> dresults=ddply(results,"df2$Gene",numcolwise(mean))
> dresults
  df2$Gene        X1        X2        X3        X4        X5        X6
1    ACTIN 1.1859445 1.3523827 1.1558710 0.8806907 0.6933601 0.6821284
2      mut 0.4653181 0.4382178 0.7908480 0.9810489 2.2811380 0.6694078
3       Pq 0.4858009 0.3294480 0.9263058 1.1188797 2.1141377 0.6816544
4      UBQ 1.2584569 1.2043603 1.1573958 0.8640388 0.8418493 0.7453836
5       Xa 0.5085357 0.3054136 0.7560264 0.9967104 2.3375154 0.6351242
         X7        X8        X9       X10       X11       X12
1 0.5768677 0.6571992 0.7800741 1.0792827 1.6275488 0.9898129
2 1.3192524 1.3812931 1.0313102 0.8722349 0.7704934 0.8235224
3 0.9729385 0.9363347 1.0564653 1.1833721 1.1054867 0.7708548
4 0.8868113 0.8949310 0.9364221 1.0888740 1.3149747 0.7821758
5 0.2378333 0.2400860 0.8442166 1.3055028 2.1482562 0.9984764
>

3rd solution:

> df2_final=data.frame(matrix(nrow = nrow(df2)))
> for (i in unique(gsub("_R[1-9]","",names(df2)))[-1]){
+     df2_final[,i]=apply(df2[,grepl(gsub("_R[1-9]","",i),names(df2))],1, mean)
+ }
> df2_final[,1]=df2[,1]
> names(df2_final)[1]=names(df2)[1]
> df2_final=ddply(df2_final,"Gene",numcolwise(mean))
> df2_final
   Gene      X_T0      X_T1      X_T2      X_T3      Y_T0      Y_T1
1 ACTIN 1.1859445 1.3523827 1.1558710 0.8806907 0.6933601 0.6821284
2   mut 0.4653181 0.4382178 0.7908480 0.9810489 2.2811380 0.6694078
3    Pq 0.4858009 0.3294480 0.9263058 1.1188797 2.1141377 0.6816544
4   UBQ 1.2584569 1.2043603 1.1573958 0.8640388 0.8418493 0.7453836
5    Xa 0.5085357 0.3054136 0.7560264 0.9967104 2.3375154 0.6351242
       Y_T2      Y_T3      Z_T0      Z_T1      Z_T2      Z_T3
1 0.5768677 0.6571992 0.7800741 1.0792827 1.6275488 0.9898129
2 1.3192524 1.3812931 1.0313102 0.8722349 0.7704934 0.8235224
3 0.9729385 0.9363347 1.0564653 1.1833721 1.1054867 0.7708548
4 0.8868113 0.8949310 0.9364221 1.0888740 1.3149747 0.7821758
5 0.2378333 0.2400860 0.8442166 1.3055028 2.1482562 0.9984764
ADD REPLY
0
Entering edit mode

Thank you very much cpad0112 , I like the first approach. I do not know when I'm learning R up to this level :)

Thanks again, I appreciate your kind help always! Well, I wish I know your real name, maybe next time!

ADD REPLY
1
Entering edit mode

Keep visiting biostars. I learnt a lot from here esp awk, sed and python. Hope this forum would help you become better in whatever you are pursuing.

ADD REPLY
1
Entering edit mode
6.5 years ago

It seems you're looking for the colMeans() and rowMeans() functions in R

ADD COMMENT
0
Entering edit mode

Thanks, Heriché, But his approach apparently difficult for my working df. Looking for some easy step!

ADD REPLY
1
Entering edit mode

Use subsetting. Using your example, rowMeans(df1[,c(1:3)]) would give the values in column X_T0.

ADD REPLY
1
Entering edit mode
6.5 years ago
russhh 5.7k

I couldn't read your structure code into R. However, witht the following:

set.seed(1)
a <- data.frame(
 Gene = c("AA", "PQ", "XY", "UBQ"),
 X_TO_R1 = rnorm(4),
 X_TO_R2 = rnorm(4),
 X_TO_R3 = rnorm(4),
 X_T1_R1 = rnorm(4),
 X_T1_R2 = rnorm(4),
 X_T1_R3 = rnorm(4)
 )

library(dplyr)
library(tidyr)
tidyr::gather(a, "experiment", "value", X_TO_R1:X_T1_R3) %>%
    tidyr::separate(experiment, c("genotype", "time", "rep"), sep = "_") %>%
    group_by(Gene, time) %>%
    summarise(mean_val = mean(value)) %>%
    ungroup() %>%
    spread(time, mean_val)

Will give you the first case (modulo some formatting):

    Gene          T1         TO
* <fctr>       <dbl>      <dbl>
1     AA  0.09384884  0.0929451
2     PQ -0.16290913 -0.3140711
3    UBQ -0.48012799  0.9078162
4     XY  0.67357237  0.3878605
ADD COMMENT
2
Entering edit mode

in similar lines (updated with OP data. df1=first code in OP):

> library(dplyr)
> library(stringr)
> library(tidyr)
> gdf1 = gather(df1, "group", "Expression", -Gene)
> gdf1$tgroup = apply(str_split_fixed(gdf1$group, "_", 3)[, c(1, 2)], 1, paste, collapse ="_")
> library(dplyr)
> gdf1 %>% group_by(Gene, tgroup) %>% summarize(expression_mean = mean(Expression)) %>% spread(., tgroup, expression_mean)
# A tibble: 4 x 13
# Groups:   Gene [4]
  Gene   X_T0  X_T1  X_T2  X_T3  Y_T0  Y_T1  Y_T2   Y_T3  Z_T0  Z_T1
  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
1 AA    1.33  1.50  1.12  1.03  0.425 0.723 0.336 0.328  0.788 1.13 
2 PQ    0.223 0.293 0.548 0.841 3.14  0.577 1.32  1.15   1.04  1.00 
3 UBQ   1.27  1.12  0.936 0.958 1.01  0.605 1.21  1.41   1.11  0.877
4 XY    0.293 0.324 0.846 1.11  2.20  0.896 0.291 0.0922 0.885 1.39
ADD REPLY
0
Entering edit mode

@russhh & cpad0112 Thank you. I've edited my df structure. It was an error from dput() output, sorry about that.

Both of your syntaxes are tempting :)

However, I see by using russhh's syntax, all my genotypes get averaged together. What I am expecting is to average only 3 replicates at each time point of each genotype separately.

Thank you for fixing this cpad0112

ADD REPLY

Login before adding your answer.

Traffic: 2102 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6