Separate one column into two columns in R
3
1
Entering edit mode
6.6 years ago
Kian ▴ 50

How can I separate one column into two columns in R?

         id        rs143        rs148       rs149      rs1490 
 1    02003s         NA          11          22          11    
 2    02003s         NA          10          11          22   
 3    02003s         NA          11          11          12   
 4    02003s         NA          10          11          11 
 5    02003s         NA          10          11          11  

 As a result, I want this format:

            id     rs143  rs143.1 rs148 rs148.1  rs149  rs149.1  rs1490   rs1490.1
     1    02003s    NA    NA        1     1       2       2       1        1
     2    02003s    NA    NA        1     0       1       1       2        2
     3    02003s    NA    NA        1     1       1       1       1        2
     4    02003s    NA    NA        1     0       1       1       1        1
     5    02003s    NA    NA        1     0       1       1       1        1
R separate column SNP • 3.1k views
ADD COMMENT
4
Entering edit mode

You're going to want to use the strsplit() function with a separator of "". Lots of examples can be found with a few searches.

ADD REPLY
1
Entering edit mode

What have you tried?

ADD REPLY
0
Entering edit mode

What is the next step? Maybe consider using plink and read about "compound genotype" flag?

ADD REPLY
0
Entering edit mode

Yes, that's right. PLINK needs the allelic format of the genotypes for QC. Can you tell me what the right conversion of the 0, 1, 2 codes to the allelic format needed for PLINK is? I assume 0 should be converted to 11 for the minor homozygote, 1 to 10 for the heterozygote, and 2 to 22 for the homozygote. Is that right?

ADD REPLY
0
Entering edit mode

https://www.biostars.org/u/39484/: While you accepted @Kevin's answer there are a couple more posted now. Can you test them and see if they work? You can accept more than one answer as correct as long as they work.
Upvote|Bookmark|Accept

Please do the same for your previous posts as well.

ADD REPLY
4
Entering edit mode
6.6 years ago
zx8754 12k

A bit simpler version of Kevin's answer, loop through columns, split, assign column names, and finally bind them back together.

# Split every genotype column of `df` into two single-character allele
# columns and bind them back next to the id column (column 1).
# Each SNP column `i` becomes the pair `i` and `i.1`.
res <- do.call(
  cbind,
  c(
    # Keep the id column as a one-column data frame.
    list(df[, 1, drop = FALSE]),
    # colnames(df)[-1] instead of colnames(df[, -1]): the latter collapses to
    # a plain vector (no colnames) when only one genotype column is present,
    # which silently dropped that column from the result.
    lapply(colnames(df)[-1], function(i) {
      genotype <- as.character(df[[i]])
      as.data.frame(
        list(substring(genotype, 1, 1), substring(genotype, 2, 2)),
        # Name the second allele explicitly rather than relying on the
        # automatic de-duplication of two identical names.
        col.names = c(i, paste0(i, ".1"))
      )
    })
  )
)

res

#       id rs143 rs143.1 rs148 rs148.1 rs149 rs149.1 rs1490 rs1490.1
# 1 02003s  <NA>    <NA>     1       1     2       2      1        1
# 2 02003s  <NA>    <NA>     1       0     1       1      2        2
# 3 02003s  <NA>    <NA>     1       1     1       1      1        2
# 4 02003s  <NA>    <NA>     1       0     1       1      1        1
# 5 02003s  <NA>    <NA>     1       0     1       1      1        1
ADD COMMENT
2
Entering edit mode
6.6 years ago

Requires a nice mixture of diverse functions here. This could be done in a single line, but would be complex.

> df
      id rs143 rs148 rs149 rs1490
1 02003s    NA    11    22     11
2 02003s    NA    10    11     22
3 02003s    NA    11    11     12
4 02003s    NA    10    11     11
5 02003s    NA    10    11     11

#Ensure that NAs are encoded as characters
#(a missing genotype then splits into the two characters "N" and "A", so every
# value yields exactly two pieces, like the two-digit genotypes do)
> df[is.na(df)] <- "NA"

#split each value by an empty string delimiter, then re-merge all columns back together
#For each genotype column: strsplit() returns one character vector per row,
#as.data.frame() lays those vectors out as columns and t() flips them back so
#that rows are samples again. cbind() joins the per-column pieces side by side
#and data.frame() puts the id column in front; the allele columns get the
#placeholder names X1, X2, ... at this point.
> df2 <- data.frame(df$id, do.call(cbind, lapply(df[,2:ncol(df)], function(x) t(as.data.frame(strsplit(as.character(x), split=""))))), row.names=c(1:nrow(df)))

> df2
   df.id X1 X2 X3 X4 X5 X6 X7 X8
1 02003s  N  A  1  1  2  2  1  1
2 02003s  N  A  1  0  1  1  2  2
3 02003s  N  A  1  1  1  1  1  2
4 02003s  N  A  1  0  1  1  1  1
5 02003s  N  A  1  0  1  1  1  1

#Now fix the colnames: the first allele column of each pair takes the original
#SNP name, the second one the same name with ".1" appended
> snp_names <- colnames(df)[-1]
> index1 <- seq(from=2, to=ncol(df2), by=2)
> index2 <- seq(from=3, to=ncol(df2), by=2)
> colnames(df2)[index1] <- snp_names
> colnames(df2)[index2] <- paste0(snp_names, ".1")

> df2
    df.id rs143 rs143.1 rs148 rs148.1 rs149 rs149.1 rs1490 rs1490.1
1 02003s     N       A     1       1     2       2      1        1
2 02003s     N       A     1       0     1       1      2        2
3 02003s     N       A     1       1     1       1      1        2
4 02003s     N       A     1       0     1       1      1        1
5 02003s     N       A     1       0     1       1      1        1

#Restore the NAs
#NOTE(review): these two assignments blank EVERY cell equal to "N" or "A", in
#any column. That is harmless for the numeric genotype codes used here, but
#letter-coded alleles (A/C/G/T) would lose their genuine "A" calls - confirm
#the coding before reusing this step.
> df2[df2=="N"] <- NA
> df2[df2=="A"] <- NA

df
      id rs143 rs148 rs149 rs1490
1 02003s    NA    11    22     11
2 02003s    NA    10    11     22
3 02003s    NA    11    11     12
4 02003s    NA    10    11     11
5 02003s    NA    10    11     11

df2
   df.id rs143 rs143.1 rs148 rs148.1 rs149 rs149.1 rs1490 rs1490.1
1 02003s  <NA>    <NA>     1       1     2       2      1        1
2 02003s  <NA>    <NA>     1       0     1       1      2        2
3 02003s  <NA>    <NA>     1       1     1       1      1        2
4 02003s  <NA>    <NA>     1       0     1       1      1        1
5 02003s  <NA>    <NA>     1       0     1       1      1        1
ADD COMMENT
1
Entering edit mode

Hi Thank you very much Dear Kevin its GREAT!!

ADD REPLY
0
Entering edit mode

Did you mean df[is.na(df)] <- NA , not "NA" ?

ADD REPLY
0
Entering edit mode

No, because the NAs have to be encoded as character NAs. Otherwise, lapply with strsplit as the function does not function correctly.

ADD REPLY
1
Entering edit mode
6.6 years ago

test2.txt = the data from the OP. If naming is not a problem:

library(stringr)
# Read the tab-separated genotype table; the first column holds the sample id.
test <- read.csv(
  "test2.txt",
  stringsAsFactors = FALSE,
  header = TRUE,
  sep = "\t"
)

# Split every genotype column into its two allele characters and bind the
# pieces back onto the id column.
# - lapply() rather than sapply(): sapply() tries to simplify the per-column
#   results and changes the return type when every column is integer.
# - test[-1] rather than test[, -1]: list-style indexing stays a data frame
#   even when only a single genotype column is present.
new_test <- cbind(test[1], as.data.frame(lapply(test[-1], function(x) {
  if (is.integer(x)) {
    # Two-digit genotype codes: one character per allele.
    as.data.frame(str_split_fixed(as.character(x), "", 2))
  } else {
    # Non-integer columns (here the all-NA SNP) are duplicated unchanged.
    replicate(2, x)
  }
})))
> new_test
      id rs143.1 rs143.2 rs148.V1 rs148.V2 rs149.V1 rs149.V2 rs1490.V1 rs1490.V2
1 02003s      NA      NA        1        1        2        2         1         1
2 02003s      NA      NA        1        0        1        1         2         2
3 02003s      NA      NA        1        1        1        1         1         2
4 02003s      NA      NA        1        0        1        1         1         1
5 02003s      NA      NA        1        0        1        1         1         1

If naming is important:

# Split each genotype column into a two-column character matrix (one allele
# per column) and bind the pieces back onto the id column.
# test[-1] (list-style indexing) stays a data frame even when only one SNP
# column is present, whereas test[, -1] would collapse to a bare vector.
df <- as.data.frame(cbind(test[1], lapply(test[-1], function(x) {
  if (is.integer(x)) {
    do.call(rbind, strsplit(as.character(x), ""))
  } else {
    # Non-integer columns (here the all-NA SNP) are duplicated unchanged.
    replicate(2, x)
  }
})))

# The two allele columns of each SNP are expected to carry the suffixes ".1"
# and ".2"; rename them to "" and ".1".
# The patterns are anchored with "$" so that a ".1" or ".2" occurring inside
# a SNP name is left alone (the unanchored gsub() rewrote those as well).
names(df) <- sub("\\.1$", "", names(df))
names(df) <- sub("\\.2$", ".1", names(df))
    > df
      id rs143 rs143.1 rs148 rs148.1 rs149 rs149.1 rs1490 rs1490.1
1 02003s    NA      NA     1       1     2       2      1        1
2 02003s    NA      NA     1       0     1       1      2        2
3 02003s    NA      NA     1       1     1       1      1        2
4 02003s    NA      NA     1       0     1       1      1        1
5 02003s    NA      NA     1       0     1       1      1        1
ADD COMMENT

Login before adding your answer.

Traffic: 1859 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6