我们可能需要complete
这里。按“组”分组后,使用complete
得到的组合unique
每个“组”和“ID”的非 NA“值”
library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
group_by(group) %>%
complete(ID, value = unique(value[!is.na(value)])) %>%
na.omit %>%
select(names(df1))
# A tibble: 15 x 3
# Groups: group [3]
# ID group value
# <int> <chr> <chr>
# 1 1 A blue
# 2 2 A blue
# 3 3 A blue
# 4 4 B green
# 5 4 B red
# 6 5 B green
# 7 5 B red
# 8 6 B green
# 9 6 B red
#10 7 C blue
#11 7 C green
#12 8 C blue
#13 8 C green
#14 9 C blue
#15 9 C green
Update
使用新数据集,我们可以做
df2 %>%
group_by(group) %>%
mutate(valnew = str_c(value, specific_value, sep=":")) %>%
select(-value, -specific_value, -dataversion) %>%
complete(ID, valnew = unique(valnew[!is.na(valnew)])) %>%
filter(!is.na(valnew)) %>%
separate(valnew, into = c('value', 'specific_value'), sep=":") %>%
mutate(rn = row_number()) %>%
left_join(df2 %>%
select(ID, dataversion)) %>%
filter(!duplicated(rn)) %>%
select(names(df2))
# A tibble: 15 x 5
# Groups: group [3]
# ID group value specific_value dataversion
# <int> <chr> <chr> <chr> <chr>
# 1 1 A blue sky_blue version1
# 2 2 A blue sky_blue version2
# 3 3 A blue sky_blue version1
# 4 4 B green forest_green version1
# 5 4 B red scarlet version1
# 6 5 B green forest_green version2
# 7 5 B red scarlet version2
# 8 6 B green forest_green <NA>
# 9 6 B red scarlet <NA>
#10 7 C blue royal_blue version2
#11 7 C green lime_green version2
#12 8 C blue royal_blue version1
#13 8 C green lime_green version1
#14 9 C blue royal_blue version1
#15 9 C green lime_green version1
data
df1 <- structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L),
group = c("A", "A", "A", "B", "B", "B", "B", "C", "C", "C"
), value = c("blue", NA, NA, "green", "red", NA, NA, "blue",
"green", NA)), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10"), class = "data.frame")
df2 <- structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L),
group = c("A", "A", "A", "B", "B", "B", "B", "C", "C", "C"
), value = c("blue", NA, NA, "green", "red", NA, NA, "blue",
"green", NA), specific_value = c("sky_blue", NA, NA, "forest_green",
"scarlet", NA, NA, "royal_blue", "lime_green", NA), dataversion = c("version1",
"version2", "version1", "version1", "version1", "version2",
NA, "version2", "version1", "version1")), class = "data.frame",
row.names = c(NA,
-10L))