I want to count how many values in a row are identical. The idea is to be able to filter out straightlining respondents (i.e. respondents who give the same answer to every question), for example by filtering out those with more than 90% identical answers across the columns.
I came up with the code below, which works, but it is slow on large datasets. The example below has 5 columns and 1000 rows, but my actual data has 30 columns and 200,000 rows.
# define function (uses dplyr)
library(dplyr)

count_identical_values <- function(df){
  columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
  df %>%
    rowwise() %>%
    mutate(identical_count = case_when(
      # all NA, then NA
      all(is.na(c_across(all_of(columns)))) ~ NA_real_,
      # else, count the number of identical values
      TRUE ~ max(table(c_across(all_of(columns)))))) %>%
    ungroup()
}
# make df
df = data.frame(statement_1 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1000, replace = TRUE))
# apply function
df = count_identical_values(df)
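For illustration, the subsequent filtering step could then look something like this (a sketch of the 90% rule described above, not part of the original code; df_keep and the decision to keep all-NA rows are my own choices):
# keep respondents whose most frequent answer covers at most 90% of the 5 columns;
# rows where identical_count is NA (all answers missing) are kept here
df_keep <- subset(df, is.na(identical_count) | identical_count / 5 <= 0.9)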
If anyone knows how to speed this up, that would be great!
Best answer
Is apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x))) faster? (It takes some work to handle NA values, but rowSums(is.na(as.matrix(df))) == ncol(df) should find those rows quickly...)
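A sketch of what this comment suggests (the exact NA handling is an assumption, not spelled out in the comment):
# base-R version: tabulate each row, then mark rows that are entirely NA as NA
# (to match the dplyr version, which returns NA_real_ for such rows)
count_identical_base <- function(df) {
  m <- as.matrix(df)
  out <- apply(m, MARGIN = 1, FUN = \(x) max(tabulate(x)))
  out[rowSums(is.na(m)) == ncol(m)] <- NA
  out
}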
Here is a vectorized function using collapse::fmode. It will process your "real" dataset in less than half a second, and it works with any data type. It is unclear whether NA should be included in the count, so there is an incl.na argument.
library(collapse)

rowsames <- function(df, incl.na = TRUE) {
  u <- unlist(df, 0, 0)  # flatten the data.frame column-wise, dropping names
  # integer-code every value; NAs get their own code only when incl.na = TRUE
  m <- match(u, unique(u), NA_integer_, if (incl.na) NULL else NA)
  # fmode() gives each row's most frequent code (grouped by row index); comparing
  # it against m flags cells equal to the row mode, and rowSums() counts them
  rowSums(`dim<-`(fmode(m, sequence(rep.int(nrow(df), ncol(df))), na.rm = TRUE,
                        use.g.names = FALSE) == m, dim(df)), na.rm = TRUE)
}
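As a quick check of the "any data type" claim (this example is mine, not from the original answer), the same function works on character columns, since it only relies on match() and fmode():
chr <- data.frame(a = c("x", "y", NA), b = c("x", "y", "z"), c = c("x", "x", "z"))
rowsames(chr)  # should give 3 2 2: how many values in each row equal that row's most frequent value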
Applying it to the example data.frame:
set.seed(1548243204)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1000, replace = TRUE))
rowsames(df)[1:10]
#> [1] 2 3 3 2 3 2 2 2 3 2
rowsames(df, FALSE)[1:10]
#> [1] 2 3 3 2 3 2 2 2 3 1
Note that NA is the most common value in row 10, so rowsames returns 2 for that row when incl.na = TRUE and 1 otherwise.
df[1:10,]
#>    statement_1 statement_2 statement_3 statement_4 statement_5
#> 1            5           5           4          NA           2
#> 2            4           3           5           5           5
#> 3            5           5          NA           5          NA
#> 4            4           5           2           1           5
#> 5            3           4           4           4           3
#> 6            3          NA           3           1           1
#> 7           NA           4          NA           2           4
#> 8           NA           1           4           3           3
#> 9            4           2           4           3           4
#> 10           2          NA          NA           4           5
"my actual data has 30 columns and 200,000 rows"
df <- as.data.frame(matrix(sample(c(NA, 1:5), 6e6, 1), 2e5, 30, 0,
                           list(NULL, paste0("statement_", 1:30))))
system.time(rowsames(df))
#> user system elapsed
#> 0.41 0.03 0.43
Additional benchmarks
Extending @RonakShah's benchmark:
set.seed(3452)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1e5, replace = TRUE))
fun_dplyr <- function(df) {
  columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
  df %>%
    rowwise() %>%
    mutate(identical_count = case_when(
      # all NA, then NA
      all(is.na(c_across(all_of(columns)))) ~ NA_real_,
      # else, count the number of identical values
      TRUE ~ max(table(c_across(all_of(columns)))))) %>%
    ungroup()
}
fun_apply <- function(df) {
  apply(as.matrix(df), MARGIN = 1, FUN = \(x) max(tabulate(x)))
}
fun_dapply <- function(df) {
  collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
}
Timings:
microbenchmark::microbenchmark(
  # fun_dplyr = fun_dplyr(df), # too slow
  fun_apply = fun_apply(df),
  fun_dapply = fun_dapply(df),
  rowsames = rowsames(df)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> fun_apply 298.4000 327.0602 355.26551 352.78755 379.27705 433.5157 100
#> fun_dapply 235.7283 261.1802 300.46456 287.33650 335.46850 442.8762 100
#> rowsames 34.4906 36.1296 37.61214 37.16805 38.68065 47.0696 100
Nice, another 10x on top of the already 300-fold speedup...
You can use collapse::dapply, which will be faster.
collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
To handle rows where all values are NA, you can use hablar::max_, a function that returns NA when all values are NA.
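For reference, a minimal illustration of the difference (assuming the hablar package is installed):
max(c(NA_real_, NA_real_), na.rm = TRUE)  # warns and returns -Inf
hablar::max_(c(NA_real_, NA_real_))       # returns NA, per the behaviour described above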
Benchmark:
set.seed(3452)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1e5, replace = TRUE))
fun_dplyr <- function(df) {
  columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
  df %>%
    rowwise() %>%
    mutate(identical_count = case_when(
      # all NA, then NA
      all(is.na(c_across(all_of(columns)))) ~ NA_real_,
      # else, count the number of identical values
      TRUE ~ max(table(c_across(all_of(columns)))))) %>%
    ungroup()
}
fun_apply <- function(df) {
  apply(as.matrix(df), MARGIN = 1, FUN = \(x) max(tabulate(x)))
}
fun_dapply <- function(df) {
  collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
}
microbenchmark::microbenchmark(
  fun_dplyr = fun_dplyr(df),
  fun_apply = fun_apply(df),
  fun_dapply = fun_dapply(df),
  times = 10L
)
This returns:
Unit: milliseconds
expr min lq mean median uq max neval
fun_dplyr 125572.7109 126954.4355 131877.4115 133178.7886 135347.5074 136411.1468 10
fun_apply 413.7097 439.3520 476.6822 456.8837 507.4259 583.4736 10
fun_dapply 293.8734 315.4473 380.0586 336.0351 363.5058 629.0772 10
Another row-wise tabulation implementation, this time using matrixStats. You can wrap the logic in a function to filter out straightliners above a given threshold:
> fltr_straight <- \(x, threshold=.9) {
+   mx <- as.matrix(x) |> matrixStats::rowTabulates() |> matrixStats::rowMaxs()
+   mx/ncol(x) <= threshold
+ }
Usage:
> df_fltr <- df[fltr_straight(df[1:5]), ]
Check which respondents have been filtered out:
> setdiff(rownames(df), rownames(df_fltr))
[1] "171" "261" "287" "586"
Data:
set.seed(63572972)
df <- data.frame(
  matrix(sample(c(NA, 1:5), 1000*5, replace=TRUE),
         1000, 5)
)