我想计算一行中有多少个值是相同的。这个想法是为了能够过滤掉那些回答相同的问题(即回答所有问题都相同)的受访者,例如通过过滤掉那些在各个列中有超过 90% 相同答案的受访者。

我想出了下面的代码,它可以工作,但在大型数据集上速度很慢。下面的示例有 5 列和 1000 行,但我的实际数据有 30 列和 200.000 行。

# define function
count_identical_values <- function(df){
  columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
  
  df %>%
    rowwise() %>%
    mutate(identical_count = case_when(
      # all NA, then NA
      all(is.na(c_across(all_of(columns)))) ~ NA_real_,
      # else, count the number of identical values
      TRUE ~ max(table(c_across(all_of(columns)))))) %>%
    ungroup()
}

# make df
df = data.frame(statement_1 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1000, replace = TRUE))

# apply function
df = count_identical_values(df)

如果有人知道如何加快这一速度,那就太好了!

1

  • 1
    您的实际数据中的值都是整数和 NA 吗?


    – 


最佳答案
4

apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x)))

更快?(需要做一些工作来处理NA值,但rowSums(is.na(as.matrix(df))) == ncol(df)可能会很快找到它们……)

0

下面是一个使用 的矢量化函数collapse::fmode。它将在不到半秒的时间内处理您的“真实”数据集,并且可以处理任何数据类型。不清楚是否NA应包括在计数中,因此incl.na有争论。

rowsames <- function(df, incl.na = TRUE) {
  u <- unlist(df, 0, 0)
  m <- match(u, unique(u), NA_integer_, if (incl.na) NULL else NA)
  rowSums(`dim<-`(fmode(m, sequence(rep.int(nrow(df), ncol(df))), na.rm = TRUE,
                        use.g.names = FALSE) == m, dim(df)), na.rm = TRUE)
}

将其应用于示例data.frame

set.seed(1548243204)

df = data.frame(statement_1 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1000, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1000, replace = TRUE))

rowsames(df)[1:10]
#>  [1] 2 3 3 2 3 2 2 2 3 2
rowsames(df, FALSE)[1:10]
#>  [1] 2 3 3 2 3 2 2 2 3 1

注意NA是第 10 行中最常见的值,因此如果设置为,则返回该行,否则rowsames返回2incl.naTRUE1

df[1:10,]
#>    statement_1 statement_2 statement_3 statement_4 statement_5
#> 1            5           5           4          NA           2
#> 2            4           3           5           5           5
#> 3            5           5          NA           5          NA
#> 4            4           5           2           1           5
#> 5            3           4           4           4           3
#> 6            3          NA           3           1           1
#> 7           NA           4          NA           2           4
#> 8           NA           1           4           3           3
#> 9            4           2           4           3           4
#> 10           2          NA          NA           4           5

我的实际数据有 30 列和 200.000 行

df <- as.data.frame(matrix(sample(c(NA, 1:5), 6e6, 1), 2e5, 30, 0,
                           list(NULL, paste0("statement_", 1:30))))
system.time(rowsames(df))
#>    user  system elapsed 
#>    0.41    0.03    0.43

额外的基准测试

扩展@RonakShah 的基准测试:

set.seed(3452)

df = data.frame(statement_1 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1e5, replace = TRUE))


fun_dplyr <- function(df) {
  columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
  
  df %>%
    rowwise() %>%
    mutate(identical_count = case_when(
      # all NA, then NA
      all(is.na(c_across(all_of(columns)))) ~ NA_real_,
      # else, count the number of identical values
      TRUE ~ max(table(c_across(all_of(columns)))))) %>%
    ungroup()
}

fun_apply <- function(df) {
  apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x)))
}

fun_dapply <- function(df) {
  collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
}

时间安排:

microbenchmark::microbenchmark(
  # fun_dplyr = fun_dplyr(df), # too slow
  fun_apply = fun_apply(df),
  fun_dapply = fun_dapply(df),
  rowsames = rowsames(df)
)
#> Unit: milliseconds
#>        expr      min       lq      mean    median        uq      max neval
#>   fun_apply 298.4000 327.0602 355.26551 352.78755 379.27705 433.5157   100
#>  fun_dapply 235.7283 261.1802 300.46456 287.33650 335.46850 442.8762   100
#>    rowsames  34.4906  36.1296  37.61214  37.16805  38.68065  47.0696   100

1

  • 很好,在已经提高 300 倍的速度的基础上又提高了 10 倍……


    – 

您可以使用collapse::dapply它会更快。

collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)

要处理所有NA值,您可以使用当所有值都为时hablar::max_返回的函数NANA

基准:

set.seed(3452)

df = data.frame(statement_1 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_2 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_3 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_4 = sample(c(NA, 1:5), 1e5, replace = TRUE),
                statement_5 = sample(c(NA, 1:5), 1e5, replace = TRUE))


fun_dplyr <- function(df) {
    columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
    
    df %>%
      rowwise() %>%
      mutate(identical_count = case_when(
        # all NA, then NA
        all(is.na(c_across(all_of(columns)))) ~ NA_real_,
        # else, count the number of identical values
        TRUE ~ max(table(c_across(all_of(columns)))))) %>%
      ungroup()
}

fun_apply <- function(df) {
  apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x)))
}

fun_dapply <- function(df) {
  collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
}

microbenchmark::microbenchmark(
  fun_dplyr = fun_dplyr(df), 
  fun_apply = fun_apply(df), 
  fun_dapply = fun_dapply(df), 
  times = 10L
)

返回

Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
  fun_dplyr 125572.7109 126954.4355 131877.4115 133178.7886 135347.5074 136411.1468    10
  fun_apply    413.7097    439.3520    476.6822    456.8837    507.4259    583.4736    10
 fun_dapply    293.8734    315.4473    380.0586    336.0351    363.5058    629.0772    10

使用 的另一种按行制表的实现matrixStats。您可以在函数中包装逻辑以过滤掉超过特定阈值的直接回答者:

> fltr_straight <- \(x, threshold=.9) {
+   mx <- as.matrix(x) |> matrixStats::rowTabulates() |> matrixStats::rowMaxs()
+   mx/ncol(x) <= threshold
+ }

用法

> df_fltr <- df[fltr_straight(df[1:5]), ]

检查哪些人已被过滤掉:

> setdiff(rownames(df), rownames(df_fltr))
[1] "171" "261" "287" "586"

数据:

set.seed(63572972)
df <- data.frame(
  matrix(sample(c(NA, 1:5), 1000*5, replace=TRUE),
         1000, 5)
)