我有一个变量x,我想将其分成三组,每个组具有相同的观测值。但是,使用分位数并没有产生最相等的组,因为分位数的截断点可能会导致平局被分配到多个组中。我正在寻找一个函数或算法,它可以找到最佳截断点,同时确保平局不会分散到多个组中。

x = c(26, 34, 27, 26, 38, 40, 34, 28, 27, 36, 29, 30, 29, 44, 30, 
      34, 32, 30, 26, 29, 34, 32, 38, 27, 35, 29, 28, 34, 26, 27, 27, 
      30, 27, 28, 27, 28, 28, 27, 29, 29, 28, 29, 29, 28, 29, 29, 28, 
      27, 29, 27, 36, 34, 34, 39, 34, 31, 31, 33, 35, 31, 31, 32, 37, 
      38, 32, 31, 28, 33, 33, 28, 27, 27, 30, 31, 32, 28, 27, 31, 36, 
      27, 33, 31, 34, 31, 35, 38, 37, 36, 39, 33, 33, 28, 41, 34, 35, 
      37, 37, 41, 32, 37, 30, 34, 38, 30, 40, 35, 31, 30, 30, 29, 29, 
      30, 29, 35, 28, 27, 27, 27, 29, 27, 28, 27, 27, 27, 26, 28, 28, 
      27, 29, 29, 27, 27, 27, 27, 29, 27, 28, 27, 28, 34, 29, 28, 28, 
      28, 29, 38, 33, 39, 28, 27, 28, 27, 29, 34, 29, 32, 70, 26, 29, 
      43, 48, 30, 30, 27, 26, 29, 27, 27, 27, 27, 28, 28, 27, 28, 28, 
      27, 28, 28, 38, 52, 26, 31, 56, 29, 29, 36, 28, 35, 32, 34, 35, 
      28, 27, 37, 26, 26, 32, 26, 27, 30, 28, 28, 30, 29, 30, 29, 29, 
      28, 26, 33, 39, 26, 31, 27, 28, 30, 30, 28, 28, 29, 26, 27, 26, 
      29, 28, 28, 27, 27, 27, 28, 27, 28, 28, 28, 28, 28, 27, 27, 29, 
      27, 26, 28, 28, 27, 27, 28, 27, 28, 28, 30, 27, 30, 28, 32, 34, 
      28, 27, 28, 28, 27, 28, 27, 27, 27, 28, 27, 28, 27, 27, 28, 27, 
      27, 27, 27, 27, 28, 27, 27, 27, 26, 27, 27, 30, 28, 27, 30, 30, 
      42, 26, 27, 40, 33, 29, 29, 29, 52, 58, 44, 32, 43, 30, 27, 38, 
      30, 27, 30, 27, 31, 39, 35, 32, 32, 34, 45, 31, 44, 42, 29, 29, 
      30, 30, 50, 30, 33, 31, 35, 27, 28, 27, 28, 55, 28, 28, 28, 27, 
      27, 28, 29, 27, 28, 27, 28, 28, 28, 28, 27, 28, 29, 34, 45, 27, 
      29, 61, 38, 62, 29, 36, 36, 30, 31, 45, 27, 30, 28, 29, 44, 45, 
      42, 52, 50, 52, 42, 38, 42, 32, 27, 37, 40, 52, 27, 36, 38, 39, 
      34, 30, 29, 34, 29, 26, 35, 43, 33, 40, 35, 33, 41, 61, 45, 35, 
      52, 50, 38, 43, 29, 35, 38, 39, 31, 28, 28, 29, 34, 27, 30, 32, 
      28, 26, 28, 27, 26, 29, 27, 26, 29, 29, 27, 29, 27, 27, 29, 27, 
      30, 29, 25, 30, 27, 29, 29, 30, 30, 27, 30, 28, 28, 27, 29, 29, 
      30, 29, 27, 28, 28, 28, 29, 28, 28, 27, 28, 29, 28, 29, 27, 28, 
      28, 28, 30, 27, 27, 28, 26, 28, 27, 27, 28, 28, 28, 28, 27, 27, 
      28, 27, 28, 27, 35, 27, 27, 28, 29, 27, 27, 28, 26, 27, 28, 28, 
      28, 27, 27, 27, 28, 32, 27, 28, 28, 29, 28, 28, 27, 28, 28, 30, 
      29, 28, 25, 27, 28, 30, 28, 30, 30, 28, 30, 30, 28, 29, 30, 28, 
      28, 26, 27, 28, 45, 36, 40, 28, 50, 45, 30, 45, 40, 30, 45, 45, 
      29, 45, 35, 40, 40, 30, 30, 30, 45, 40, 40, 40, 40, 40, 40, 35, 
      34, 49, 40, 30, 61, 35, 40, 30, 36, 35, 29, 27, 48, 28, 27, 27, 
      26, 27, 29, 27, 26, 27, 31, 27, 27, 28, 29, 28, 27, 28, 29, 38, 
      30, 26, 36, 40, 58, 57, 30, 33, 56, 35, 39, 37, 38, 46, 37, 39, 
      39, 45, 35, 46, 58, 65, 60, 45, 32, 36, 43, 32, 68, 39, 28, 31, 
      27, 28, 27, 37, 38, 30, 30, 28, 36, 45, 28, 26, 28, 28, 28, 27, 
      26, 28, 27, 26, 26, 27, 28, 31, 32, 37, 35, 29, 33, 35, 29, 41, 
      32, 36, 29, 28, 28, 28, 37, 36, 37, 35, 31, 32, 30, 27, 31, 32, 
      31, 33, 28, 33, 29, 27, 28, 31, 28, 31, 28, 34, 27, 27, 28, 27, 
      27, 27, 27, 26, 26, 26, 27, 27, 28, 26, 31, 26, 29, 31, 29, 29, 
      30, 29, 30, 31, 32, 29, 30, 27, 32, 27, 26, 31, 31, 31, 27, 27, 
      33, 27, 28, 28, 28, 26, 27, 27, 28, 30, 27, 27, 30, 29, 26, 27, 
      28, 27, 26, 26, 28, 27, 26, 28, 28, 26, 28, 27, 29, 27, 28, 28, 
      26, 26, 29, 28, 27, 27, 27, 28, 26, 25, 27, 29, 30, 36, 40, 28, 
      38, 26, 27, 27, 50, 27, 45, 27, 28, 26, 25, 35, 35, 44, 30, 27, 
      31, 27, 28, 27, 27, 28, 28, 28, 35, 33, 30, 28, 28, 29, 29, 36, 
      32, 36, 34, 32, 28, 28, 29, 28, 28, 32, 30, 35, 33, 36, 32, 30, 
      32, 36, 34)

quantile(x, probs = c(0.333, 0.666))
#> 33.3% 66.6% 
#>    28    31

l = cut(x, breaks = c(-Inf, 28, 31, Inf))
table(l)
#> l
#> (-Inf,28]   (28,31] (31, Inf] 
#>       387       185       246

#using different cut-off points yielded more equal groups
l = cut(x, breaks = c(-Inf, 28, 32, Inf))
table(l)
#> l
#> (-Inf,28]   (28,32] (32, Inf] 
#>       387       214       217

#again using different cut-off points which yielded more equal groups
l = cut(x, breaks = c(-Inf, 27, 32, Inf))
table(l)
#> l
#> (-Inf,27]   (27,32] (32, Inf] 
#>       222       379       217

创建于 2024-10-07,使用

编辑:我认为“平等”这个词不清楚,所以我想我可以说,我寻求最合适的分配,使最高组和最低组观察数字之间的差异最小,并且只对连续的数字进行分组,而没有多个组之间的联系

10

  • 这个函数的具体要求是什么?如果你的数据都是相同的值怎么办?你提供的测试用例越多,就越容易知道是否能满足要求。


    – 

  • @MrFlick 抱歉造成混淆,假设这个变量是年龄,出于某些原因,我想将其分成最相等的组。当然,如果数据都是相同的值,我就不需要拆分它或使用它,因为我认为它现在被视为常数。如果问题仍然模糊,请告诉我以便编辑它。


    – 

  • 最相等的组?仅此而已?即您仅对连续数字进行分组,还是其他任何值都可以归入任何组?


    – 

  • @Onyambu 我正在寻找一种对连续数字进行分组的方法,类似于使用分位数提供的截止点。但是,正如我所说,我需要考虑数据中的平局,因为平局不能出现在多个组中。我相信这可以通过迭代分位数并尝试不同的截止点直到找到最合适的分组来实现。有没有更优雅的方法或 R 中的特定函数可以有效地完成此操作?


    – 

  • 你有 818 个数据点,对于相等分组,每个组应该有大约 272 个数据点,当考虑到数据似乎呈泊松分布且 lambda 约为 31 时,你不能这样做


    – 



最佳答案
3

我创建了一个函数,它采用数字和断点的向量,然后计算每个箱的长度与平均箱大小(总长度除以 n,即断点数)相比,并返回箱大小和平均箱大小之间的绝对差异之和。

然后我可以使用optim优化该函数并找到最佳断点。但是,由于optim它不是设计用于整数的,因此我将断点和向量(即 x)除以 100,以便更容易达到最佳效果。

fsplit <- function(param, xx, nn){
  l  <- cut(xx, breaks = c(-Inf, param, Inf))
  lt <- table(l)
  d  <- lt - length(xx)/nn
  return(sum(abs(d)))
}

m1 <- optim(c(28,31)/100, xx = x/100, nn = 3, fsplit, method='CG')

## optimal break points
floor(m1$par*100)
#> [1] 27 30
fsplit(floor(m1$par*100), x, 3)
#> [1] 101.3333

table(cut(x, breaks = c(-Inf, floor(m1$par*100), Inf)))
#> (-Inf,27]   (27,30] (30, Inf] 
#>       222       318       278 

基准:

对于 OP 的示例来说,检查每个可能的断点效果相对较好,但对于较大的向量,这将非常昂贵。见下文;

Mdd <- function(x){
fsplit <- function(param, xx, nn){
  l  <- cut(xx, breaks = c(-Inf, param, Inf))
  lt <- table(l)
  d  <- lt - length(xx)/nn
  return(sum(abs(d)))
}
q1 <- quantile(x, probs = c(1/3, 2/3))
d1 <- 10^ceiling(log10(max(x)))
m1 <- optim(q1/d1, xx = x/100, nn = 3, fsplit, method='CG')
unname(floor(m1$par*d1))
}

Ric <- function(x) {
  tbl <- unclass(table(x))
  cutting <- combn(length(tbl) - 1, 2)
  sums <- apply(cutting, 2, \(i) c(
    sum(tbl[1:i[1]]),        
    sum(tbl[(i[1]+1):i[2]]), 
    sum(tbl[-(1:i[2])])))    
  stopifnot(all(colSums(sums) == sum(tbl)))
  count_diff <- apply(sums, 2, \(i) max(i) - min(i))
  return(as.numeric(names(tbl)[cutting[,order(count_diff)[1]]]))
}

TIC <- function(x){
  res <- combn(sort(unique(x)), 2, \(k)
    table(cut(x, breaks = c(-Inf, k, Inf))), simplify = FALSE)
  as.numeric(unlist(strsplit(
    gsub("\\(|]|\\)", "", names(res[[which.min(sapply(res, var))]][2])),",")))
}

对于 OP 的向量,性能是可比的:

microbenchmark::microbenchmark(Mdd(x), TIC(x), Ric(x), 
                               check = "identical")

Unit: milliseconds
   expr     min      lq      mean  median      uq      max neval cld
 Mdd(x)  3.2567  3.6213  4.611345  3.8095  4.1240  51.8633   100   a 
 TIC(x) 81.5470 85.7937 93.407666 91.9252 96.8024 169.0715   100   b
 Ric(x)  3.4082  3.7759  4.664426  3.9968  4.5983  16.9820   100   a 

但对于更大的矢量……

set.seed(123)
xx <- sample(15:200, 2000, replace = T)

microbenchmark::microbenchmark(Mdd(xx), TIC(xx), Ric(xx), 
                               check = "identical", times = 20, unit = "ms")

Unit: milliseconds
    expr      min        lq       mean    median        uq      max neval cld
 Mdd(xx)    0.975    1.0793    1.24086    1.2914    1.3601    1.509    20   a  
 TIC(xx) 3179.531 3300.5153 3408.50612 3389.3137 3488.5846 3689.105    20   b 
 Ric(xx)  142.919  151.5047  176.46253  160.1243  165.8123  288.897    20   c

更新

如果你只想要 3 组,你可以尝试

res <- combn(sort(unique(x)), 2, \(k) table(cut(x, breaks = c(-Inf, k, Inf))), simplify = FALSE)
res[[which.min(sapply(res, var))]]

给出

(-Inf,27]   (27,30] (30, Inf]
      222       318       278

较早的答案

这不是一个聪明的方法,但希望可能会有所帮助

n <- 2
lst <- c()
repeat {
    q <- quantile(x, probs = seq.int(n - 1) / n)
    if (!anyDuplicated(q)) {
        lst <- c(lst,list(table(cut(x, breaks = c(-Inf, q, Inf)))))
    } else {
        break
    }
    n <- n + 1
}
lst[[which.min(sapply(lst, var))]]

给出

(-Inf,27]   (27,28]   (28,30]   (30,35] (35, Inf]
      222       165       153       132       146

2

  • 我想将数字分成 3 组,正如我在问题中所展示的,分位数的问题在于,由于平局,它们不能给出最平衡/最优质的组,所以我认为我不应该依赖它们?


    – 


  • @NEA 查看我的更新


    – 

该脚本会计算所有可能的组,并确定“最相等”的切点,理解为较大组与较小组之间的差异最小的切点。

这种组合方法仅适用于中等数量的组和数据大小。否则,根据总和确定组的任务是 NP 难题(我不知道它是否按顺序切割)

# Count and order the classes
tbl <- unclass(table(x))

# Enumerate the possible cutting points. There are 595 possibilities
cutting <- combn(length(tbl) - 1, 2)

# sum the nubmber of elements of each possible group
sums <- apply(cutting, 2, \(i) c(
  sum(tbl[1:i[1]]),        # From the minimal to the first cutting point (inclusive)
  sum(tbl[(i[1]+1):i[2]]), # from next to the first cutting point to second (inclusive)
  sum(tbl[-(1:i[2])])))    # from next to second to last

#check that sum of posible "sums" equals the sum iof tbl (818) 
stopifnot(all(colSums(sums) == sum(tbl)))

#calculate differnece between the largest and the smallest group
# as a metric of "most equal groups.
count_diff <- apply(sums, 2, \(i) max(i) - min(i))

#FINALLY: best cut points (inclusive)
print(names(tbl)[cutting[,order(count_diff)[1]]])
# 27 & 30
print(sums[,order(count_diff)[1]])
# 222, 318, 278

#second best  (and so on...)
names(tbl)[cutting[,order(count_diff)[2]]]

数据:

x = c(26, 34, 27, 26, 38, 40, 34, 28, 27, 36, 29, 30, 29, 44, 30, 
      34, 32, 30, 26, 29, 34, 32, 38, 27, 35, 29, 28, 34, 26, 27, 27, 
      30, 27, 28, 27, 28, 28, 27, 29, 29, 28, 29, 29, 28, 29, 29, 28, 
      27, 29, 27, 36, 34, 34, 39, 34, 31, 31, 33, 35, 31, 31, 32, 37, 
      38, 32, 31, 28, 33, 33, 28, 27, 27, 30, 31, 32, 28, 27, 31, 36, 
      27, 33, 31, 34, 31, 35, 38, 37, 36, 39, 33, 33, 28, 41, 34, 35, 
      37, 37, 41, 32, 37, 30, 34, 38, 30, 40, 35, 31, 30, 30, 29, 29, 
      30, 29, 35, 28, 27, 27, 27, 29, 27, 28, 27, 27, 27, 26, 28, 28, 
      27, 29, 29, 27, 27, 27, 27, 29, 27, 28, 27, 28, 34, 29, 28, 28, 
      28, 29, 38, 33, 39, 28, 27, 28, 27, 29, 34, 29, 32, 70, 26, 29, 
      43, 48, 30, 30, 27, 26, 29, 27, 27, 27, 27, 28, 28, 27, 28, 28, 
      27, 28, 28, 38, 52, 26, 31, 56, 29, 29, 36, 28, 35, 32, 34, 35, 
      28, 27, 37, 26, 26, 32, 26, 27, 30, 28, 28, 30, 29, 30, 29, 29, 
      28, 26, 33, 39, 26, 31, 27, 28, 30, 30, 28, 28, 29, 26, 27, 26, 
      29, 28, 28, 27, 27, 27, 28, 27, 28, 28, 28, 28, 28, 27, 27, 29, 
      27, 26, 28, 28, 27, 27, 28, 27, 28, 28, 30, 27, 30, 28, 32, 34, 
      28, 27, 28, 28, 27, 28, 27, 27, 27, 28, 27, 28, 27, 27, 28, 27, 
      27, 27, 27, 27, 28, 27, 27, 27, 26, 27, 27, 30, 28, 27, 30, 30, 
      42, 26, 27, 40, 33, 29, 29, 29, 52, 58, 44, 32, 43, 30, 27, 38, 
      30, 27, 30, 27, 31, 39, 35, 32, 32, 34, 45, 31, 44, 42, 29, 29, 
      30, 30, 50, 30, 33, 31, 35, 27, 28, 27, 28, 55, 28, 28, 28, 27, 
      27, 28, 29, 27, 28, 27, 28, 28, 28, 28, 27, 28, 29, 34, 45, 27, 
      29, 61, 38, 62, 29, 36, 36, 30, 31, 45, 27, 30, 28, 29, 44, 45, 
      42, 52, 50, 52, 42, 38, 42, 32, 27, 37, 40, 52, 27, 36, 38, 39, 
      34, 30, 29, 34, 29, 26, 35, 43, 33, 40, 35, 33, 41, 61, 45, 35, 
      52, 50, 38, 43, 29, 35, 38, 39, 31, 28, 28, 29, 34, 27, 30, 32, 
      28, 26, 28, 27, 26, 29, 27, 26, 29, 29, 27, 29, 27, 27, 29, 27, 
      30, 29, 25, 30, 27, 29, 29, 30, 30, 27, 30, 28, 28, 27, 29, 29, 
      30, 29, 27, 28, 28, 28, 29, 28, 28, 27, 28, 29, 28, 29, 27, 28, 
      28, 28, 30, 27, 27, 28, 26, 28, 27, 27, 28, 28, 28, 28, 27, 27, 
      28, 27, 28, 27, 35, 27, 27, 28, 29, 27, 27, 28, 26, 27, 28, 28, 
      28, 27, 27, 27, 28, 32, 27, 28, 28, 29, 28, 28, 27, 28, 28, 30, 
      29, 28, 25, 27, 28, 30, 28, 30, 30, 28, 30, 30, 28, 29, 30, 28, 
      28, 26, 27, 28, 45, 36, 40, 28, 50, 45, 30, 45, 40, 30, 45, 45, 
      29, 45, 35, 40, 40, 30, 30, 30, 45, 40, 40, 40, 40, 40, 40, 35, 
      34, 49, 40, 30, 61, 35, 40, 30, 36, 35, 29, 27, 48, 28, 27, 27, 
      26, 27, 29, 27, 26, 27, 31, 27, 27, 28, 29, 28, 27, 28, 29, 38, 
      30, 26, 36, 40, 58, 57, 30, 33, 56, 35, 39, 37, 38, 46, 37, 39, 
      39, 45, 35, 46, 58, 65, 60, 45, 32, 36, 43, 32, 68, 39, 28, 31, 
      27, 28, 27, 37, 38, 30, 30, 28, 36, 45, 28, 26, 28, 28, 28, 27, 
      26, 28, 27, 26, 26, 27, 28, 31, 32, 37, 35, 29, 33, 35, 29, 41, 
      32, 36, 29, 28, 28, 28, 37, 36, 37, 35, 31, 32, 30, 27, 31, 32, 
      31, 33, 28, 33, 29, 27, 28, 31, 28, 31, 28, 34, 27, 27, 28, 27, 
      27, 27, 27, 26, 26, 26, 27, 27, 28, 26, 31, 26, 29, 31, 29, 29, 
      30, 29, 30, 31, 32, 29, 30, 27, 32, 27, 26, 31, 31, 31, 27, 27, 
      33, 27, 28, 28, 28, 26, 27, 27, 28, 30, 27, 27, 30, 29, 26, 27, 
      28, 27, 26, 26, 28, 27, 26, 28, 28, 26, 28, 27, 29, 27, 28, 28, 
      26, 26, 29, 28, 27, 27, 27, 28, 26, 25, 27, 29, 30, 36, 40, 28, 
      38, 26, 27, 27, 50, 27, 45, 27, 28, 26, 25, 35, 35, 44, 30, 27, 
      31, 27, 28, 27, 27, 28, 28, 28, 35, 33, 30, 28, 28, 29, 29, 36, 
      32, 36, 34, 32, 28, 28, 29, 28, 28, 32, 30, 35, 33, 36, 32, 30, 
      32, 36, 34)