我有一个变量x
,我想将其分成三组,每个组具有相同的观测值。但是,使用分位数并没有产生最相等的组,因为分位数的截断点可能会导致平局被分配到多个组中。我正在寻找一个函数或算法,它可以找到最佳截断点,同时确保平局不会分散到多个组中。
x = c(26, 34, 27, 26, 38, 40, 34, 28, 27, 36, 29, 30, 29, 44, 30,
34, 32, 30, 26, 29, 34, 32, 38, 27, 35, 29, 28, 34, 26, 27, 27,
30, 27, 28, 27, 28, 28, 27, 29, 29, 28, 29, 29, 28, 29, 29, 28,
27, 29, 27, 36, 34, 34, 39, 34, 31, 31, 33, 35, 31, 31, 32, 37,
38, 32, 31, 28, 33, 33, 28, 27, 27, 30, 31, 32, 28, 27, 31, 36,
27, 33, 31, 34, 31, 35, 38, 37, 36, 39, 33, 33, 28, 41, 34, 35,
37, 37, 41, 32, 37, 30, 34, 38, 30, 40, 35, 31, 30, 30, 29, 29,
30, 29, 35, 28, 27, 27, 27, 29, 27, 28, 27, 27, 27, 26, 28, 28,
27, 29, 29, 27, 27, 27, 27, 29, 27, 28, 27, 28, 34, 29, 28, 28,
28, 29, 38, 33, 39, 28, 27, 28, 27, 29, 34, 29, 32, 70, 26, 29,
43, 48, 30, 30, 27, 26, 29, 27, 27, 27, 27, 28, 28, 27, 28, 28,
27, 28, 28, 38, 52, 26, 31, 56, 29, 29, 36, 28, 35, 32, 34, 35,
28, 27, 37, 26, 26, 32, 26, 27, 30, 28, 28, 30, 29, 30, 29, 29,
28, 26, 33, 39, 26, 31, 27, 28, 30, 30, 28, 28, 29, 26, 27, 26,
29, 28, 28, 27, 27, 27, 28, 27, 28, 28, 28, 28, 28, 27, 27, 29,
27, 26, 28, 28, 27, 27, 28, 27, 28, 28, 30, 27, 30, 28, 32, 34,
28, 27, 28, 28, 27, 28, 27, 27, 27, 28, 27, 28, 27, 27, 28, 27,
27, 27, 27, 27, 28, 27, 27, 27, 26, 27, 27, 30, 28, 27, 30, 30,
42, 26, 27, 40, 33, 29, 29, 29, 52, 58, 44, 32, 43, 30, 27, 38,
30, 27, 30, 27, 31, 39, 35, 32, 32, 34, 45, 31, 44, 42, 29, 29,
30, 30, 50, 30, 33, 31, 35, 27, 28, 27, 28, 55, 28, 28, 28, 27,
27, 28, 29, 27, 28, 27, 28, 28, 28, 28, 27, 28, 29, 34, 45, 27,
29, 61, 38, 62, 29, 36, 36, 30, 31, 45, 27, 30, 28, 29, 44, 45,
42, 52, 50, 52, 42, 38, 42, 32, 27, 37, 40, 52, 27, 36, 38, 39,
34, 30, 29, 34, 29, 26, 35, 43, 33, 40, 35, 33, 41, 61, 45, 35,
52, 50, 38, 43, 29, 35, 38, 39, 31, 28, 28, 29, 34, 27, 30, 32,
28, 26, 28, 27, 26, 29, 27, 26, 29, 29, 27, 29, 27, 27, 29, 27,
30, 29, 25, 30, 27, 29, 29, 30, 30, 27, 30, 28, 28, 27, 29, 29,
30, 29, 27, 28, 28, 28, 29, 28, 28, 27, 28, 29, 28, 29, 27, 28,
28, 28, 30, 27, 27, 28, 26, 28, 27, 27, 28, 28, 28, 28, 27, 27,
28, 27, 28, 27, 35, 27, 27, 28, 29, 27, 27, 28, 26, 27, 28, 28,
28, 27, 27, 27, 28, 32, 27, 28, 28, 29, 28, 28, 27, 28, 28, 30,
29, 28, 25, 27, 28, 30, 28, 30, 30, 28, 30, 30, 28, 29, 30, 28,
28, 26, 27, 28, 45, 36, 40, 28, 50, 45, 30, 45, 40, 30, 45, 45,
29, 45, 35, 40, 40, 30, 30, 30, 45, 40, 40, 40, 40, 40, 40, 35,
34, 49, 40, 30, 61, 35, 40, 30, 36, 35, 29, 27, 48, 28, 27, 27,
26, 27, 29, 27, 26, 27, 31, 27, 27, 28, 29, 28, 27, 28, 29, 38,
30, 26, 36, 40, 58, 57, 30, 33, 56, 35, 39, 37, 38, 46, 37, 39,
39, 45, 35, 46, 58, 65, 60, 45, 32, 36, 43, 32, 68, 39, 28, 31,
27, 28, 27, 37, 38, 30, 30, 28, 36, 45, 28, 26, 28, 28, 28, 27,
26, 28, 27, 26, 26, 27, 28, 31, 32, 37, 35, 29, 33, 35, 29, 41,
32, 36, 29, 28, 28, 28, 37, 36, 37, 35, 31, 32, 30, 27, 31, 32,
31, 33, 28, 33, 29, 27, 28, 31, 28, 31, 28, 34, 27, 27, 28, 27,
27, 27, 27, 26, 26, 26, 27, 27, 28, 26, 31, 26, 29, 31, 29, 29,
30, 29, 30, 31, 32, 29, 30, 27, 32, 27, 26, 31, 31, 31, 27, 27,
33, 27, 28, 28, 28, 26, 27, 27, 28, 30, 27, 27, 30, 29, 26, 27,
28, 27, 26, 26, 28, 27, 26, 28, 28, 26, 28, 27, 29, 27, 28, 28,
26, 26, 29, 28, 27, 27, 27, 28, 26, 25, 27, 29, 30, 36, 40, 28,
38, 26, 27, 27, 50, 27, 45, 27, 28, 26, 25, 35, 35, 44, 30, 27,
31, 27, 28, 27, 27, 28, 28, 28, 35, 33, 30, 28, 28, 29, 29, 36,
32, 36, 34, 32, 28, 28, 29, 28, 28, 32, 30, 35, 33, 36, 32, 30,
32, 36, 34)
quantile(x, probs = c(0.333, 0.666))
#> 33.3% 66.6%
#> 28 31
l = cut(x, breaks = c(-Inf, 28, 31, Inf))
table(l)
#> l
#> (-Inf,28] (28,31] (31, Inf]
#> 387 185 246
#using different cut-off points yielded more equal groups
l = cut(x, breaks = c(-Inf, 28, 32, Inf))
table(l)
#> l
#> (-Inf,28] (28,32] (32, Inf]
#> 387 214 217
#again using different cut-off points which yielded more equal groups
l = cut(x, breaks = c(-Inf, 27, 32, Inf))
table(l)
#> l
#> (-Inf,27] (27,32] (32, Inf]
#> 222 379 217
创建于 2024-10-07,使用
编辑:我认为“平等”这个词不清楚,所以我想我可以说,我寻求最合适的分配,使最高组和最低组观察数字之间的差异最小,并且只对连续的数字进行分组,而没有多个组之间的联系
10
最佳答案
3
我创建了一个函数,它采用数字和断点的向量,然后计算每个箱的长度与平均箱大小(总长度除以 n,即断点数)相比,并返回箱大小和平均箱大小之间的绝对差异之和。
然后我可以使用optim
优化该函数并找到最佳断点。但是,由于optim
它不是设计用于整数的,因此我将断点和向量(即 x)除以 100,以便更容易达到最佳效果。
fsplit <- function(param, xx, nn){
l <- cut(xx, breaks = c(-Inf, param, Inf))
lt <- table(l)
d <- lt - length(xx)/nn
return(sum(abs(d)))
}
m1 <- optim(c(28,31)/100, xx = x/100, nn = 3, fsplit, method='CG')
## optimal break points
floor(m1$par*100)
#> [1] 27 30
fsplit(floor(m1$par*100), x, 3)
#> [1] 101.3333
table(cut(x, breaks = c(-Inf, floor(m1$par*100), Inf)))
#> (-Inf,27] (27,30] (30, Inf]
#> 222 318 278
基准:
对于 OP 的示例来说,检查每个可能的断点效果相对较好,但对于较大的向量,这将非常昂贵。见下文;
Mdd <- function(x){
fsplit <- function(param, xx, nn){
l <- cut(xx, breaks = c(-Inf, param, Inf))
lt <- table(l)
d <- lt - length(xx)/nn
return(sum(abs(d)))
}
q1 <- quantile(x, probs = c(1/3, 2/3))
d1 <- 10^ceiling(log10(max(x)))
m1 <- optim(q1/d1, xx = x/100, nn = 3, fsplit, method='CG')
unname(floor(m1$par*d1))
}
Ric <- function(x) {
tbl <- unclass(table(x))
cutting <- combn(length(tbl) - 1, 2)
sums <- apply(cutting, 2, \(i) c(
sum(tbl[1:i[1]]),
sum(tbl[(i[1]+1):i[2]]),
sum(tbl[-(1:i[2])])))
stopifnot(all(colSums(sums) == sum(tbl)))
count_diff <- apply(sums, 2, \(i) max(i) - min(i))
return(as.numeric(names(tbl)[cutting[,order(count_diff)[1]]]))
}
TIC <- function(x){
res <- combn(sort(unique(x)), 2, \(k)
table(cut(x, breaks = c(-Inf, k, Inf))), simplify = FALSE)
as.numeric(unlist(strsplit(
gsub("\\(|]|\\)", "", names(res[[which.min(sapply(res, var))]][2])),",")))
}
对于 OP 的向量,性能是可比的:
microbenchmark::microbenchmark(Mdd(x), TIC(x), Ric(x),
check = "identical")
Unit: milliseconds
expr min lq mean median uq max neval cld
Mdd(x) 3.2567 3.6213 4.611345 3.8095 4.1240 51.8633 100 a
TIC(x) 81.5470 85.7937 93.407666 91.9252 96.8024 169.0715 100 b
Ric(x) 3.4082 3.7759 4.664426 3.9968 4.5983 16.9820 100 a
但对于更大的矢量……
set.seed(123)
xx <- sample(15:200, 2000, replace = T)
microbenchmark::microbenchmark(Mdd(xx), TIC(xx), Ric(xx),
check = "identical", times = 20, unit = "ms")
Unit: milliseconds
expr min lq mean median uq max neval cld
Mdd(xx) 0.975 1.0793 1.24086 1.2914 1.3601 1.509 20 a
TIC(xx) 3179.531 3300.5153 3408.50612 3389.3137 3488.5846 3689.105 20 b
Ric(xx) 142.919 151.5047 176.46253 160.1243 165.8123 288.897 20 c
|
更新
如果你只想要 3 组,你可以尝试
res <- combn(sort(unique(x)), 2, \(k) table(cut(x, breaks = c(-Inf, k, Inf))), simplify = FALSE)
res[[which.min(sapply(res, var))]]
给出
(-Inf,27] (27,30] (30, Inf]
222 318 278
较早的答案
这不是一个聪明的方法,但希望可能会有所帮助
n <- 2
lst <- c()
repeat {
q <- quantile(x, probs = seq.int(n - 1) / n)
if (!anyDuplicated(q)) {
lst <- c(lst,list(table(cut(x, breaks = c(-Inf, q, Inf)))))
} else {
break
}
n <- n + 1
}
lst[[which.min(sapply(lst, var))]]
给出
(-Inf,27] (27,28] (28,30] (30,35] (35, Inf]
222 165 153 132 146
2
-
我想将数字分成 3 组,正如我在问题中所展示的,分位数的问题在于,由于平局,它们不能给出最平衡/最优质的组,所以我认为我不应该依赖它们?
–
-
@NEA 查看我的更新
–
|
该脚本会计算所有可能的组,并确定“最相等”的切点,理解为较大组与较小组之间的差异最小的切点。
这种组合方法仅适用于中等数量的组和数据大小。否则,根据总和确定组的任务是 NP 难题(我不知道它是否按顺序切割)
# Count and order the classes
tbl <- unclass(table(x))
# Enumerate the possible cutting points. There are 595 possibilities
cutting <- combn(length(tbl) - 1, 2)
# sum the nubmber of elements of each possible group
sums <- apply(cutting, 2, \(i) c(
sum(tbl[1:i[1]]), # From the minimal to the first cutting point (inclusive)
sum(tbl[(i[1]+1):i[2]]), # from next to the first cutting point to second (inclusive)
sum(tbl[-(1:i[2])]))) # from next to second to last
#check that sum of posible "sums" equals the sum iof tbl (818)
stopifnot(all(colSums(sums) == sum(tbl)))
#calculate differnece between the largest and the smallest group
# as a metric of "most equal groups.
count_diff <- apply(sums, 2, \(i) max(i) - min(i))
#FINALLY: best cut points (inclusive)
print(names(tbl)[cutting[,order(count_diff)[1]]])
# 27 & 30
print(sums[,order(count_diff)[1]])
# 222, 318, 278
#second best (and so on...)
names(tbl)[cutting[,order(count_diff)[2]]]
数据:
x = c(26, 34, 27, 26, 38, 40, 34, 28, 27, 36, 29, 30, 29, 44, 30,
34, 32, 30, 26, 29, 34, 32, 38, 27, 35, 29, 28, 34, 26, 27, 27,
30, 27, 28, 27, 28, 28, 27, 29, 29, 28, 29, 29, 28, 29, 29, 28,
27, 29, 27, 36, 34, 34, 39, 34, 31, 31, 33, 35, 31, 31, 32, 37,
38, 32, 31, 28, 33, 33, 28, 27, 27, 30, 31, 32, 28, 27, 31, 36,
27, 33, 31, 34, 31, 35, 38, 37, 36, 39, 33, 33, 28, 41, 34, 35,
37, 37, 41, 32, 37, 30, 34, 38, 30, 40, 35, 31, 30, 30, 29, 29,
30, 29, 35, 28, 27, 27, 27, 29, 27, 28, 27, 27, 27, 26, 28, 28,
27, 29, 29, 27, 27, 27, 27, 29, 27, 28, 27, 28, 34, 29, 28, 28,
28, 29, 38, 33, 39, 28, 27, 28, 27, 29, 34, 29, 32, 70, 26, 29,
43, 48, 30, 30, 27, 26, 29, 27, 27, 27, 27, 28, 28, 27, 28, 28,
27, 28, 28, 38, 52, 26, 31, 56, 29, 29, 36, 28, 35, 32, 34, 35,
28, 27, 37, 26, 26, 32, 26, 27, 30, 28, 28, 30, 29, 30, 29, 29,
28, 26, 33, 39, 26, 31, 27, 28, 30, 30, 28, 28, 29, 26, 27, 26,
29, 28, 28, 27, 27, 27, 28, 27, 28, 28, 28, 28, 28, 27, 27, 29,
27, 26, 28, 28, 27, 27, 28, 27, 28, 28, 30, 27, 30, 28, 32, 34,
28, 27, 28, 28, 27, 28, 27, 27, 27, 28, 27, 28, 27, 27, 28, 27,
27, 27, 27, 27, 28, 27, 27, 27, 26, 27, 27, 30, 28, 27, 30, 30,
42, 26, 27, 40, 33, 29, 29, 29, 52, 58, 44, 32, 43, 30, 27, 38,
30, 27, 30, 27, 31, 39, 35, 32, 32, 34, 45, 31, 44, 42, 29, 29,
30, 30, 50, 30, 33, 31, 35, 27, 28, 27, 28, 55, 28, 28, 28, 27,
27, 28, 29, 27, 28, 27, 28, 28, 28, 28, 27, 28, 29, 34, 45, 27,
29, 61, 38, 62, 29, 36, 36, 30, 31, 45, 27, 30, 28, 29, 44, 45,
42, 52, 50, 52, 42, 38, 42, 32, 27, 37, 40, 52, 27, 36, 38, 39,
34, 30, 29, 34, 29, 26, 35, 43, 33, 40, 35, 33, 41, 61, 45, 35,
52, 50, 38, 43, 29, 35, 38, 39, 31, 28, 28, 29, 34, 27, 30, 32,
28, 26, 28, 27, 26, 29, 27, 26, 29, 29, 27, 29, 27, 27, 29, 27,
30, 29, 25, 30, 27, 29, 29, 30, 30, 27, 30, 28, 28, 27, 29, 29,
30, 29, 27, 28, 28, 28, 29, 28, 28, 27, 28, 29, 28, 29, 27, 28,
28, 28, 30, 27, 27, 28, 26, 28, 27, 27, 28, 28, 28, 28, 27, 27,
28, 27, 28, 27, 35, 27, 27, 28, 29, 27, 27, 28, 26, 27, 28, 28,
28, 27, 27, 27, 28, 32, 27, 28, 28, 29, 28, 28, 27, 28, 28, 30,
29, 28, 25, 27, 28, 30, 28, 30, 30, 28, 30, 30, 28, 29, 30, 28,
28, 26, 27, 28, 45, 36, 40, 28, 50, 45, 30, 45, 40, 30, 45, 45,
29, 45, 35, 40, 40, 30, 30, 30, 45, 40, 40, 40, 40, 40, 40, 35,
34, 49, 40, 30, 61, 35, 40, 30, 36, 35, 29, 27, 48, 28, 27, 27,
26, 27, 29, 27, 26, 27, 31, 27, 27, 28, 29, 28, 27, 28, 29, 38,
30, 26, 36, 40, 58, 57, 30, 33, 56, 35, 39, 37, 38, 46, 37, 39,
39, 45, 35, 46, 58, 65, 60, 45, 32, 36, 43, 32, 68, 39, 28, 31,
27, 28, 27, 37, 38, 30, 30, 28, 36, 45, 28, 26, 28, 28, 28, 27,
26, 28, 27, 26, 26, 27, 28, 31, 32, 37, 35, 29, 33, 35, 29, 41,
32, 36, 29, 28, 28, 28, 37, 36, 37, 35, 31, 32, 30, 27, 31, 32,
31, 33, 28, 33, 29, 27, 28, 31, 28, 31, 28, 34, 27, 27, 28, 27,
27, 27, 27, 26, 26, 26, 27, 27, 28, 26, 31, 26, 29, 31, 29, 29,
30, 29, 30, 31, 32, 29, 30, 27, 32, 27, 26, 31, 31, 31, 27, 27,
33, 27, 28, 28, 28, 26, 27, 27, 28, 30, 27, 27, 30, 29, 26, 27,
28, 27, 26, 26, 28, 27, 26, 28, 28, 26, 28, 27, 29, 27, 28, 28,
26, 26, 29, 28, 27, 27, 27, 28, 26, 25, 27, 29, 30, 36, 40, 28,
38, 26, 27, 27, 50, 27, 45, 27, 28, 26, 25, 35, 35, 44, 30, 27,
31, 27, 28, 27, 27, 28, 28, 28, 35, 33, 30, 28, 28, 29, 29, 36,
32, 36, 34, 32, 28, 28, 29, 28, 28, 32, 30, 35, 33, 36, 32, 30,
32, 36, 34)
|
–
–
–
–
–
|