我有这些数据:
simulated_states = c("A", "E", "B", "B", "A", "C", "D", "A", "B", "D", "A", "D",
"D", "E", "D", "D", "D", "E", "A", "A", "A", "B", "A", "C", "C",
"D", "A", "A", "D", "A", "D", "A", "A", "A", "C", "C", "D", "A",
"C", "C", "D", "E", "C", "C", "C", "E", "B", "A", "E", "E", "C",
"C", "D", "E", "C", "E", "E", "A", "E", "B", "A", "A", "E", "E",
"C", "E", "C", "C", "C", "D", "E", "D", "C", "D", "A", "B", "B",
"E", "B", "A", "E", "C", "C", "D", "B", "B", "A", "C", "B", "A",
"D", "A", "D", "E", "C", "D", "D", "A", "A", "C")
我知道如何计算转移概率:
calculate_transition_probs <- function(states) {
transitions <- data.frame(
from = states,
to = c(states[-1], NA)
)
transition_counts <- table(transitions, useNA = "always")
transition_df <- as.data.frame(transition_counts)
colnames(transition_df) <- c("from", "to", "count")
transition_df <- transition_df[!is.na(transition_df$to), ]
transition_df <- transition_df %>%
group_by(from) %>%
mutate(percent = count / sum(count) * 100) %>%
ungroup()
transition_df <- transition_df[, c("from", "to", "count", "percent")]
transition_df <- transition_df[order(transition_df$from, transition_df$to), ]
return(transition_df)
}
transition_probs <- calculate_transition_probs(simulated_states)
结果如下:
from to count percent
A A 7 26.923077
A B 3 11.538462
A C 6 23.076923
A D 5 19.230769
A E 5 19.230769
B A 7 58.333333
B B 3 25.000000
B C 0 0.000000
B D 1 8.333333
B E 1 8.333333
C A 0 0.000000
C B 1 4.545455
C C 9 40.909091
C D 9 40.909091
C E 3 13.636364
D A 9 42.857143
D B 1 4.761905
D C 1 4.761905
D D 4 19.047619
D E 6 28.571429
E A 2 11.111111
E B 4 22.222222
E C 7 38.888889
E D 2 11.111111
E E 3 16.666667
现在,我想扩展它来计算 n 步概率的转移概率。
例如
- 2 个步骤:从 = (A,A) 给出到 = A,从 = (A,B) 给出到 = A,从 = (A,C) 给出到 = A…..从 = (A,B) 给出到 = B,从 = (B,B) 给出到 = B 等等。
- 3 步骤:到 = A 给定从 = (A,A,A),到 = A 给定从 = (A,B,A),等等。
- N 步骤:到 = A 从 = (A,A,A…A) 等给出。
我如何编写一个函数来执行 n 步操作?
例如对于 5 个步骤,输出应如下所示:
from1 from2 from3 from4 from5 to count percent
A A A A A A 0 0
A A A A A E 0 0
A A A A A B 0 0
A A A A A C 0 0
A A A A A D 0 0
4
5 个回答
5
尝试下面的函数。我添加了另一个参数step
来控制您想要的步骤数。
如果您设置step = 1
,则输出与您迄今为止所做的相同。
calculate_transition_probs <- function(states, step = 1) {
nc <- step+1
lagged_mat <- matrix(
states[sequence(rep(length(states), nc), 1:nc)],
ncol = nc
)
trans_prob <- lagged_mat %>%
as.data.frame(stringsAsFactors = TRUE) %>%
head(-step) %>%
group_by(pick(everything()), .drop = FALSE) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(percent = count / sum(count) * 100) %>%
ungroup()
names(trans_prob)[1:nc] <- c(paste0("from", 1:step), "to")
return(trans_prob)
}
结果
calculate_transition_probs(simulated_states, step = 3)
# # A tibble: 625 × 6
# from1 from2 from3 to count percent
# <fct> <fct> <fct> <fct> <int> <dbl>
# 1 A A A A 0 0
# 2 A A A B 1 50
# 3 A A A C 1 50
# 4 A A A D 0 0
# 5 A A A E 0 0
# 6 A A B A 1 100
# 7 A A B B 0 0
# 8 A A B C 0 0
# 9 A A B D 0 0
# 10 A A B E 0 0
# # ℹ 615 more rows
|
更新
根据@Darren Tsai 的评论,更新如下
f <- function(dat, n = 1) {
v <- unique(sort(dat))
nms <- c(paste0("from", 1:n), "to")
p <- aggregate(
perc ~ .,
cbind(
setNames(as.data.frame(embed(dat, n + 1)[, (n + 1):1]), nms),
perc = 1
), sum
)
q <- setNames(expand.grid(rep(list(v), n + 1)), nms)
transform(
merge(q, p, all = TRUE),
perc = ave(ifelse(is.na(perc), 0, perc),
ceiling(seq_along(to) / length(v)),
FUN = \(x) x / sum(na.omit(x))
)
)
}
使得
> f(simulated_states, 1)
from1 to perc
1 A A 0.26923077
2 A B 0.11538462
3 A C 0.23076923
4 A D 0.19230769
5 A E 0.19230769
6 B A 0.58333333
7 B B 0.25000000
8 B C 0.00000000
9 B D 0.08333333
10 B E 0.08333333
11 C A 0.00000000
12 C B 0.04545455
13 C C 0.40909091
14 C D 0.40909091
15 C E 0.13636364
16 D A 0.42857143
17 D B 0.04761905
18 D C 0.04761905
19 D D 0.19047619
20 D E 0.28571429
21 E A 0.11111111
22 E B 0.22222222
23 E C 0.38888889
24 E D 0.11111111
25 E E 0.16666667
> f(simulated_states, 2)
from1 from2 to perc
1 A A A 0.2857143
2 A A B 0.1428571
3 A A C 0.2857143
4 A A D 0.1428571
5 A A E 0.1428571
6 A B A 0.3333333
7 A B B 0.3333333
8 A B C 0.0000000
9 A B D 0.3333333
10 A B E 0.0000000
11 A C A 0.0000000
12 A C B 0.2000000
13 A C C 0.6000000
14 A C D 0.2000000
15 A C E 0.0000000
16 A D A 0.6000000
17 A D B 0.0000000
18 A D C 0.0000000
19 A D D 0.2000000
20 A D E 0.2000000
21 A E A 0.0000000
22 A E B 0.4000000
23 A E C 0.2000000
24 A E D 0.0000000
25 A E E 0.4000000
26 B A A 0.1428571
27 B A B 0.0000000
28 B A C 0.4285714
29 B A D 0.1428571
30 B A E 0.2857143
31 B B A 0.6666667
32 B B B 0.0000000
33 B B C 0.0000000
34 B B D 0.0000000
35 B B E 0.3333333
36 B C A NaN
37 B C B NaN
38 B C C NaN
39 B C D NaN
40 B C E NaN
41 B D A 1.0000000
42 B D B 0.0000000
43 B D C 0.0000000
44 B D D 0.0000000
45 B D E 0.0000000
46 B E A 0.0000000
47 B E B 1.0000000
48 B E C 0.0000000
49 B E D 0.0000000
50 B E E 0.0000000
51 C A A NaN
52 C A B NaN
53 C A C NaN
54 C A D NaN
55 C A E NaN
56 C B A 1.0000000
57 C B B 0.0000000
58 C B C 0.0000000
59 C B D 0.0000000
60 C B E 0.0000000
61 C C A 0.0000000
62 C C B 0.0000000
63 C C C 0.2222222
64 C C D 0.6666667
65 C C E 0.1111111
66 C D A 0.4444444
67 C D B 0.1111111
68 C D C 0.0000000
69 C D D 0.1111111
70 C D E 0.3333333
71 C E A 0.0000000
72 C E B 0.3333333
73 C E C 0.3333333
74 C E D 0.0000000
75 C E E 0.3333333
76 D A A 0.3333333
77 D A B 0.2222222
78 D A C 0.1111111
79 D A D 0.3333333
80 D A E 0.0000000
81 D B A 0.0000000
82 D B B 1.0000000
83 D B C 0.0000000
84 D B D 0.0000000
85 D B E 0.0000000
86 D C A 0.0000000
87 D C B 0.0000000
88 D C C 0.0000000
89 D C D 1.0000000
90 D C E 0.0000000
91 D D A 0.2500000
92 D D B 0.0000000
93 D D C 0.0000000
94 D D D 0.2500000
95 D D E 0.5000000
96 D E A 0.1666667
97 D E B 0.0000000
98 D E C 0.5000000
99 D E D 0.3333333
100 D E E 0.0000000
101 E A A 0.5000000
102 E A B 0.0000000
103 E A C 0.0000000
104 E A D 0.0000000
105 E A E 0.5000000
106 E B A 0.7500000
107 E B B 0.2500000
108 E B C 0.0000000
109 E B D 0.0000000
110 E B E 0.0000000
111 E C A 0.0000000
112 E C B 0.0000000
113 E C C 0.5714286
114 E C D 0.1428571
115 E C E 0.2857143
116 E D A 0.0000000
117 E D B 0.0000000
118 E D C 0.5000000
119 E D D 0.5000000
120 E D E 0.0000000
121 E E A 0.3333333
122 E E B 0.0000000
123 E E C 0.6666667
124 E E D 0.0000000
125 E E E 0.0000000
较旧 (似乎不是 OP 想要的)
您可以尝试下面的代码
f <- function(dat, n = 1) {
from <- do.call(
paste,
c(asplit(as.matrix(embed(head(dat, -1), n)[, n:1]), 2),
sep = "->"
)
)
to <- tail(dat, -n)
data.frame(from, to) %>%
count(from, to, name = "perc") %>%
mutate(perc = perc / sum(perc))
}
你将获得
> f(simulated_states, 1)
from to perc
1 A A 0.07070707
2 A B 0.03030303
3 A C 0.06060606
4 A D 0.05050505
5 A E 0.05050505
6 B A 0.07070707
7 B B 0.03030303
8 B D 0.01010101
9 B E 0.01010101
10 C B 0.01010101
11 C C 0.09090909
12 C D 0.09090909
13 C E 0.03030303
14 D A 0.09090909
15 D B 0.01010101
16 D C 0.01010101
17 D D 0.04040404
18 D E 0.06060606
19 E A 0.02020202
20 E B 0.04040404
21 E C 0.07070707
22 E D 0.02020202
23 E E 0.03030303
> f(simulated_states, 2)
from to perc
1 A->A A 0.02040816
2 A->A B 0.01020408
3 A->A C 0.02040816
4 A->A D 0.01020408
5 A->A E 0.01020408
6 A->B A 0.01020408
7 A->B B 0.01020408
8 A->B D 0.01020408
9 A->C B 0.01020408
10 A->C C 0.03061224
11 A->C D 0.01020408
12 A->D A 0.03061224
13 A->D D 0.01020408
14 A->D E 0.01020408
15 A->E B 0.02040816
16 A->E C 0.01020408
17 A->E E 0.02040816
18 B->A A 0.01020408
19 B->A C 0.03061224
20 B->A D 0.01020408
21 B->A E 0.02040816
22 B->B A 0.02040816
23 B->B E 0.01020408
24 B->D A 0.01020408
25 B->E B 0.01020408
26 C->B A 0.01020408
27 C->C C 0.02040816
28 C->C D 0.06122449
29 C->C E 0.01020408
30 C->D A 0.04081633
31 C->D B 0.01020408
32 C->D D 0.01020408
33 C->D E 0.03061224
34 C->E B 0.01020408
35 C->E C 0.01020408
36 C->E E 0.01020408
37 D->A A 0.03061224
38 D->A B 0.02040816
39 D->A C 0.01020408
40 D->A D 0.03061224
41 D->B B 0.01020408
42 D->C D 0.01020408
43 D->D A 0.01020408
44 D->D D 0.01020408
45 D->D E 0.02040816
46 D->E A 0.01020408
47 D->E C 0.03061224
48 D->E D 0.02040816
49 E->A A 0.01020408
50 E->A E 0.01020408
51 E->B A 0.03061224
52 E->B B 0.01020408
53 E->C C 0.04081633
54 E->C D 0.01020408
55 E->C E 0.02040816
56 E->D C 0.01020408
57 E->D D 0.01020408
58 E->E A 0.01020408
59 E->E C 0.02040816
> f(simulated_states, 3)
from to perc
1 A->A->A B 0.01030928
2 A->A->A C 0.01030928
3 A->A->B A 0.01030928
4 A->A->C C 0.01030928
5 A->A->D A 0.01030928
6 A->A->E E 0.01030928
7 A->B->A C 0.01030928
8 A->B->B E 0.01030928
9 A->B->D A 0.01030928
10 A->C->B A 0.01030928
11 A->C->C D 0.03092784
12 A->C->D A 0.01030928
13 A->D->A A 0.01030928
14 A->D->A D 0.02061856
15 A->D->D E 0.01030928
16 A->D->E C 0.01030928
17 A->E->B A 0.01030928
18 A->E->B B 0.01030928
19 A->E->C C 0.01030928
20 A->E->E C 0.02061856
21 B->A->A E 0.01030928
22 B->A->C B 0.01030928
23 B->A->C C 0.01030928
24 B->A->C D 0.01030928
25 B->A->D A 0.01030928
26 B->A->E C 0.01030928
27 B->A->E E 0.01030928
28 B->B->A C 0.02061856
29 B->B->E B 0.01030928
30 B->D->A D 0.01030928
31 B->E->B A 0.01030928
32 C->B->A D 0.01030928
33 C->C->C D 0.01030928
34 C->C->C E 0.01030928
35 C->C->D A 0.02061856
36 C->C->D B 0.01030928
37 C->C->D E 0.03092784
38 C->C->E B 0.01030928
39 C->D->A A 0.01030928
40 C->D->A B 0.02061856
41 C->D->A C 0.01030928
42 C->D->B B 0.01030928
43 C->D->D A 0.01030928
44 C->D->E C 0.02061856
45 C->D->E D 0.01030928
46 C->E->B A 0.01030928
47 C->E->C C 0.01030928
48 C->E->E A 0.01030928
49 D->A->A A 0.01030928
50 D->A->A C 0.01030928
51 D->A->A D 0.01030928
52 D->A->B B 0.01030928
53 D->A->B D 0.01030928
54 D->A->C C 0.01030928
55 D->A->D A 0.01030928
56 D->A->D D 0.01030928
57 D->A->D E 0.01030928
58 D->B->B A 0.01030928
59 D->C->D A 0.01030928
60 D->D->A A 0.01030928
61 D->D->D E 0.01030928
62 D->D->E A 0.01030928
63 D->D->E D 0.01030928
64 D->E->A A 0.01030928
65 D->E->C C 0.01030928
66 D->E->C D 0.01030928
67 D->E->C E 0.01030928
68 D->E->D C 0.01030928
69 D->E->D D 0.01030928
70 E->A->A A 0.01030928
71 E->A->E B 0.01030928
72 E->B->A A 0.01030928
73 E->B->A E 0.02061856
74 E->B->B A 0.01030928
75 E->C->C C 0.02061856
76 E->C->C D 0.02061856
77 E->C->D D 0.01030928
78 E->C->E C 0.01030928
79 E->C->E E 0.01030928
80 E->D->C D 0.01030928
81 E->D->D D 0.01030928
82 E->E->A E 0.01030928
83 E->E->C C 0.01030928
84 E->E->C E 0.01030928
4
-
看来 OP 需要的是条件概率,而不是总体概率。因此,每 5 行的百分比总和必须为 1。
– -
1@DarrenTsai 谢谢你的评论。现在我更新了我的解决方案
– -
ifelse()
编码不错!您可以通过以下操作摆脱烦恼(perc[is.na(perc)]=0)
。sum(na.omit(x))
在这里添加一些内容吗sum(x, na.rm=TRUE)
?
–
-
1@Friede 谢谢。
sum(na.omit(x))
和sum(x, na.rm=TRUE)
–
|
您可以通过将条目除以其行总和来transition_counts
将其转换为转换矩阵(为简单起见,我将其称为):A
A <- transition_counts / rowSums(transition_counts)
然后,2 步转移概率简单来说就是
A %*% A
to
from A B C D E
A 0.2435780 0.12229330 0.2404796 0.2137937 0.1798554
B 0.3478582 0.15229446 0.1709910 0.1581451 0.1707112
C 0.2169913 0.07974223 0.2398662 0.2642168 0.1991834
D 0.2565411 0.13608217 0.2385630 0.1738936 0.1949202
E 0.2256817 0.12838088 0.2548378 0.2386595 0.1524402
请注意,行和仍然为 1。然后,第 3 步
A %*% A %*% A
或者为了简化,我们可以使用expm
具有以下便捷%^%
功能的包:
library(expm)
A %^% 3
此函数允许您计算第 n 步。
A %^% 10
to
from A B C D E
A 0.2494011 0.1199961 0.2341925 0.2148659 0.1815444
B 0.2494024 0.1199966 0.2341917 0.2148651 0.1815442
C 0.2494006 0.1199957 0.2341927 0.2148664 0.1815446
D 0.2494013 0.1199962 0.2341924 0.2148656 0.1815445
E 0.2494010 0.1199962 0.2341926 0.2148660 0.1815442
以上是接近稳定状态,通过求解给出:
qr.solve(rbind(t(A) - diag(5), rep(1, 5)), c(rep(0,5), 1))
# A B C D E
# 0.2494012 0.1199961 0.2341924 0.2148659 0.1815444
5
-
谢谢爱德华!但我想你可能把这个问题搞得太复杂了?我对这个问题做了一些编辑,以显示最终答案应该是什么样的……也许你可以看一下?
– -
是的 – 你说得对!但是你想要的(所有可能转换的数据框)会很长。
– -
感谢您的指导和反馈…我猜如果 n 较小的话没问题?但是,一旦 n 变得太大,行数就会真正增加…
– -
你知道我该如何调整你的答案吗?
– -
抱歉 – 我误解了你的意思。谢谢!
–
|
假设您想要跟踪中间状态,则在数据框中组装滞后列,将状态粘贴在一起(使用tidyverse::unite
),然后调用table
应该可以工作:
library(dplyr) # for %>% operator
library(tidyverse) # for unite function
n = length(simulated_states)
data.frame(matrix(c(simulated_states[seq(1,n-2)],
simulated_states[seq(2,n-1)],
simulated_states[seq(3,n)]),
ncol=3)) %>%
{unite(data=.,col='d',names(.),sep='')} %>% table
作为函数:
require(tidyverse)
calculate_transition_probs = function(states, m){
n = length(states)
lagged_states = c()
for (i in 1:m){
lagged_states = c(lagged_states,
simulated_states[seq(i,n-m+i)])
}
return(data.frame(matrix(lagged_states,
ncol=m)) %>%
{unite(data=.,col='d',names(.),sep='')} %>%
table)
}
4
-
你好 njp,我尝试运行你的第一个代码块,但出现了这个错误:
– -
n – 2 中的错误:二元运算符的非数字参数
– -
抱歉,
n
您的向量的长度。已更新。
– -
谢谢…你能告诉我如何使用这个功能吗?
–
|
使用卷积 ( convolve
) 进行索引在这里可能性能最佳。使用data.table
:
library(data.table)
transprob <- function(x, steps) {
s1 <- steps + 1L
u <- sort(unique(x))
m <- match(x, u)
z <- length(u)^(0:steps)
dt <- setcolorder(do.call(CJ, rep(list(u), s1)), s1:1)
setnames(dt, c(paste0("from", 1:steps), "to"))
i <- round(convolve(m - 1L, z, 1, "o")[s1:length(x)] + 1)
a <- matrix(tabulate(i, nrow(dt)), z[s1])
dt[,`:=`(count = c(a), percent = c(a/rowSums(a)))]
}
演示1:
transprob(simulated_states, 1L)[]
#> Key: <to, from1>
#> from1 to count percent
#> <char> <char> <int> <num>
#> 1: A A 7 0.26923077
#> 2: B A 7 0.58333333
#> 3: C A 0 0.00000000
#> 4: D A 9 0.42857143
#> 5: E A 2 0.11111111
#> 6: A B 3 0.11538462
#> 7: B B 3 0.25000000
#> 8: C B 1 0.04545455
#> 9: D B 1 0.04761905
#> 10: E B 4 0.22222222
#> 11: A C 6 0.23076923
#> 12: B C 0 0.00000000
#> 13: C C 9 0.40909091
#> 14: D C 1 0.04761905
#> 15: E C 7 0.38888889
#> 16: A D 5 0.19230769
#> 17: B D 1 0.08333333
#> 18: C D 9 0.40909091
#> 19: D D 4 0.19047619
#> 20: E D 2 0.11111111
#> 21: A E 5 0.19230769
#> 22: B E 1 0.08333333
#> 23: C E 3 0.13636364
#> 24: D E 6 0.28571429
#> 25: E E 3 0.16666667
#> from1 to count percent
演示2:
transprob(simulated_states, 2L)[]
#> Key: <to, from2, from1>
#> from1 from2 to count percent
#> <char> <char> <char> <int> <num>
#> 1: A A A 2 0.2857143
#> 2: B A A 1 0.1428571
#> 3: C A A 0 NaN
#> 4: D A A 3 0.3333333
#> 5: E A A 1 0.5000000
#> ---
#> 121: A E E 2 0.4000000
#> 122: B E E 0 0.0000000
#> 123: C E E 1 0.3333333
#> 124: D E E 0 0.0000000
#> 125: E E E 0 0.0000000
演示3:
transprob(simulated_states, 3L)[]
#> Key: <to, from3, from2, from1>
#> from1 from2 from3 to count percent
#> <char> <char> <char> <char> <int> <num>
#> 1: A A A A 0 0.0000000
#> 2: B A A A 0 0.0000000
#> 3: C A A A 0 NaN
#> 4: D A A A 1 0.3333333
#> 5: E A A A 1 1.0000000
#> ---
#> 621: A E E E 0 0.0000000
#> 622: B E E E 0 NaN
#> 623: C E E E 0 0.0000000
#> 624: D E E E 0 NaN
#> 625: E E E E 0 NaN
|
mat
,它看起来很像你的转换矩阵(尽管你实际上并没有这么说)。这个矩阵告诉你从一个状态转换到另一个状态的概率。之后你模拟的内容对我来说并不清楚,而且(我认为)无关紧要。2 步转换矩阵只是mat %*% mat
,它给出了在 2 步之后从每个当前状态转到其他状态的概率。–
–
–
–
|