我有一个称为transactions代表购物车的数据集,我已将其转换为以下格式:

    member  Date        V1
1   1000    15-03-2015  sausage,whole milk,semi-finished bread,yogurt   
2   1000    24-06-2014  whole milk,pastry,salty snack   
3   1000    24-07-2015  canned beer,misc. beverages 
4   1001    25-11-2015  sausage,hygiene articles    
5   1001    27-05-2015  soda,pickled vegetables 
6   1001    02-05-2015  frankfurter,curd    

我需要的是看起来像规范稀疏矩阵购物车数据的东西:

cart sausage whole milk bread yogurt frankfurter #many more cols
1    TRUE    TRUE       TRUE  TRUE   FALSE

经过几个小时的努力,我目前正在以一种非常非 R 的方式来做这件事。我的数据框被调用transactions,并以上面第一个代码块中所示的格式包含我的所有“购物事件”。

ll <- unique(unlist(strsplit(paste0(transactions$V1, collapse=","), ',')))

txn_df <- data.frame()
txn_df[c(ll, "cart")] <- list(character(0))

build_carts <- function(row){
  xs <- sapply(strsplit(row$V1, ","), trimws)  # first `strsplit` by comma, then trim whitespace
  tmp <- data.frame(matrix(nrow=1, ncol = length(txn_df))) #new dataframe
  names(tmp) <- names(txn_df) #copy columns
  tmp$cart <- paste(row$Date, row$member, sep="_") #make a new cart ID
  
  #set present items to TRUE
  for (i in 1:length(xs)) {
    tmp[,which(colnames(tmp)==xs[i])] = TRUE
  }
  
  tmp <- replace(tmp, is.na(tmp), FALSE) # all other items false
  txn_df <<- rbind(txn_df, tmp) #copy to parent DF
}

res <- by(transactions, seq_len(nrow(transactions)), build_carts)

这可行,但正如您想象的那样,非常非常慢。有没有办法在不深入 tidyverse 的情况下加快速度?例如,如果代码对于 tidyverse 菜鸟来说至少可以部分清晰,那就太好了(出于教学目的)。

1

  • 似乎很合适,并且保持都是稀疏的(“分割流派字符串并为流派的存在创建二进制矩阵”之后的两行)


    – 


5 个回答
5

df %>%
   separate_longer_delim(V1, delim = regex(', *'))%>%
   mutate(value = TRUE)%>%
   pivot_wider(names_from = V1, values_fill = FALSE)

# A tibble: 6 × 15
  member Date       sausage `whole milk` `semi-finished bread` yogurt pastry `salty snack` `canned beer`
   <int> <chr>      <lgl>   <lgl>        <lgl>                 <lgl>  <lgl>  <lgl>         <lgl>        
1   1000 15-03-2015 TRUE    TRUE         TRUE                  TRUE   FALSE  FALSE         FALSE        
2   1000 24-06-2014 FALSE   TRUE         FALSE                 FALSE  TRUE   TRUE          FALSE        
3   1000 24-07-2015 FALSE   FALSE        FALSE                 FALSE  FALSE  FALSE         TRUE         
4   1001 25-11-2015 TRUE    FALSE        FALSE                 FALSE  FALSE  FALSE         FALSE        
5   1001 27-05-2015 FALSE   FALSE        FALSE                 FALSE  FALSE  FALSE         FALSE        
6   1001 02-05-2015 FALSE   FALSE        FALSE                 FALSE  FALSE  FALSE         FALSE        
# ℹ 6 more variables: `misc. beverages` <lgl>, `hygiene articles` <lgl>, soda <lgl>, `pickled vegetables` <lgl>,
#   frankfurter <lgl>, curd <lgl>

在基地R

a <- strsplit(df$V1, ', *')
b <- unique(unlist(a))
t(sapply(a, \(x) setNames(b%in% x, b)))

     sausage whole milk semi-finished bread yogurt pastry salty snack canned beer misc. beverages hygiene articles  soda pickled vegetables frankfurter  curd
[1,]    TRUE       TRUE                TRUE   TRUE  FALSE       FALSE       FALSE           FALSE            FALSE FALSE              FALSE       FALSE FALSE
[2,]   FALSE       TRUE               FALSE  FALSE   TRUE        TRUE       FALSE           FALSE            FALSE FALSE              FALSE       FALSE FALSE
[3,]   FALSE      FALSE               FALSE  FALSE  FALSE       FALSE        TRUE            TRUE            FALSE FALSE              FALSE       FALSE FALSE
[4,]    TRUE      FALSE               FALSE  FALSE  FALSE       FALSE       FALSE           FALSE             TRUE FALSE              FALSE       FALSE FALSE
[5,]   FALSE      FALSE               FALSE  FALSE  FALSE       FALSE       FALSE           FALSE            FALSE  TRUE               TRUE       FALSE FALSE
[6,]   FALSE      FALSE               FALSE  FALSE  FALSE       FALSE       FALSE           FALSE            FALSE FALSE              FALSE        TRUE  TRUE

你可以做任何一个

> u <- sort(unlist(s <- strsplit(x, ',')))
> (m1 <- lapply(s, match, u) |> sapply(tabulate, length(u)) |> t() |> 
+   `colnames<-`(u) |> `mode<-`('logical'))
     canned beer  curd frankfurter hygiene articles misc. beverages pastry
[1,]       FALSE FALSE       FALSE            FALSE           FALSE  FALSE
[2,]       FALSE FALSE       FALSE            FALSE           FALSE   TRUE
[3,]        TRUE FALSE       FALSE            FALSE            TRUE  FALSE
[4,]       FALSE FALSE       FALSE             TRUE           FALSE  FALSE
[5,]       FALSE FALSE       FALSE            FALSE           FALSE  FALSE
[6,]       FALSE  TRUE        TRUE            FALSE           FALSE  FALSE
     pickled vegetables salty snack sausage sausage semi-finished bread  soda
[1,]              FALSE       FALSE    TRUE   FALSE                TRUE FALSE
[2,]              FALSE        TRUE   FALSE   FALSE               FALSE FALSE
[3,]              FALSE       FALSE   FALSE   FALSE               FALSE FALSE
[4,]              FALSE       FALSE    TRUE   FALSE               FALSE FALSE
[5,]               TRUE       FALSE   FALSE   FALSE               FALSE  TRUE
[6,]              FALSE       FALSE   FALSE   FALSE               FALSE FALSE
     whole milk whole milk yogurt
[1,]       TRUE      FALSE   TRUE
[2,]       TRUE      FALSE  FALSE
[3,]      FALSE      FALSE  FALSE
[4,]      FALSE      FALSE  FALSE
[5,]      FALSE      FALSE  FALSE
[6,]      FALSE      FALSE  FALSE

或者

> (m2 <- lapply(s, match, u) |> sapply(tabulate, length(u)) |> t() |> 
+   `colnames<-`(u))
     canned beer curd frankfurter hygiene articles misc. beverages pastry
[1,]           0    0           0                0               0      0
[2,]           0    0           0                0               0      1
[3,]           1    0           0                0               1      0
[4,]           0    0           0                1               0      0
[5,]           0    0           0                0               0      0
[6,]           0    1           1                0               0      0
     pickled vegetables salty snack sausage sausage semi-finished bread soda
[1,]                  0           0       1       0                   1    0
[2,]                  0           1       0       0                   0    0
[3,]                  0           0       0       0                   0    0
[4,]                  0           0       1       0                   0    0
[5,]                  1           0       0       0                   0    1
[6,]                  0           0       0       0                   0    0
     whole milk whole milk yogurt
[1,]          1          0      1
[2,]          1          0      0
[3,]          0          0      0
[4,]          0          0      0
[5,]          0          0      0
[6,]          0          0      0

它给你一个布尔或二进制矩阵。

为了类似于您的by结果,该矩阵的增量显示,您可以这样做

lapply(seq_len(nrow(m2)), \(i) m[seq_len(i), ])

对于实际的“稀疏矩阵”,具有内存、存储、计算优势,有CRAN 推荐的Matrix软件包

> library(Matrix)
> (sm1 <- as(m1, "sparseMatrix"))
6 x 15 sparse Matrix of class "lgCMatrix"
  [[ suppressing 15 column names ‘canned beer’, ‘curd’, ‘frankfurter’ ... ]]
                                  
[1,] . . . . . . . . | . | . | . |
[2,] . . . . . | . | . . . . | . .
[3,] | . . . | . . . . . . . . . .
[4,] . . . | . . . . | . . . . . .
[5,] . . . . . . | . . . . | . . .
[6,] . | | . . . . . . . . . . . .
> (sm2 <- as(m2, "sparseMatrix"))
6 x 15 sparse Matrix of class "dgCMatrix"
  [[ suppressing 15 column names ‘canned beer’, ‘curd’, ‘frankfurter’ ... ]]
                                  
[1,] . . . . . . . . 1 . 1 . 1 . 1
[2,] . . . . . 1 . 1 . . . . 1 . .
[3,] 1 . . . 1 . . . . . . . . . .
[4,] . . . 1 . . . . 1 . . . . . .
[5,] . . . . . . 1 . . . . 1 . . .
[6,] . 1 1 . . . . . . . . . . . .

该消息指的是未打印的列名称;他们仍然存在:

> sm2[1, 1]
canned beer 
          0 

数据:

> dput(dat)
structure(list(member = c(1000L, 1000L, 1000L, 1001L, 1001L, 
1001L), Date = c("15-03-2015", "24-06-2014", "24-07-2015", "25-11-2015", 
"27-05-2015", "02-05-2015"), V1 = c("sausage,whole milk,semi-finished bread,yogurt", 
"whole milk,pastry,salty snack", "canned beer,misc. beverages", 
"sausage,hygiene articles", "soda,pickled vegetables", "frankfurter,curd"
)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", 
"6"))

因为你用问题标题打败了我,所以让我们创建一个稀疏矩阵,而不是 先创建一个密集矩阵。

首先我们使用 data.table 包重塑数据:

library(data.table)
DT <- fread('    member  Date        V1
1   1000    15-03-2015  "sausage,whole milk,semi-finished bread,yogurt"   
2   1000    24-06-2014  "whole milk,pastry,salty snack"   
3   1000    24-07-2015  "canned beer,misc. beverages" 
4   1001    25-11-2015  "sausage,hygiene articles"    
5   1001    27-05-2015  "soda,pickled vegetables" 
6   1001    02-05-2015  "frankfurter,curd"  ', skip = 1, header = FALSE, sep = " ")
setnames(DT, c("rn", "member", "Date", "food"))
DT <- cbind(DT, DT[, tstrsplit(food, ",", fixed = TRUE)])
DT[, food := NULL]
DT <- melt(DT, id.vars = 1:3, na.rm = TRUE)
DT[, cart := interaction(member, Date, drop = TRUE)]
DT[, value := factor(value)]
#        rn member       Date variable               value            cart
#     <int>  <int>     <char>   <fctr>              <fctr>          <fctr>
#  1:     1   1000 15-03-2015       V1             sausage 1000.15-03-2015
#  2:     2   1000 24-06-2014       V1          whole milk 1000.24-06-2014
#  3:     3   1000 24-07-2015       V1         canned beer 1000.24-07-2015
#  4:     4   1001 25-11-2015       V1             sausage 1001.25-11-2015
#  5:     5   1001 27-05-2015       V1                soda 1001.27-05-2015
#  6:     6   1001 02-05-2015       V1         frankfurter 1001.02-05-2015
#  7:     1   1000 15-03-2015       V2          whole milk 1000.15-03-2015
#  8:     2   1000 24-06-2014       V2              pastry 1000.24-06-2014
#  9:     3   1000 24-07-2015       V2     misc. beverages 1000.24-07-2015
# 10:     4   1001 25-11-2015       V2    hygiene articles 1001.25-11-2015
# 11:     5   1001 27-05-2015       V2  pickled vegetables 1001.27-05-2015
# 12:     6   1001 02-05-2015       V2                curd 1001.02-05-2015
# 13:     1   1000 15-03-2015       V3 semi-finished bread 1000.15-03-2015
# 14:     2   1000 24-06-2014       V3         salty snack 1000.24-06-2014
# 15:     1   1000 15-03-2015       V4              yogurt 1000.15-03-2015

然后我们使用sparse.model.matrix一些巧妙的矩阵乘法。这个想法来自 Matrix.utils 包,不幸的是它已

library(Matrix)
tmp <- DT[, {
  temp <- sparse.model.matrix(~ value - 1, data = .SD) 
  one <- as(matrix(1L, ncol = nrow(temp), nrow = 1L), 
            "sparseMatrix")
  list(list(one %*% temp))
  }, by = cart]

M <- do.call(rbind, tmp[["V1"]])
dimnames(M)[[1]] <- tmp[["cart"]]
dimnames(M)[[2]] <- gsub("^value", "", dimnames(M)[[2]])
M <- as(M, "lMatrix")
print(M, col.names = TRUE)   
#6 x 13 sparse Matrix of class "lgCMatrix"
#                canned beer curd frankfurter hygiene articles misc. beverages pastry pickled vegetables salty snack sausage semi-finished bread soda
#1000.15-03-2015           .    .           .                .               .      .                  .           .       |                   |    .
#1000.24-06-2014           .    .           .                .               .      |                  .           |       .                   .    .
#1000.24-07-2015           |    .           .                .               |      .                  .           .       .                   .    .
#1001.25-11-2015           .    .           .                |               .      .                  .           .       |                   .    .
#1001.27-05-2015           .    .           .                .               .      .                  |           .       .                   .    |
#1001.02-05-2015           .    |           |                .               .      .                  .           .       .                   .    .
#                whole milk yogurt
#1000.15-03-2015          |      |
#1000.24-06-2014          |      .
#1000.24-07-2015          .      .
#1001.25-11-2015          .      .
#1001.27-05-2015          .      .
#1001.02-05-2015          .      .

如果再多努力一点,第二步就可以进一步改进。by不需要使用 data.table 。也许您可以从 Matrix.utils 代码库复制相关函数。

这是一个以 R 为基数的函数model.matrix。这个函数比问题的更简单。

f <- function(X) {
  y <- strsplit(X[["V1"]], ",") |> lapply(trimws)
  member <- rep(X[["member"]], lengths(y))
  Date <- rep(X[["Date"]], lengths(y))
  out <- data.frame(member, Date, V1 = unlist(y))
  mm <- model.matrix(~ 0 + V1, out)
  colnames(mm) <- sub("V1", "", colnames(mm))
  cbind(out[1:2], mm) |>
    aggregate(. ~ member + Date, data = _, sum)
}  

f(dat)
#>   member       Date canned beer curd frankfurter hygiene articles
#> 1   1001 02-05-2015           0    1           1                0
#> 2   1000 15-03-2015           0    0           0                0
#> 3   1000 24-06-2014           0    0           0                0
#> 4   1000 24-07-2015           1    0           0                0
#> 5   1001 25-11-2015           0    0           0                1
#> 6   1001 27-05-2015           0    0           0                0
#>   misc. beverages pastry pickled vegetables salty snack sausage
#> 1               0      0                  0           0       0
#> 2               0      0                  0           0       1
#> 3               0      1                  0           1       0
#> 4               1      0                  0           0       0
#> 5               0      0                  0           0       1
#> 6               0      0                  1           0       0
#>   semi-finished bread soda whole milk yogurt
#> 1                   0    0          0      0
#> 2                   1    0          1      1
#> 3                   0    0          1      0
#> 4                   0    0          0      0
#> 5                   0    0          0      0
#> 6                   0    1          0      0

创建于 2024-03-26,使用

1

  • 您需要“聚合”结果。每个成员和日期只能有一行。


    – 

这是另一种data.table方法:

首先根据 V1 值(逗号分隔)分割行,然后转换为宽,这会自动使用唯一的 V1 值作为列名。

样本数据

library(data.table)
DT <- fread('member  Date        V1
 1000    15-03-2015  "sausage,whole milk,semi-finished bread,yogurt"   
 1000    24-06-2014  "whole milk,pastry,salty snack"   
 1000    24-07-2015  "canned beer,misc. beverages" 
 1001    25-11-2015  "sausage,hygiene articles"    
 1001    27-05-2015  "soda,pickled vegetables" 
 1001    02-05-2015  "frankfurter,curd"  ', header = TRUE, sep = " ")

代码

dcast(
  DT[, .(item = unlist(tstrsplit(V1, ","))), by = .(member, Date)],
  member + Date ~ item, fun.aggregate = length)

输出

Key: <member, Date>
   member       Date canned beer  curd frankfurter hygiene articles misc. beverages pastry pickled vegetables salty snack
    <int>     <char>       <int> <int>       <int>            <int>           <int>  <int>              <int>       <int>
1:   1000 15-03-2015           0     0           0                0               0      0                  0           0
2:   1000 24-06-2014           0     0           0                0               0      1                  0           1
3:   1000 24-07-2015           1     0           0                0               1      0                  0           0
4:   1001 02-05-2015           0     1           1                0               0      0                  0           0
5:   1001 25-11-2015           0     0           0                1               0      0                  0           0
6:   1001 27-05-2015           0     0           0                0               0      0                  1           0
   sausage semi-finished bread  soda whole milk yogurt
     <int>               <int> <int>      <int>  <int>
1:       1                   1     0          1      1
2:       0                   0     0          1      0
3:       0                   0     0          0      0
4:       0                   0     0          0      0
5:       1                   0     0          0      0
6:       0                   0     1          0      0