rollup: A tidy implementation of GROUPING SETS, WITH ROLLUP, and WITH CUBE,
which are powerful extensions of the SQL GROUP BY clause that compute
multiple group-by clauses in a single statement. This package operates on
top of dplyr and performs the same functions as their SQL counterparts.
library(dplyr)
#>
#> 다음의 패키지를 부착합니다: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(rollup)
#> 필요한 패키지를 로딩중입니다: tidyr
#>
#> 다음의 패키지를 부착합니다: 'rollup'
#> The following objects are masked from 'package:dplyr':
#>
#> summarise, summarize
data("web_service_data")
web_service_data %>% head
#> # A tibble: 6 × 6
#> date_id id gender age page_view_cnt product_view_cnt_cat
#> <chr> <dbl> <chr> <fct> <dbl> <fct>
#> 1 2024-06-24 19 M 40 0 60%
#> 2 2024-06-24 34 M 40 5 70%
#> 3 2024-06-24 44 F 50 12 100%
#> 4 2024-06-24 57 M 60 87 20%
#> 5 2024-06-24 65 F 50 1 100%
#> 6 2024-06-24 86 F 40 3 90%
- `grouping_sets(a)` is equivalent to the single grouping set operation
  `group_by(a)`.
- `grouping_sets(a, b)` is equivalent to row-binding the results of
  `group_by(a)` and `group_by(b)`.

library(tidyr)
# avg_pv_cnt group by (gender, age, (gender, age))
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age) %>% grouping_sets('gender', 'age', c('gender','age')) %>%
summarize(avg_pv_cnt = mean(page_view_cnt))
#> # A tibble: 20 × 3
#> gender age avg_pv_cnt
#> <chr> <fct> <dbl>
#> 1 F <NA> 2.28
#> 2 M <NA> 1.92
#> 3 <NA> 10 1.61
#> 4 <NA> 20 3.01
#> 5 <NA> 30 2.23
#> 6 <NA> 40 1.77
#> 7 <NA> 50 1.44
#> 8 <NA> 60 2.30
#> 9 F 10 2.33
#> 10 F 20 2.86
#> 11 F 30 2.67
#> 12 F 40 2.33
#> 13 F 50 2.24
#> 14 F 60 1.48
#> 15 M 10 0.92
#> 16 M 20 3.19
#> 17 M 30 1.91
#> 18 M 40 1.31
#> 19 M 50 0.907
#> 20 M 60 2.99
# avg_pv_cnt group by ((gender, age, product_view_cnt_cat), product_view_cnt_cat)
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age, product_view_cnt_cat) %>% grouping_sets('product_view_cnt_cat', c('product_view_cnt_cat', 'gender','age')) %>%
summarize(avg_pv_cnt = mean(page_view_cnt)) %>% pivot_wider(names_from = product_view_cnt_cat, values_from = avg_pv_cnt)
#> # A tibble: 13 × 11
#> gender age X `20%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
#> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 <NA> <NA> 1.46 1.84 2.02 2.31 2.72 2.89 2.8 3.79 2.82
#> 2 F 10 1.4 2 1.4 2.67 4 NA NA 4 NA
#> 3 F 20 0 3.5 2.08 2.29 3.83 2.57 3.45 4.83 2.25
#> 4 F 30 0.833 2.5 4.5 2.88 3 1.75 3.5 3 3.17
#> 5 F 40 1.33 1.9 2.7 2.2 1.22 3 3.38 4 2
#> 6 F 50 0.462 1.5 2 2.5 1.2 4 2.5 5.33 3.5
#> 7 F 60 1.19 1.71 1 1.33 3 3 1.5 2 3
#> 8 M 10 0.375 0.833 1.14 3 1 0 NA NA NA
#> 9 M 20 1.14 3.17 3.16 3.55 4.5 3 NA 3.5 7
#> 10 M 30 0.824 1.62 1.31 2.7 3.38 2.5 1.86 3.5 NA
#> 11 M 40 0.889 0.933 2.06 0.833 1.88 3.25 1.6 1.67 NA
#> 12 M 50 0.562 1.07 1.06 2.6 2 0 0.5 0 NA
#> 13 M 60 3.06 2.69 4 3.5 0 8 2 1 NA
- `with_cube()` adds every possible combination of the grouping variables
  as grouping sets.
- `with_cube()` makes it easy to add row and column sub-totals to a cross
  table.
- `group_by(a, b, c)` followed by `with_cube()` is equivalent to
  `grouping_sets((a,b,c), (a,b), (a,c), (b,c), a, b, c, NA)`.

# add a sub-total row and column to a two-way cross table using with_cube()
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age) %>% with_cube() %>%
summarize(avg_pv_cnt = mean(page_view_cnt)) %>% pivot_wider(names_from = age, values_from = avg_pv_cnt)
#> # A tibble: 3 × 8
#> gender `NA` `10` `20` `30` `40` `50` `60`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F 2.28 2.33 2.86 2.67 2.33 2.24 1.48
#> 2 M 1.92 0.92 3.19 1.91 1.31 0.907 2.99
#> 3 <NA> 2.08 1.61 3.01 2.23 1.77 1.44 2.30
# with_cube is equivalent to grouping_sets with all possible combinations
web_service_data %>% filter(date_id == '2024-06-30' & gender != "N") %>%
group_by(gender, age) %>% grouping_sets("gender","age",c("gender","age"), NA) %>%
summarize(avg_pv_cnt = mean(page_view_cnt)) %>% pivot_wider(names_from = age, values_from = avg_pv_cnt)
#> # A tibble: 3 × 8
#> gender `NA` `10` `20` `30` `40` `50` `60`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 F 2.28 2.33 2.86 2.67 2.33 2.24 1.48
#> 2 M 1.92 0.92 3.19 1.91 1.31 0.907 2.99
#> 3 <NA> 2.08 1.61 3.01 2.23 1.77 1.44 2.30
- `with_rollup()` is an extension of the `group_by` clause that produces
  sub-total rows alongside the grouped rows.
- `group_by(a, b)` followed by `with_rollup()` is equivalent to
  `grouping_sets((a,b), a, NA)`.
- `group_by(a, b, c)` followed by `with_rollup()` is equivalent to
  `grouping_sets((a,b,c), (a,b), (a), NA)`.

web_service_data %>%
group_by(date_id) %>% with_rollup() %>%
summarize(user_cnt = n_distinct(if_else(page_view_cnt > 0, id, NA)))
#> # A tibble: 31 × 2
#> date_id user_cnt
#> <chr> <int>
#> 1 2024-06-01 644
#> 2 2024-06-02 615
#> 3 2024-06-03 700
#> 4 2024-06-04 710
#> 5 2024-06-05 706
#> 6 2024-06-06 637
#> 7 2024-06-07 694
#> 8 2024-06-08 642
#> 9 2024-06-09 622
#> 10 2024-06-10 706
#> # ℹ 21 more rows
# with_rollup is equivalent to grouping_sets over the hierarchy of grouping
# variables, from the most detailed level down to the grand total (NA)
web_service_data %>%
group_by(date_id) %>% grouping_sets("date_id", NA) %>%
summarize(user_cnt = n_distinct(if_else(page_view_cnt > 0, id, NA)))
#> # A tibble: 31 × 2
#> date_id user_cnt
#> <chr> <int>
#> 1 2024-06-01 644
#> 2 2024-06-02 615
#> 3 2024-06-03 700
#> 4 2024-06-04 710
#> 5 2024-06-05 706
#> 6 2024-06-06 637
#> 7 2024-06-07 694
#> 8 2024-06-08 642
#> 9 2024-06-09 622
#> 10 2024-06-10 706
#> # ℹ 21 more rows