Common utility functions
Usage
change(df, cols, csm = NULL, digits = 5)
chg(x, n = 1L, fill_na = 0L)
pct(x, n = 1L, fill_na = 0L)
ror(df, col, n = 1L)
geomean(x)
years_df(df, date_col)
years_vec(date_col)
duration_vec(date_col)
make_interval(df, start, end = lubridate::today())
summary_stats(
df,
condition = NULL,
group_vars = NULL,
summary_vars = NULL,
arr = NULL,
digits = 3
)
gen_data(years)
Arguments
- df
data frame
- cols
numeric columns for which to calculate absolute/relative change and rate of return
- csm
numeric columns for which to calculate the cumulative sum; default is `NULL`
- digits
Number of digits to round to; default is 5 for `change()` and 3 for `summary_stats()`
- x
numeric vector
- n
number of positions to offset (lag) by; default is 1
- fill_na
fill value for any NAs; default is 0
- col
numeric column
- date_col
date column
- start
start date column
- end
end date column or date value; default is `lubridate::today()`
- condition
filter condition, e.g.
patient == "new"
- group_vars
variables to group by, e.g.
c(specialty, state, hcpcs, cost)
- summary_vars
variables to summarise, e.g.
c(min, max, mode, range)
- arr
column to arrange data by, e.g.
cost
- years
sequence of years, e.g.
2010:2020
Examples
# Example data
ex <- gen_data(2020:2025)
head(ex)
#> # A tibble: 6 × 3
#> year group pay
#> <int> <chr> <int>
#> 1 2020 A 1172
#> 2 2021 A 1406
#> 3 2022 A 1459
#> 4 2023 A 1062
#> 5 2024 A 1484
#> 6 2025 A 1869
# Lagged absolute/percentage change, rate of return and cumulative sum
# `change()`
dplyr::filter(ex, group == "A") |>
change(pay)
#> # A tibble: 6 × 6
#> year group pay pay_chg pay_pct pay_ror
#> <int> <chr> <int> <int> <dbl> <dbl>
#> 1 2020 A 1172 0 0 1
#> 2 2021 A 1406 234 0.200 1.20
#> 3 2022 A 1459 53 0.0377 1.04
#> 4 2023 A 1062 -397 -0.272 0.728
#> 5 2024 A 1484 422 0.397 1.40
#> 6 2025 A 1869 385 0.259 1.26
# `geomean()`: geometric mean
ex |>
dplyr::filter(group == "A") |>
ror(pay) |>
dplyr::summarise(gmean = geomean(pay_ror))
#> # A tibble: 1 × 1
#> gmean
#> <dbl>
#> 1 1.10
# When using `group_by()`, make sure
# the variables are in the correct order
ex |>
dplyr::group_by(group) |>
change(pay)
#> # A tibble: 12 × 6
#> # Groups: group [2]
#> year group pay pay_chg pay_pct pay_ror
#> <int> <chr> <int> <int> <dbl> <dbl>
#> 1 2020 A 1172 0 0 1
#> 2 2021 A 1406 234 0.200 1.20
#> 3 2022 A 1459 53 0.0377 1.04
#> 4 2023 A 1062 -397 -0.272 0.728
#> 5 2024 A 1484 422 0.397 1.40
#> 6 2025 A 1869 385 0.259 1.26
#> 7 2020 B 1878 0 0 1
#> 8 2021 B 1558 -320 -0.170 0.830
#> 9 2022 B 1926 368 0.236 1.24
#> 10 2023 B 1451 -475 -0.247 0.753
#> 11 2024 B 1200 -251 -0.173 0.827
#> 12 2025 B 1196 -4 -0.00333 0.997
ex |>
dplyr::group_by(group) |>
change(pay) |>
dplyr::summarise(mean_pay = mean(pay, na.rm = TRUE),
csm_chg = sum(pay_chg),
csm_pct = sum(pay_pct),
mean_ror = mean(pay_ror, na.rm = TRUE),
geomean = geomean(pay_ror))
#> # A tibble: 2 × 6
#> group mean_pay csm_chg csm_pct mean_ror geomean
#> <chr> <dbl> <int> <dbl> <dbl> <dbl>
#> 1 A 1409. 697 0.622 1.10 1.08
#> 2 B 1535. -682 -0.357 0.940 0.928
# Timespans
dt <- dplyr::tibble(date = lubridate::today() - 366)
# `years_df()`
years_df(dt, date)
#> # A tibble: 1 × 2
#> date years_passed
#> <date> <dbl>
#> 1 2023-11-18 1
# `duration_vec()`
dplyr::mutate(dt, dur = duration_vec(date))
#> # A tibble: 1 × 2
#> date dur
#> <date> <Duration>
#> 1 2023-11-18 -31622400s (~-1 years)
# `make_interval()`
dplyr::tibble(date = lubridate::today() - 1000) |>
make_interval(start = date, end = lubridate::today() - 500)
#> # A tibble: 1 × 4
#> date interval period timelength_days
#> <date> <Interval> <Period> <dbl>
#> 1 2022-02-22 2022-02-22 UTC--2023-07-07 UTC 1y 4m 15d 0H 0M 0S 500
# `summary_stats()`
sm <- dplyr::tibble(provider = sample(c("A", "B", "C"), size = 200, replace = TRUE),
city = sample(c("ATL", "NYC"), size = 200, replace = TRUE),
charges = sample(1000:2000, size = 200),
payment = sample(1000:2000, size = 200))
head(sm)
#> # A tibble: 6 × 4
#> provider city charges payment
#> <chr> <chr> <int> <int>
#> 1 A ATL 1523 1352
#> 2 C ATL 1956 1662
#> 3 C NYC 1396 1288
#> 4 A NYC 1749 1180
#> 5 B ATL 1356 1894
#> 6 B NYC 1327 1085
summary_stats(sm,
condition = city == "ATL",
group_vars = provider,
summary_vars = c(charges, payment),
arr = provider)
#> # A tibble: 3 × 8
#> provider charges_median charges_mean charges_sd payment_median payment_mean
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 C 1518 1493. 323. 1407 1441.
#> 2 B 1444. 1517. 310. 1499 1520.
#> 3 A 1456 1446 259. 1493 1511.
#> # ℹ 2 more variables: payment_sd <dbl>, n <int>
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
change(pay)
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
dplyr::mutate(change = chg(pay))
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
dplyr::mutate(pct_change = pct(pay))
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
ror(pay)
ex |>
dplyr::group_by(group) |>
ror(pay)
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
ror(pay) |>
dplyr::summarise(gmean = geomean(pay_ror))
ex |>
dplyr::group_by(group) |>
ror(pay) |>
dplyr::summarise(gmean = geomean(pay_ror))
}
if (FALSE) { # interactive()
dt <- dplyr::tibble(date = lubridate::today() - 366)
dplyr::mutate(dt, years = years_vec(date))
}
if (FALSE) { # interactive()
dplyr::tibble(date = lubridate::today() - 366,
date2 = date - 789) |>
dplyr::mutate(dur = duration_vec(date),
dur2 = duration_vec(date2))
}
if (FALSE) { # interactive()
dt <- dplyr::tibble(date = lubridate::today() - 366)
make_interval(dt, start = date)
}