Skip to contents

Common utility functions

Usage

change(df, cols, csm = NULL, digits = 5)

chg(x, n = 1L, fill_na = 0L)

pct(x, n = 1L, fill_na = 0L)

ror(df, col, n = 1L)

geomean(x)

years_df(df, date_col)

years_vec(date_col)

duration_vec(date_col)

make_interval(df, start, end = lubridate::today())

summary_stats(
  df,
  condition = NULL,
  group_vars = NULL,
  summary_vars = NULL,
  arr = NULL,
  digits = 3
)

gen_data(years)

Arguments

df

data frame

cols

numeric columns for which to calculate absolute/relative change and rate of return

csm

numeric cols to calculate cumulative sum for

digits

Number of digits to round to; default is 5 for change() and 3 for summary_stats()

x

numeric vector

n

number of positions to offset (lag) by; default is 1

fill_na

fill value for any NAs; default is 0

col

numeric column

date_col

date column

start

start date column

end

end date or end date column; default is today's date, lubridate::today()

condition

filter condition, e.g. patient == "new"

group_vars

variables to group by, e.g. c(specialty, state, hcpcs, cost)

summary_vars

variables to summarise, e.g. c(min, max, mode, range)

arr

column to arrange data by, e.g. cost

years

sequence of years, e.g. 2010:2020

Value

tibble() or vector

tibble

Examples

# Example data
ex <- gen_data(2020:2025)
head(ex)
#> # A tibble: 6 × 3
#>    year group   pay
#>   <int> <chr> <int>
#> 1  2020 A      1172
#> 2  2021 A      1406
#> 3  2022 A      1459
#> 4  2023 A      1062
#> 5  2024 A      1484
#> 6  2025 A      1869

# Lagged absolute/percentage change, rate of return and cumulative sum
# `change()`
dplyr::filter(ex, group == "A") |>
change(pay)
#> # A tibble: 6 × 6
#>    year group   pay pay_chg pay_pct pay_ror
#>   <int> <chr> <int>   <int>   <dbl>   <dbl>
#> 1  2020 A      1172       0  0        1    
#> 2  2021 A      1406     234  0.200    1.20 
#> 3  2022 A      1459      53  0.0377   1.04 
#> 4  2023 A      1062    -397 -0.272    0.728
#> 5  2024 A      1484     422  0.397    1.40 
#> 6  2025 A      1869     385  0.259    1.26 

# `geomean()`: geometric mean of the rates of return
ex |>
dplyr::filter(group == "A") |>
ror(pay) |>
dplyr::summarise(gmean = geomean(pay_ror))
#> # A tibble: 1 × 1
#>   gmean
#>   <dbl>
#> 1  1.10

# When performing a `group_by()`, watch for
# the correct order of the variables
ex |>
dplyr::group_by(group) |>
change(pay)
#> # A tibble: 12 × 6
#> # Groups:   group [2]
#>     year group   pay pay_chg  pay_pct pay_ror
#>    <int> <chr> <int>   <int>    <dbl>   <dbl>
#>  1  2020 A      1172       0  0         1    
#>  2  2021 A      1406     234  0.200     1.20 
#>  3  2022 A      1459      53  0.0377    1.04 
#>  4  2023 A      1062    -397 -0.272     0.728
#>  5  2024 A      1484     422  0.397     1.40 
#>  6  2025 A      1869     385  0.259     1.26 
#>  7  2020 B      1878       0  0         1    
#>  8  2021 B      1558    -320 -0.170     0.830
#>  9  2022 B      1926     368  0.236     1.24 
#> 10  2023 B      1451    -475 -0.247     0.753
#> 11  2024 B      1200    -251 -0.173     0.827
#> 12  2025 B      1196      -4 -0.00333   0.997

ex |>
dplyr::group_by(group) |>
change(pay) |>
dplyr::summarise(mean_pay = mean(pay, na.rm = TRUE),
                 csm_chg  = sum(pay_chg),
                 csm_pct  = sum(pay_pct),
                 mean_ror = mean(pay_ror, na.rm = TRUE),
                 geomean  = geomean(pay_ror))
#> # A tibble: 2 × 6
#>   group mean_pay csm_chg csm_pct mean_ror geomean
#>   <chr>    <dbl>   <int>   <dbl>    <dbl>   <dbl>
#> 1 A        1409.     697   0.622    1.10    1.08 
#> 2 B        1535.    -682  -0.357    0.940   0.928

# Timespans
dt <- dplyr::tibble(date = lubridate::today() - 366)

# `years_df()`
years_df(dt, date)
#> # A tibble: 1 × 2
#>   date       years_passed
#>   <date>            <dbl>
#> 1 2023-11-18            1

# `duration_vec()`
dplyr::mutate(dt, dur = duration_vec(date))
#> # A tibble: 1 × 2
#>   date       dur                   
#>   <date>     <Duration>            
#> 1 2023-11-18 -31622400s (~-1 years)

# `make_interval()`
dplyr::tibble(date = lubridate::today() - 1000) |>
make_interval(start = date, end = lubridate::today() - 500)
#> # A tibble: 1 × 4
#>   date       interval                       period             timelength_days
#>   <date>     <Interval>                     <Period>                     <dbl>
#> 1 2022-02-22 2022-02-22 UTC--2023-07-07 UTC 1y 4m 15d 0H 0M 0S             500


# `summary_stats()`
sm <- dplyr::tibble(provider = sample(c("A", "B", "C"), size = 200, replace = TRUE),
                    city = sample(c("ATL", "NYC"), size = 200, replace = TRUE),
                    charges = sample(1000:2000, size = 200),
                    payment = sample(1000:2000, size = 200))

head(sm)
#> # A tibble: 6 × 4
#>   provider city  charges payment
#>   <chr>    <chr>   <int>   <int>
#> 1 A        ATL      1523    1352
#> 2 C        ATL      1956    1662
#> 3 C        NYC      1396    1288
#> 4 A        NYC      1749    1180
#> 5 B        ATL      1356    1894
#> 6 B        NYC      1327    1085

summary_stats(sm,
              condition    = city == "ATL",
              group_vars   = provider,
              summary_vars = c(charges, payment),
              arr          = provider)
#> # A tibble: 3 × 8
#>   provider charges_median charges_mean charges_sd payment_median payment_mean
#>   <chr>             <dbl>        <dbl>      <dbl>          <dbl>        <dbl>
#> 1 C                 1518         1493.       323.           1407        1441.
#> 2 B                 1444.        1517.       310.           1499        1520.
#> 3 A                 1456         1446        259.           1493        1511.
#> # ℹ 2 more variables: payment_sd <dbl>, n <int>

if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
change(pay)
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
dplyr::mutate(change = chg(pay))
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
dplyr::mutate(pct_change = pct(pay))
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
ror(pay)

ex |>
dplyr::group_by(group) |>
ror(pay)
}
if (FALSE) { # interactive()
dplyr::filter(ex, group == "A") |>
ror(pay) |>
dplyr::summarise(gmean = geomean(pay_ror))

ex |>
dplyr::group_by(group) |>
ror(pay) |>
dplyr::summarise(gmean = geomean(pay_ror))
}
if (FALSE) { # interactive()
dt <- dplyr::tibble(date = lubridate::today() - 366)
dplyr::mutate(dt, years = years_vec(date))
}
if (FALSE) { # interactive()
dplyr::tibble(date = lubridate::today() - 366,
              date2 = date - 789) |>
dplyr::mutate(dur = duration_vec(date),
              dur2 = duration_vec(date2))
}
if (FALSE) { # interactive()
dt <- dplyr::tibble(date = lubridate::today() - 366)
make_interval(dt, start = date)
}