Tidyverse

dplyr

  • Tidy Selectors
    • Operators
      • : for selecting a range of consecutive variables.
      • ! for taking the complement of a set of variables.
      • & and | for selecting the intersection or the union of two sets of variables.
      • c() for combining selections.
    • Select specific columns:
      • everything(): Matches all variables.
      • last_col(): Select last variable, possibly with an offset.
      • group_cols(): Select all grouping columns.
    • Pattern Matching:
      • starts_with(): Starts with a prefix.
      • ends_with(): Ends with a suffix.
      • contains(): Contains a literal string.
      • matches(): Matches a regular expression.
      • num_range(): Matches a numerical range like x01, x02, x03.
    • Variables stored in a character vector:
      • all_of(): Matches variable names in a character vector. All names must be present, otherwise an out-of-bounds error is thrown.
        • e.g. If you’ve defined a character vector of quoted names, vars <- c("Sepal.Length", "Sepal.Width"), then select(iris, all_of(vars)) selects those columns.
      • any_of(): Same as all_of(), except that no error is thrown for names that don’t exist.
    • Using a predicate function:
      • where(): Applies a function to all variables and selects those for which the function returns TRUE.
  • slice
    • Notes from Row relational operations with slice()

      • In-depth look at slice functionality. It has other examples that I haven’t listed here, e.g. interleaving rows and inserting a row at specific intervals.
    • Some Operations

      starwars |> slice(1:6) # First six rows
      starwars |> slice(5, 1, 6) # specific indices, keeps order
      starwars |> slice(n() - 2:0) # Last three rows
      starwars |> slice(-(4:n()))  # All rows except fourth row to last row
    • tidyeval: starwars |> slice(!!!list(1, 2:4, 5, 6))

  • arrange
    • By group

      gapminder_df |> 
        group_by(year) |> 
        arrange(gdpPercap, .by_group = TRUE)
  • across
    • Basic

      darl_dat_proc <- darl_dat_raw |> 
        mutate(across(where(is.numeric), scale))
    • Format column names

      iris %>%
        group_by(Species) %>%
        summarise(across(starts_with("Sepal"), 
                         mean, 
                         .names = "mean_{.col}"))
    • Multiple Functions

      # Using a named list in functions arg
      iris %>%
        group_by(Species) %>%
        summarise(across(starts_with("Sepal"), 
                         list(mean = mean, sd = sd), 
                         .names = "{.col}.{.fn}"))
      
      # When the list is not named, .fn is replaced by the function's position
      iris %>%
        group_by(Species) %>%
        summarise(across(starts_with("Sepal"), 
                         list(mean, sd), 
                         .names = "{.col}.fn{.fn}"))

tidyr

  • uncount

    uncount_df <- tibble(x = c("a", "b"), n = c(1, 2))
    uncount_df |> uncount(n)
    #>  # A tibble: 3 × 1
    #>    x    
    #>    <chr>
    #>  1 a    
    #>  2 b    
    #>  3 b

purrr

  • Notes from Mastering purrr: From Basic Maps to Functional Magic in R

  • imap - Useful to include the index or names of elements in your function calls.

    # A named list of scores
    named_scores <- list(math = 90, science = 85, history = 78)
    
    # Create descriptive strings for each score
    score_descriptions <- 
      imap(
        named_scores, 
        ~ paste(.y, "score is", .x)
      )
    score_descriptions
    
    $math
    [1] "math score is 90"
    
    $science
    [1] "science score is 85"
    
    $history
    [1] "history score is 78"
  • map_if

    mixed_list <- list(1, "a", 3, "b", 5)
    doubled_numbers <- 
      map_if(
        mixed_list, 
        is.numeric, 
        ~ .x * 2
      )
  • map_at

    specific_list <- list(a = 1, b = "hello", c = 3, d = "world")
    # Convert only the character elements to uppercase
    uppercase_chars <- 
      map_at(specific_list, 
             c("b", "d"), 
             ~ toupper(.x))
  • walk - When you only want the side effects and don’t need a return value (the input is returned invisibly)

    purrr::walk(grps, 
                ~chk::chk_character_or_factor(.x, 
                                              x_name = "... (group columns)")
    )
  • pmap - To iterate by element across each list

    lol <- list(alg_list, grid_list, n_iter_list)
    
    # Setting up multiple RandomSearchCV objects, 1 for each algorithm
    # Collecting them in the inner-loop list
    inner_loop <- 
      purrr::pmap(lol, 
                  function(alg, grid, n_iter) {
                    sk_ms$RandomizedSearchCV(
                      estimator = alg,
                      param_distributions = grid,
                      n_iter = n_iter,
                      scoring = 'neg_mean_absolute_error',
                      cv = inner_cv,
                      n_jobs = -1L,
                      pre_dispatch = '2*n_jobs',
                      refit = TRUE)
                  }
      )
  • modify - Applies a transformation to each element of a list or vector and returns the modified list or vector.

    if ("step.pattern" %in% names(params[[dist_alg]])) {
    
          grid_final <- 
            purrr::modify(grid_initial, 
                          .f = function(x) {
    
                                 # coerce step pattern obj to a numeric vector to determine which step pattern it is
                                 step_test <- as.numeric(x$step.pattern)
                                 step_sym1 <- as.numeric(dtw::symmetric1)
                                 step_sym2 <- as.numeric(dtw::symmetric2)
                                 # compare patterns' numeric vectors then add step.pattern label to grid
                                 if (all(step_test == step_sym1)) {
                                   param_ls <- append(x, c(step_pattern_id = "symmetric1"))
                                 } else {
                                   param_ls <- append(x, c(step_pattern_id = NA))
                                 }
                                 return(param_ls)
                          }
            )
    }
  • every - Checks if all elements in a list or vector satisfy a given predicate. If all elements meet the condition, it returns TRUE; otherwise, it returns FALSE.

    numbers <- list(2, 4, 6, 8)
    # Check if all numbers are even
    every(numbers, ~ .x %% 2 == 0)
    #> [1] TRUE
    
    mtcars %>%
      select(hp) %>%
      map_lgl(~some(.x, ~ .x > 150))
    #> hp
    #> TRUE
    • some - Checks if at least one element in a list or vector satisfies a given predicate. If any element meets the condition, it returns TRUE; otherwise, it returns FALSE.

    • none - Checks if no elements in a list or vector satisfy a given predicate. If no elements meet the condition, it returns TRUE; otherwise, it returns FALSE.

  • keep - Retains elements that satisfy a given predicate. If an element meets the condition, it is kept; otherwise, it is removed.

    # Keep cars with mpg greater than 20 and discard cars with hp less than 100
    filtered_cars <- mtcars %>%
      split(1:nrow(.)) %>%
      keep(~ .x$mpg > 20) %>%
      discard(~ .x$hp < 100) %>%
      bind_rows()
    
    filtered_cars
    
    #>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
    #> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
    #> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
    #> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
    #> Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
    #> Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
    • discard is the inverse of keep: it removes the elements that satisfy the predicate and retains the rest
    • split splits the df into a list of row elements
  • reduce

    • Example 1: Recursively join dataframes

      react_tbl_list <- list(react_dd_heat, 
                             avg_covid_icu_hist, 
                             avg_covid_hosp_hist, 
                             avg_total_inpat_beds_hist)
      
      react_tab_final <- 
        purrr::reduce(react_tbl_list, 
                      left_join, 
                      by = "hospital_name")
    • Example 2: Recursively layer geoms (article)

      viridis_colors <- viridis::viridis(10)
      
      # old
      # mtcars %>% 
      #   ggplot(aes(hp, mpg)) +
      #   geom_point(size = 10, color = viridis_colors[5]) +
      #   geom_point(size = 8, color = viridis_colors[4]) +
      #   geom_point(size = 6, color = viridis_colors[3]) +
      #   geom_point(size = 4, color = viridis_colors[2]) +
      #   geom_point(size = 2, color = viridis_colors[1]) +
      #   scale_x_discrete(expand = expansion(.2)) +
      #   scale_y_continuous(expand = expansion(.2)) +
      #   theme_void() +
      #   theme(panel.background = element_rect(fill = "grey20"))
      
      # new
      reduce(
          5L:1L,
          ~ .x + geom_point(size = .y * 2, color = viridis_colors[.y]),
      
          .init = mtcars %>% 
            ggplot(aes(hp, mpg)) +
            scale_x_discrete(expand = expansion(.2)) +
            scale_y_continuous(expand = expansion(.2)) +
            theme_void() +
            theme(panel.background = element_rect(fill = "grey20"))
      
      )
      • .x is the accumulated result returned by the previous call (starting with .init), and .y is the next value from the iterable, with values taken from left to right.
      • The order of ggplot calls doesn’t matter. See methods 2 and 3 in the article for solutions to situations when you want to place reduce at a specific place within a chain of code
      • The output of this is one chart with multiple sized dots layered on top of each other. It’s a sort of tree ring effect on the dots of a scatterplot.
    • Example 3: Iterate functions

      square <- function(x) x^2
      deviation <- function(x) x - mean(x)
      nums <- runif(100)
      
      my_funs <- list(deviation, square, mean, sqrt)
      
      reduce(
        my_funs,
        ~ .y(.x),
        .init = nums
      )
      • See Example 2 in reduce2 for iterating functions and arguments
  • reduce2

    • Same as reduce but takes two iterables as arguments

    • Example 1: Individually style columns in a kable table (article)

      # using reduce
      # numbers <- 3:5
      # background_colors <- c("skyblue", "forestgreen", "chocolate")
      # 
      # (mtcars %>% 
      #   head() %>% 
      #   kbl() %>% 
      #   kable_classic(html_font = "Roboto")) %>% 
      #   reduce(
      #     1:3,
      #     ~ .x %>% column_spec(numbers[.y], background = background_colors[.y]),
      #     .init = .
      #   )
      
      (mtcars %>% 
        head() %>% 
        kbl() %>% 
        kable_classic(html_font = "Roboto")) %>% 
        reduce2(
          3:5,                                           # 1st varying argument (represented by ..2)
          c("skyblue", "forestgreen", "chocolate"),      # 2nd varying argument (represented by ..3)
          ~ ..1 %>% column_spec(..2, background = ..3),
          .init = .
        )
      • ..1 is like the .x and ..2 is like the .y from reduce. The only new part is ..3 which refers to the second varying argument.
    • Example 2: Iterate functions and arguments

      reduce2(
        my_list_of_funs,
        my_list_of_args,
        ~ do.call(..2, c(list(dat = ..1), ..3)),
        .init = mtcars
      )
      • The list of arguments is actually a list of lists with each set of arguments getting its own list.

      • See Example 1 for descriptions of the “..1” etc. syntax

  • accumulate

    • It’s like reduce, except instead of returning a single value which is the output of the very last function call, it keeps all intermediate values and returns them in a list.

    • Example: (article)

      plots <- (mtcars %>% 
        ggplot(aes(hp, mpg)) +
        scale_x_discrete(expand = expansion(.2)) +
        scale_y_continuous(expand = expansion(.2)) +
        theme_void() +
        theme(panel.background = element_rect(fill = "grey20"))) %>% 
        accumulate(
          10L:1L,
          ~ .x + geom_point(size = .y ^ 1.5, color = viridis_colors[.y]),
          .init = .
        )
      
      for (i in plots) { plot(i) }
      • Variation of Example 2 in the reduce section

      • Unlike reduce, where the output was one plot, accumulate outputs a list of plots: the initial plot supplied via .init, plus one for each value of the iterable.

  • accumulate2

    • It’s like reduce2, except instead of returning a single value which is the output of the very last function call, it keeps all intermediate values and returns them in a list.

    • Example: (article)

      tables <- mtcars %>% 
        head() %>% 
        kbl() %>% 
        kable_classic(html_font = "Roboto") %>% 
        kable_styling(full_width = FALSE) %>% # Added to keep aspect ratio constant when saving
        accumulate2(
          1:(length(mtcars)+1),                                          
          viridis::viridis(length(mtcars)+1),  
          ~ column_spec(..1, ..2, background = ..3, color = if(..2 < 5){"white"}),
          .init = .
        )
      • Produces a table for each individually styled column and stores them all in a list.
  • compose - Sequentially apply functions

    # Define scaling and log functions
    scale_by_10 <- function(x) x * 10
    safe_log <- safely(log, otherwise = NA)
    
    # Compose them into a single function
    scale_and_log <- compose(safe_log, scale_by_10)
    
    # Apply the composed function to the hp column
    mtcars <- mtcars %>%
      mutate(log_scaled_hp = map_dbl(hp, ~ scale_and_log(.x)$result))
    
    head(mtcars)
    
    #>                    mpg cyl disp  hp drat    wt  qsec vs am gear carb log_scaled_hp
    #> Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4      7.003065
    #> Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4      7.003065
    #> Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1      6.835185
    #> Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1      7.003065
    #> Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2      7.467371
    #> Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1      6.956545