Polars

Misc

  • Notes from

  • Packages

  • Read csv:

    import polars as pl
    
    df = pl.read_csv("https://gist.githubusercontent.com/slopp/ce3b90b9168f2f921784de84fa445651/raw/4ecf3041f0ed4913e7c230758733948bc561f434/penguins.csv", null_values="NA")
  • Chaining operations:

    df.filter(pl.col("species") == "Gentoo").select(pl.col("^bill.*$"))
  • Using with a GPU

    • Can see up to 10x execution time decrease

    • Notes from Polars + NVIDIA GPU Tutorial

    • Installation

      pip install polars[gpu] --extra-index-url=https://pypi.nvidia.com
    • Example

      gpu_engine = pl.GPUEngine(
          device=0,
          raise_on_fail=True,
      )
      
      high_spenders_gpu = (
          transactions
          .group_by("CUST_ID")
          .agg(pl.col("AMOUNT").sum())
          .sort(by="AMOUNT", descending=True)
          .head()
          .collect(engine=gpu_engine)
      )

Columns

  • Quick Reference

    dplyr polars
    Select column select(df, x) df.select(pl.col("x"))
    Select multiple columns select(df, x, y) df.select(pl.col("x", "y"))
    Rename column rename(df, new_x = x) df.rename({"x": "new_x"})
    Select all columns except x select(df, -x) df.select(pl.exclude("x"))
    Select all columns that start with “str” select(df, starts_with("str")) df.select(pl.col("^bill.*$")) or df.select(cs.starts_with("str"))[1]
    Select numeric columns select(df, where(is.numeric)) df.select(cs.float(), cs.integer())[1,2]
  • Example: Tidyselect

    df \
      .select([
          pl.col('^name|.*name$'),
          pl.when(
              pl.col('name') == pl.col('email_name')).then(pl.lit('yes'))
              .otherwise(pl.lit('no'))
              .alias('same?')
      ])
    • Apparently we cannot currently use look-forward or backward in polars. Such as .*(?=@) to capture the email_name
    df |> 
      select(starts_with("name") | ends_with("name")) |>
      mutate(`same?` = case_when(
        name == email_name ~ "yes",
        TRUE ~ "no"))    

Rows

  • Quick Reference

    dplyr polars
    First n rows head(df, n) df.head(n)
    Filter range of values filter(df, between(x, lo, hi)) df.filter(pl.col("x").is_between(lo, hi))
    Conditional filter(df, x > 4) df.filter(pl.col("x") > 4 )
    Sort rows arrange(df, x) df.sort("x")

Mutate

  • Quick Reference

    dplyr polars
    Create new column mutate(df, x_mean = mean(x)) df.with_columns(pl.col("x").mean().alias("x_mean"))
  • Example: Ratio variables

    df.with_columns((pl.col("bill_length_mm") / pl.col("flipper_length_mm")).alias("bill_flipper_ratio"))
    
    #> shape: (344, 10)
    #> ┌───────┬───────────┬───────────┬────────────────┬───┬─────────────┬────────┬──────┬────────────────────┐
    #> │ rowid ┆ species   ┆ island    ┆ bill_length_mm ┆ … ┆ body_mass_g ┆ sex    ┆ year ┆ bill_flipper_ratio │
    #> │ ---   ┆ ---       ┆ ---       ┆ ---            ┆   ┆ ---         ┆ ---    ┆ ---  ┆ ---                │
    #> │ i64   ┆ str       ┆ str       ┆ f64            ┆   ┆ i64         ┆ str    ┆ i64  ┆ f64                │
    #> ╞═══════╪═══════════╪═══════════╪════════════════╪═══╪═════════════╪════════╪══════╪════════════════════╡
    #> │ 1     ┆ Adelie    ┆ Torgersen ┆ 39.1           ┆ … ┆ 3750        ┆ male   ┆ 2007 ┆ 0.216022           │
    #> │ 2     ┆ Adelie    ┆ Torgersen ┆ 39.5           ┆ … ┆ 3800        ┆ female ┆ 2007 ┆ 0.212366           │
    #> │ 3     ┆ Adelie    ┆ Torgersen ┆ 40.3           ┆ … ┆ 3250        ┆ female ┆ 2007 ┆ 0.206667           │
    #> │ 4     ┆ Adelie    ┆ Torgersen ┆ null           ┆ … ┆ null        ┆ null   ┆ 2007 ┆ null               │
    #> │ …     ┆ …         ┆ …         ┆ …              ┆ … ┆ …           ┆ …      ┆ …    ┆ …                  │
    #> │ 341   ┆ Chinstrap ┆ Dream     ┆ 43.5           ┆ … ┆ 3400        ┆ female ┆ 2009 ┆ 0.215347           │
    #> │ 342   ┆ Chinstrap ┆ Dream     ┆ 49.6           ┆ … ┆ 3775        ┆ male   ┆ 2009 ┆ 0.256995           │
    #> │ 343   ┆ Chinstrap ┆ Dream     ┆ 50.8           ┆ … ┆ 4100        ┆ male   ┆ 2009 ┆ 0.241905           │
    #> │ 344   ┆ Chinstrap ┆ Dream     ┆ 50.2           ┆ … ┆ 3775        ┆ female ┆ 2009 ┆ 0.253535           │
    #> └───────┴───────────┴───────────┴────────────────┴───┴─────────────┴────────┴──────┴────────────────────┘
    df |> 
      mutate(bill_flipper_ratio = bill_length_mm / flipper_length_mm)
  • Example: Case-When

    df \
      .with_columns([  
          pl.when(pl.col('address').str.contains('OH'))
          .then(pl.lit('local'))
          .when(pl.col('address').str.contains('NY'))
          .then(pl.lit('foodie'))
          .otherwise(pl.lit('elsewhere'))
          .alias('familiarity')
        ])    
    df |>
     mutate(familiarity = case_when(
       str_detect(address, "OH") ~ "local",
       str_detect(address, "NY") ~ "foodie",
       TRUE ~ "elsewhere"
     ))
  • Example: String Extraction

    df \
      .with_columns(
          [
          pl.col('name').str.to_lowercase(),    
          pl.col('email').str.extract(r'^([\d\w]+)@', group_index = 1)
          .alias('email_name')
          ]
      )
    df |>
      rename(tolower(name) = name) |> 
      mutate(
        email_name = str_extract(email, "^([\\d\\w]+)@", group = 1)
      )

Summarize

  • Quick Reference

    dplyr polars
    Group group_by(df, x) df.group_by("x")
    Summarize summarize(df, x_n = n()) df.agg(pl.count().alias("x_n"))
  • Example: Group_By, Count, Sort

    df \
      .group_by("species") \
      .agg(pl.count() \
           .alias("counts")) \
      .sort("counts", descending=True)
    
    #> shape: (3, 2)
    #> ┌───────────┬────────┐
    #> │ species   ┆ counts │
    #> │ ---       ┆ ---    │
    #> │ str       ┆ u32    │
    #> ╞═══════════╪════════╡
    #> │ Adelie    ┆ 152    │
    #> │ Gentoo    ┆ 124    │
    #> │ Chinstrap ┆ 68     │
    #> └───────────┴────────┘
  • Example: Group-By, Arrange, Summarize

    For each species, find the penguin with the lowest body mass

    df \
      .group_by("species") \
      .agg(pl.all() \
        .sort_by("body_mass_g") \
        .drop_nulls() \
        .first())
    
    #> shape: (3, 9)
    #> ┌───────────┬───────┬───────────┬────────────────┬───┬───────────────────┬─────────────┬────────┬──────┐
    #> │ species   ┆ rowid ┆ island    ┆ bill_length_mm ┆ … ┆ flipper_length_mm ┆ body_mass_g ┆ sex    ┆ year │
    #> │ ---       ┆ ---   ┆ ---       ┆ ---            ┆   ┆ ---               ┆ ---         ┆ ---    ┆ ---  │
    #> │ str       ┆ i64   ┆ str       ┆ f64            ┆   ┆ i64               ┆ i64         ┆ str    ┆ i64  │
    #> ╞═══════════╪═══════╪═══════════╪════════════════╪═══╪═══════════════════╪═════════════╪════════╪══════╡
    #> │ Adelie    ┆ 4     ┆ Torgersen ┆ 36.5           ┆ … ┆ 181               ┆ 2850        ┆ female ┆ 2007 │
    #> │ Chinstrap ┆ 315   ┆ Dream     ┆ 46.9           ┆ … ┆ 192               ┆ 2700        ┆ female ┆ 2008 │
    #> │ Gentoo    ┆ 272   ┆ Biscoe    ┆ 42.7           ┆ … ┆ 208               ┆ 3950        ┆ female ┆ 2009 │
    #> └───────────┴───────┴───────────┴────────────────┴───┴───────────────────┴─────────────┴────────┴──────┘
    df |>
      group_by(species) |>
      arrange(body_mass_g) |>
      summarize(body_mass_g = first(body_mass_g))
    
    #> # A tibble: 3 × 2
    #>   species   body_mass_g
    #>   <fct>           <int>
    #> 1 Adelie           2850
    #> 2 Chinstrap        2700
    #> 3 Gentoo           3950
  • Example: Select, Filter, Summarize, Across

    df \
     .select(df.columns[0:4]+['^performance.*$','^salary.*$']) \
     .filter(pl.col('age') >= 30) \
     .select(pl.col(pl.Int64).mean().name.prefix('mean_'))
    
    #> shape: (1, 5)
    #> mean_age mean_performance_rating mean_performance_reviews_count  mean_salary mean_salary_increase_percentage
    #> f64  f64 f64 f64 f64
    #> 46.6 4.4 3.4 40000.0 10.0
    • For some reason, you have to sandwich ^ and $ around the regex in order to return those column names
    df |>
      select(1:3, 
             starts_with("performance"), 
             starts_with("salary")) |> 
      filter(age >= 30) |>
      summarize(across(
        .cols = where(is.numeric), 
        .fns = mean, 
        .names = "mean_{.col}"))
    
    #> # A tibble: 1 × 5
    #>   mean_age mean_performance_rating mean_performance_reviews_count mean_salary
    #>      <dbl>                   <dbl>                          <dbl>       <dbl>
    #> 1     46.6                     4.4                            3.4       40000
    #> # ℹ 1 more variable: mean_salary_increase_percentage <dbl>

Joins

  • Quick Reference

    dplyr polars
    Join dataframes left_join(df1, df2, by=x) df1.join(df2, on="x", how="left")
  • Example: Left Join

    import random
    
    df \
      .join(df_dept, on="department") \
      .select(['name','dept_id']) \
      .with_columns(
          employee_id = pl.format(
            '{}-{}',
            'dept_id',
            pl.Series([
              random.randint(100000, 999999) for _ in range(len(df))
              ])
              )
      )    
    df |>
      left_join(df_dept, by = "department") |>
      select(name, dept_id) |>
      mutate(employee_id = map_chr(dept_id, ~ paste0(.x, "-", sample(1:9, 1))))    

tidyr

  • Example: Unite

    df \
      .with_columns(
          pl.concat_str(
              pl.col(pl.String), separator=" "
          ).alias('combination_of_character')
      ) \
      .select(pl.col('combination_of_character'))
    
    #> shape: (6, 1)
    #> combination_of_character
    #> str
    #> "Alice New York 123 Main St, On…
    #> "Bob San Francisco 123 Main St,…
    #> "Charlie Tokyo 456-7890, Tokyo,…
    #> "Ken Toronto 49494 Exchange St,…
    #> "Steven Lima 1010 Gb st, Lima, …
    #> "Carlos Cleveland 666 Heaven dr…
    df |> 
      unite(col = "combination_of_character", 
            where(is.character), 
            sep = " ", ) |> 
      select(combination_of_character)
    
    #> # A tibble: 6 × 1
    #>   combination_of_character                                                                                         
    #>   <chr>                                                                                                            
    #> 1 Alice New York 123 Main St, Ontario, OH 123-456-7890 alice@example.com Engineering 2010-01-01 Active 2022-05-01  
    #> 2 Bob San Francisco 123 Main St, Calgary, AB 987-654-3210 bob@example.com Marketing 2012-05-15 Inactive 2021-07-15 
    #> 3 Charlie Tokyo 456-7890, Tokyo, NY 098-765-4332 charlie@example.com Finance 2015-10-01 Active 2022-08-31          
    #> 4 Ken Toronto 49494 Exchange St, Toronto, ON 111-232-4141 ken@stats.org Marketing 2010-04-01 Inactive 2024-10-30   
    #> 5 Steven Lima 1010 Gb st, Lima, OH 505-402-6060 stephencurry@nba.com Marketing 2009-10-30 Active 2023-01-02        
    #> 6 Carlos Cleveland 666 Heaven dr, Cleveland, OH 909-435-1000 carlos@genius.edu Finance 2005-11-12 Active 2024-12-12
  • Example: Pivot Wider

    df \
      .select(['name','address']) \
      .with_columns(
          state = pl.col('address').str.extract(r'([A-Z]{2})$')
      ) \
      .select('name','state') \
      .pivot(on = 'state', index = 'name', values='state', aggregate_function='len') \
      .with_columns(
          pl.col(pl.UInt32).fill_null(0)
      ) 
    
    # Or since this just creates dummies
    df \
      .select(['name','department']) \
      .to_dummies(columns = 'department') 
    df |>
      select(name, department) |>
      pivot_wider(id_cols = "name", 
                  names_from = "department", 
                  values_from = "department", 
                  values_fill = 0, 
                  values_fn = length, 
                  names_prefix = "department_")
    
    #> # A tibble: 6 × 4
    #>   name    department_Engineering department_Marketing department_Finance
    #>   <chr>                    <int>                <int>              <int>
    #> 1 Alice                        1                    0                  0
    #> 2 Bob                          0                    1                  0
    #> 3 Charlie                      0                    0                  1
    #> 4 Ken                          0                    1                  0
    #> 5 Steven                       0                    1                  0
    #> 6 Carlos                       0                    0                  1

Snippets

  • Example: Lead and Forward Fill

    • If the department only has 1 row of salary data, do not shift salary
    • If the department has more than 1 row of salary data, shift by -1 of salary column
    • Reason: there was a mistake in entering data for those with more than 1 row of data, apparently the actualy salary data is 1 row more
    df \
      .group_by('department') \
      .agg(
          pl.col('name'),
          pl.when(pl.col('salary').len()==1).then(pl.col('salary'))
          .otherwise(pl.col('salary').shift(-1))
          .alias('salary_shift')) \
      .explode('name','salary_shift') \
      .with_columns(
          pl.col('salary_shift').forward_fill()) 
    df |>
      group_by(department) |>
      summarize(
        name = name,
        salary_shift = case_when(
          n() == 1 ~ salary,
          TRUE ~ lead(salary)
        )
      ) |>
     fill(salary_shift, .direction = "down")
    
    #> # A tibble: 6 × 3
    #> # Groups:   department [3]
    #>   department  name    salary_shift
    #>   <chr>       <chr>          <dbl>
    #> 1 Engineering Alice          50000
    #> 2 Finance     Charlie        30000
    #> 3 Finance     Carlos         30000
    #> 4 Marketing   Bob            20000
    #> 5 Marketing   Ken            40000
    #> 6 Marketing   Steven         40000
  • Example: Map

    import random
    
    df \
      .select(['name','dept_id']) \
      .with_columns(
          employee_id = pl.format(
            '{}-{}',
            'dept_id',
            pl.Series([
              random.randint(100000, 999999) for _ in range(len(df))
              ])
              )
      ) 
    df |>
      select(name, dept_id) |>
      mutate(employee_id = map_chr(dept_id, ~paste0(.x, "-", sample(1000000:9999999, 1))))
    #> # A tibble: 6 × 3
    #>   name    dept_id employee_id
    #>   <chr>     <dbl> <chr>      
    #> 1 Alice        30 30-1694470 
    #> 2 Bob          25 25-1696036 
    #> 3 Charlie      20 20-4463080 
    #> 4 Ken          25 25-6942432 
    #> 5 Steven       25 25-3012223 
    #> 6 Carlos       20 20-8705991