Polars
Misc
Notes from
Packages
- {{tidypolars}}
Read csv:
import polars as pl
df = pl.read_csv("https://gist.githubusercontent.com/slopp/ce3b90b9168f2f921784de84fa445651/raw/4ecf3041f0ed4913e7c230758733948bc561f434/penguins.csv", null_values="NA")
Chaining operations:
df.filter(pl.col("species") == "Gentoo").select(pl.col("^bill.*$"))
Using with a GPU
Can see up to 10x execution time decrease
Notes from Polars + NVIDIA GPU Tutorial
Installation
pip install polars[gpu] --extra-index-url=https://pypi.nvidia.com
Example
gpu_engine = pl.GPUEngine(device=0, raise_on_fail=True)
high_spenders_gpu = (
    transactions
    .group_by("CUST_ID")
    .agg(pl.col("AMOUNT").sum())
    .sort(by="AMOUNT", descending=True)
    .head()
    .collect(engine=gpu_engine)
)
Columns
Quick Reference
dplyr polars Select column select(df, x)
df.select(pl.col("x"))
Select multiple columns select(df, x, y)
df.select(pl.col("x", "y"))
Rename column rename(df, new_x = x)
df.rename({"x": "new_x"})
Select all columns except x select(df, -x)
df.select(pl.exclude("x"))
Select all columns that start with “str” select(df, starts_with("str"))
df.select(pl.col("^bill.*$"))
or df.select(cs.starts_with("str"))
[1] Select numeric columns select(df, where(is.numeric))
df.select(cs.float(), cs.integer())
[1,2] Example: Tidyselect
df \
  .select([
    pl.col('^name|.*name$'),
    pl.when(pl.col('name') == pl.col('email_name')).then(pl.lit('yes'))
      .otherwise(pl.lit('no'))
      .alias('same?')
  ])
- Apparently we cannot currently use look-forward or backward in polars. Such as .*(?=@) to capture the email_name
df |>
  select(starts_with("name") | ends_with("name")) |>
  mutate(`same?` = case_when(
    name == email_name ~ "yes",
    TRUE ~ "no"
  ))
Rows
Quick Reference
dplyr polars First n rows head(df, n)
df.head(n)
Filter range of values filter(df, between(x, lo, hi))
df.filter(pl.col("x").is_between(lo, hi))
Conditional filter(df, x > 4)
df.filter(pl.col("x") > 4 )
Sort rows arrange(df, x)
df.sort("x")
Mutate
Quick Reference
dplyr polars Create new column mutate(df, x_mean = mean(x))
df.with_columns(pl.col("x").mean().alias("x_mean"))
Example: Ratio variables
df.with_columns((pl.col("bill_length_mm") / pl.col("flipper_length_mm")).alias("bill_flipper_ratio"))
#> shape: (344, 10)
#> ┌───────┬───────────┬───────────┬────────────────┬───┬─────────────┬────────┬──────┬────────────────────┐
#> │ rowid ┆ species   ┆ island    ┆ bill_length_mm ┆ … ┆ body_mass_g ┆ sex    ┆ year ┆ bill_flipper_ratio │
#> │ ---   ┆ ---       ┆ ---       ┆ ---            ┆   ┆ ---         ┆ ---    ┆ ---  ┆ ---                │
#> │ i64   ┆ str       ┆ str       ┆ f64            ┆   ┆ i64         ┆ str    ┆ i64  ┆ f64                │
#> ╞═══════╪═══════════╪═══════════╪════════════════╪═══╪═════════════╪════════╪══════╪════════════════════╡
#> │ 1     ┆ Adelie    ┆ Torgersen ┆ 39.1           ┆ … ┆ 3750        ┆ male   ┆ 2007 ┆ 0.216022           │
#> │ 2     ┆ Adelie    ┆ Torgersen ┆ 39.5           ┆ … ┆ 3800        ┆ female ┆ 2007 ┆ 0.212366           │
#> │ 3     ┆ Adelie    ┆ Torgersen ┆ 40.3           ┆ … ┆ 3250        ┆ female ┆ 2007 ┆ 0.206667           │
#> │ 4     ┆ Adelie    ┆ Torgersen ┆ null           ┆ … ┆ null        ┆ null   ┆ 2007 ┆ null               │
#> │ …     ┆ …         ┆ …         ┆ …              ┆ … ┆ …           ┆ …      ┆ …    ┆ …                  │
#> │ 341   ┆ Chinstrap ┆ Dream     ┆ 43.5           ┆ … ┆ 3400        ┆ female ┆ 2009 ┆ 0.215347           │
#> │ 342   ┆ Chinstrap ┆ Dream     ┆ 49.6           ┆ … ┆ 3775        ┆ male   ┆ 2009 ┆ 0.256995           │
#> │ 343   ┆ Chinstrap ┆ Dream     ┆ 50.8           ┆ … ┆ 4100        ┆ male   ┆ 2009 ┆ 0.241905           │
#> │ 344   ┆ Chinstrap ┆ Dream     ┆ 50.2           ┆ … ┆ 3775        ┆ female ┆ 2009 ┆ 0.253535           │
#> └───────┴───────────┴───────────┴────────────────┴───┴─────────────┴────────┴──────┴────────────────────┘
df |>
  mutate(bill_flipper_ratio = bill_length_mm / flipper_length_mm)
Example: Case-When
df \
  .with_columns([
    pl.when(pl.col('address').str.contains('OH')).then(pl.lit('local'))
      .when(pl.col('address').str.contains('NY')).then(pl.lit('foodie'))
      .otherwise(pl.lit('elsewhere'))
      .alias('familiarity')
  ])
df |>
  mutate(familiarity = case_when(
    str_detect(address, "OH") ~ "local",
    str_detect(address, "NY") ~ "foodie",
    TRUE ~ "elsewhere"
  ))
Example: String Extraction
df \
  .with_columns([
    pl.col('name').str.to_lowercase(),
    pl.col('email').str.extract(r'^([\d\w]+)@', group_index = 1)
      .alias('email_name')
  ])
df |>
  rename(tolower(name) = name) |>
  mutate(
    email_name = str_extract(email, "^([\\d\\w]+)@", group = 1)
  )
Summarize
Quick Reference
dplyr polars Group group_by(df, x)
df.group_by("x")
Summarize summarize(df, x_n = n())
df.agg(pl.count().alias("x_n"))
Example: Group_By, Count, Sort
df \
  .group_by("species") \
  .agg(pl.count().alias("counts")) \
  .sort("counts", descending=True)
#> shape: (3, 2)
#> ┌───────────┬────────┐
#> │ species   ┆ counts │
#> │ ---       ┆ ---    │
#> │ str       ┆ u32    │
#> ╞═══════════╪════════╡
#> │ Adelie    ┆ 152    │
#> │ Gentoo    ┆ 124    │
#> │ Chinstrap ┆ 68     │
#> └───────────┴────────┘
Example: Group-By, Arrange, Summarize
For each species, find the penguin with the lowest body mass
df \
  .group_by("species") \
  .agg(pl.all().sort_by("body_mass_g").drop_nulls().first())
#> shape: (3, 9)
#> ┌───────────┬───────┬───────────┬────────────────┬───┬───────────────────┬─────────────┬────────┬──────┐
#> │ species   ┆ rowid ┆ island    ┆ bill_length_mm ┆ … ┆ flipper_length_mm ┆ body_mass_g ┆ sex    ┆ year │
#> │ ---       ┆ ---   ┆ ---       ┆ ---            ┆   ┆ ---               ┆ ---         ┆ ---    ┆ ---  │
#> │ str       ┆ i64   ┆ str       ┆ f64            ┆   ┆ i64               ┆ i64         ┆ str    ┆ i64  │
#> ╞═══════════╪═══════╪═══════════╪════════════════╪═══╪═══════════════════╪═════════════╪════════╪══════╡
#> │ Adelie    ┆ 4     ┆ Torgersen ┆ 36.5           ┆ … ┆ 181               ┆ 2850        ┆ female ┆ 2007 │
#> │ Chinstrap ┆ 315   ┆ Dream     ┆ 46.9           ┆ … ┆ 192               ┆ 2700        ┆ female ┆ 2008 │
#> │ Gentoo    ┆ 272   ┆ Biscoe    ┆ 42.7           ┆ … ┆ 208               ┆ 3950        ┆ female ┆ 2009 │
#> └───────────┴───────┴───────────┴────────────────┴───┴───────────────────┴─────────────┴────────┴──────┘
df |>
  group_by(species) |>
  arrange(body_mass_g) |>
  summarize(body_mass_g = first(body_mass_g))
#> # A tibble: 3 × 2
#>   species   body_mass_g
#>   <fct>           <int>
#> 1 Adelie           2850
#> 2 Chinstrap        2700
#> 3 Gentoo           3950
Example: Select, Filter, Summarize, Across
df \
  .select(df.columns[0:4]+['^performance.*$','^salary.*$']) \
  .filter(pl.col('age') >= 30) \
  .select(pl.col(pl.Int64).mean().name.prefix('mean_'))
#> shape: (1, 5)
#> mean_age  mean_performance_rating  mean_performance_reviews_count  mean_salary  mean_salary_increase_percentage
#> f64       f64                      f64                             f64          f64
#> 46.6      4.4                      3.4                             40000.0      10.0
- For some reason, you have to sandwich `^` and `$` around the regex in order to return those column names
df |>
  select(1:3, starts_with("performance"), starts_with("salary")) |>
  filter(age >= 30) |>
  summarize(across(
    .cols = where(is.numeric),
    .fns = mean,
    .names = "mean_{.col}"))
#> # A tibble: 1 × 5
#>   mean_age mean_performance_rating mean_performance_reviews_count mean_salary
#>      <dbl>                   <dbl>                          <dbl>       <dbl>
#> 1     46.6                     4.4                            3.4       40000
#> # ℹ 1 more variable: mean_salary_increase_percentage <dbl>
- For some reason, you have to sandwich
Joins
Quick Reference
dplyr polars Join dataframes left_join(df1, df2, by=x)
df1.join(df2, on="x", how="left")
Example: Left Join
import random
df \
  .join(df_dept, on="department") \
  .select(['name','dept_id']) \
  .with_columns(
    employee_id = pl.format(
      '{}-{}',
      'dept_id',
      pl.Series([
        random.randint(100000, 999999) for _ in range(len(df))
      ])
    )
  )
df |>
  left_join(df_dept, by = "department") |>
  select(name, dept_id) |>
  mutate(employee_id = map_chr(dept_id, ~ paste0(.x, "-", sample(1:9, 1))))
tidyr
Example: Unite
df \
  .with_columns(
    pl.concat_str(
      pl.col(pl.String), separator=" "
    ).alias('combination_of_character')
  ) \
  .select(pl.col('combination_of_character'))
#> shape: (6, 1)
#> combination_of_character
#> str
#> "Alice New York 123 Main St, On…
#> "Bob San Francisco 123 Main St,…
#> "Charlie Tokyo 456-7890, Tokyo,…
#> "Ken Toronto 49494 Exchange St,…
#> "Steven Lima 1010 Gb st, Lima, …
#> "Carlos Cleveland 666 Heaven dr…
df |>
  unite(col = "combination_of_character", where(is.character), sep = " ") |>
  select(combination_of_character)
#> # A tibble: 6 × 1
#>   combination_of_character
#>   <chr>
#> 1 Alice New York 123 Main St, Ontario, OH 123-456-7890 alice@example.com Engineering 2010-01-01 Active 2022-05-01
#> 2 Bob San Francisco 123 Main St, Calgary, AB 987-654-3210 bob@example.com Marketing 2012-05-15 Inactive 2021-07-15
#> 3 Charlie Tokyo 456-7890, Tokyo, NY 098-765-4332 charlie@example.com Finance 2015-10-01 Active 2022-08-31
#> 4 Ken Toronto 49494 Exchange St, Toronto, ON 111-232-4141 ken@stats.org Marketing 2010-04-01 Inactive 2024-10-30
#> 5 Steven Lima 1010 Gb st, Lima, OH 505-402-6060 stephencurry@nba.com Marketing 2009-10-30 Active 2023-01-02
#> 6 Carlos Cleveland 666 Heaven dr, Cleveland, OH 909-435-1000 carlos@genius.edu Finance 2005-11-12 Active 2024-12-12
Example: Pivot Wider
df \
  .select(['name','address']) \
  .with_columns(
    state = pl.col('address').str.extract(r'([A-Z]{2})$')
  ) \
  .select('name','state') \
  .pivot(on = 'state', index = 'name', values = 'state', aggregate_function = 'len') \
  .with_columns(
    pl.col(pl.UInt32).fill_null(0)
  )

# Or since this just creates dummies
df \
  .select(['name','department']) \
  .to_dummies(columns = 'department')
df |>
  select(name, department) |>
  pivot_wider(id_cols = "name", names_from = "department", values_from = "department", values_fill = 0, values_fn = length, names_prefix = "department_")
#> # A tibble: 6 × 4
#>   name    department_Engineering department_Marketing department_Finance
#>   <chr>                    <int>                <int>              <int>
#> 1 Alice                        1                    0                  0
#> 2 Bob                          0                    1                  0
#> 3 Charlie                      0                    0                  1
#> 4 Ken                          0                    1                  0
#> 5 Steven                       0                    1                  0
#> 6 Carlos                       0                    0                  1
Snippets
Example: Lead and Forward Fill
- If the department only has 1 row of salary data, do not shift salary
- If the department has more than 1 row of salary data, shift by -1 of salary column
- Reason: there was a mistake in entering data for those with more than 1 row of data; apparently the actual salary data is 1 row further down
df \
  .group_by('department') \
  .agg(
    pl.col('name'),
    pl.when(pl.col('salary').len()==1).then(pl.col('salary'))
      .otherwise(pl.col('salary').shift(-1))
      .alias('salary_shift')
  ) \
  .explode('name','salary_shift') \
  .with_columns(
    pl.col('salary_shift').forward_fill()
  )
df |>
  group_by(department) |>
  summarize(
    name = name,
    salary_shift = case_when(
      n() == 1 ~ salary,
      TRUE ~ lead(salary)
    )
  ) |>
  fill(salary_shift, .direction = "down")
#> # A tibble: 6 × 3
#> # Groups:   department [3]
#>   department  name    salary_shift
#>   <chr>       <chr>          <dbl>
#> 1 Engineering Alice          50000
#> 2 Finance     Charlie        30000
#> 3 Finance     Carlos         30000
#> 4 Marketing   Bob            20000
#> 5 Marketing   Ken            40000
#> 6 Marketing   Steven         40000
Example: Map
import random
df \
  .select(['name','dept_id']) \
  .with_columns(
    employee_id = pl.format(
      '{}-{}',
      'dept_id',
      pl.Series([
        random.randint(100000, 999999) for _ in range(len(df))
      ])
    )
  )
df |>
  select(name, dept_id) |>
  mutate(employee_id = map_chr(dept_id, ~paste0(.x, "-", sample(1000000:9999999, 1))))
#> # A tibble: 6 × 3
#>   name    dept_id employee_id
#>   <chr>     <dbl> <chr>
#> 1 Alice        30 30-1694470
#> 2 Bob          25 25-1696036
#> 3 Charlie      20 20-4463080
#> 4 Ken          25 25-6942432
#> 5 Steven       25 25-3012223
#> 6 Carlos       20 20-8705991