Data manipulation with polars

Author

Tony Duan

Polars is a DataFrame interface on top of an OLAP Query Engine implemented in Rust using Apache Arrow Columnar Format as the memory model.

Code
#Sys.setenv(RETICULATE_PYTHON = "/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")
library(reticulate)
#use_python("/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")
py_require(c("polars", "pandas", "numpy", "matplotlib", "duckdb","datetime","seaborn","pyarrow"))

1 load package

Code
# Import the pandas library for data manipulation
import pandas as pd
# Import numpy for numerical operations
import numpy as np
# Import matplotlib.pylab for plotting
import matplotlib.pylab as plt
# Import seaborn for statistical data visualization
import seaborn as sns
# Import polars for DataFrame operations
import polars as pl
# Import col and when from polars for expression building
from polars import col, when
# Import datetime for date and time operations
from datetime import datetime
# Import duckdb for in-memory SQL database operations
import duckdb

2 import data from csv

Code
# Read the CSV file './data/mtcars.csv' into a Polars DataFrame
mtcars = pl.read_csv("./data/mtcars.csv")

3 import data from pandas

Code
# Read the CSV file into a pandas DataFrame
mtcars_pd= pd.read_csv("./data/mtcars.csv")
# Print the type of mtcars_pd
type(mtcars_pd)
<class 'pandas.core.frame.DataFrame'>
Code
# Convert the pandas DataFrame to a Polars DataFrame
mtcars = pl.from_pandas(mtcars_pd)
# Print the type of mtcars
type(mtcars)
<class 'polars.dataframe.frame.DataFrame'>
Code
# Select the 'cyl', 'mpg', and 'hp' columns from the mtcars DataFrame and take the first 5 rows
small_mtcars = mtcars.select(["cyl", "mpg","hp"]).head(5)

4 select column

4.1 get column names

Code
# Get the column names of the small_mtcars DataFrame
small_mtcars.columns
['cyl', 'mpg', 'hp']

4.2 select columns by name

Code
# Select the 'cyl' and 'mpg' columns from the small_mtcars DataFrame
small_mtcars.select(["cyl", "mpg"])
shape: (5, 2)
cyl mpg
i64 f64
6 21.0
6 21.0
4 22.8
6 21.4
8 18.7

4.3 select columns by name match with ‘p’

Code
# Select columns with 'p' in the name
cols_with_p = [col for col in small_mtcars.columns if 'p' in col]

# Select the columns identified in cols_with_p from the small_mtcars DataFrame
small_mtcars.select(cols_with_p)
shape: (5, 2)
mpg hp
f64 i64
21.0 110
21.0 110
22.8 93
21.4 110
18.7 175

4.4 select columns by index

4.4.1 select first and 3rd columns

Code
# Select the first (index 0) and third (index 2) columns of the small_mtcars DataFrame
small_mtcars.select([small_mtcars.columns[0], small_mtcars.columns[2]])
shape: (5, 2)
cyl hp
i64 i64
6 110
6 110
4 93
6 110
8 175

4.4.2 select first to 3rd columns

Code
# Select columns from index 0 up to (but not including) index 3
small_mtcars.select(small_mtcars.columns[0:3])
shape: (5, 3)
cyl mpg hp
i64 f64 i64
6 21.0 110
6 21.0 110
4 22.8 93
6 21.4 110
8 18.7 175

5 drop column

Code
# Drop the column named "cyl" from the small_mtcars DataFrame
small_mtcars.drop("cyl")
shape: (5, 2)
mpg hp
f64 i64
21.0 110
21.0 110
22.8 93
21.4 110
18.7 175

6 Renaming column

Code
# Rename the column 'mpg' to 'new_name_mpg'
small_mtcars.rename({
    "mpg": "new_name_mpg"
})
shape: (5, 3)
cyl new_name_mpg hp
i64 f64 i64
6 21.0 110
6 21.0 110
4 22.8 93
6 21.4 110
8 18.7 175

7 Create column

7.1 Mutate

Code
# Apply transformations to the mtcars DataFrame
result = (
    mtcars.head()
    .with_columns([
        # Create a new column 'gear2' by adding 1 to the 'gear' column
        (col("gear") + 1).alias("gear2")
        
        # Create a new column 'gear3': if 'gear' > 3 then 'long', else 'short'
        ,when(col("gear") > 3).then(pl.lit("long"))
        .otherwise(pl.lit("short")).alias("gear3")
        
        # Create a new column 'qsec2' with conditional logic based on 'qsec' values
        ,when(col("qsec") <= 17).then(pl.lit("short"))
        .when(col("qsec") <= 18).then(pl.lit("Medium"))
        .otherwise(pl.lit("long")).alias("qsec2")

    ])
)

# Print the resulting DataFrame
print(result)
shape: (5, 15)
┌───────────────────┬──────┬─────┬───────┬───┬──────┬───────┬───────┬────────┐
│ Unnamed: 0        ┆ mpg  ┆ cyl ┆ disp  ┆ … ┆ carb ┆ gear2 ┆ gear3 ┆ qsec2  │
│ ---               ┆ ---  ┆ --- ┆ ---   ┆   ┆ ---  ┆ ---   ┆ ---   ┆ ---    │
│ str               ┆ f64  ┆ i64 ┆ f64   ┆   ┆ i64  ┆ i64   ┆ str   ┆ str    │
╞═══════════════════╪══════╪═════╪═══════╪═══╪══════╪═══════╪═══════╪════════╡
│ Mazda RX4         ┆ 21.0 ┆ 6   ┆ 160.0 ┆ … ┆ 4    ┆ 5     ┆ long  ┆ short  │
│ Mazda RX4 Wag     ┆ 21.0 ┆ 6   ┆ 160.0 ┆ … ┆ 4    ┆ 5     ┆ long  ┆ Medium │
│ Datsun 710        ┆ 22.8 ┆ 4   ┆ 108.0 ┆ … ┆ 1    ┆ 5     ┆ long  ┆ long   │
│ Hornet 4 Drive    ┆ 21.4 ┆ 6   ┆ 258.0 ┆ … ┆ 1    ┆ 4     ┆ short ┆ long   │
│ Hornet Sportabout ┆ 18.7 ┆ 8   ┆ 360.0 ┆ … ┆ 2    ┆ 4     ┆ short ┆ Medium │
└───────────────────┴──────┴─────┴───────┴───┴──────┴───────┴───────┴────────┘

7.2 Transmute,create column and only keep this column

Code
# Select the 'gear' column from the first few rows of mtcars, add 1 to it, and alias it as 'gear2'
mtcars.head().select([
        (col("gear") + 1).alias("gear2")
    ])
shape: (5, 1)
gear2
i64
5
5
5
4
4

8 Filter rows

Code
# Filter rows where the 'gear' column is equal to 4
mtcars.filter(
   (col("gear") == 4)
)
shape: (12, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Mazda RX4" 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4
"Mazda RX4 Wag" 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4
"Datsun 710" 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1
"Merc 240D" 24.4 4 146.7 62 3.69 3.19 20.0 1 0 4 2
"Merc 230" 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
"Fiat 128" 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1
"Honda Civic" 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
"Toyota Corolla" 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1
"Fiat X1-9" 27.3 4 79.0 66 4.08 1.935 18.9 1 1 4 1
"Volvo 142E" 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2

8.1 Filters with AND conditions

Code
# Filter rows where 'cyl' is greater than 4 AND 'gear' is equal to 5
mtcars.filter(
    (col("cyl") > 4) & (col("gear") == 5)
)
shape: (3, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Ford Pantera L" 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 4
"Ferrari Dino" 19.7 6 145.0 175 3.62 2.77 15.5 0 1 5 6
"Maserati Bora" 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8

8.2 Filters with OR conditions

Code
# Filter rows where 'cyl' is equal to 6 OR 'gear' is equal to 5
mtcars.filter(
    (col("cyl") == 6) | (col("gear") == 5)
)
shape: (11, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Mazda RX4" 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4
"Mazda RX4 Wag" 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4
"Hornet 4 Drive" 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
"Valiant" 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1
"Merc 280" 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
"Porsche 914-2" 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2
"Lotus Europa" 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
"Ford Pantera L" 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 4
"Ferrari Dino" 19.7 6 145.0 175 3.62 2.77 15.5 0 1 5 6
"Maserati Bora" 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8

8.3 filter row with index

8.3.1 first 3

Code
# Select the first 3 rows of the small_mtcars DataFrame
small_mtcars.head(3)
shape: (3, 3)
cyl mpg hp
i64 f64 i64
6 21.0 110
6 21.0 110
4 22.8 93

8.3.2 last 3

Code
# Select the last 3 rows of the small_mtcars DataFrame
small_mtcars.tail(3)
shape: (3, 3)
cyl mpg hp
i64 f64 i64
4 22.8 93
6 21.4 110
8 18.7 175

8.3.3 5th rows

Code
# Select the row at index 4 (which is the 5th row)
mtcars[4]
shape: (1, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Hornet Sportabout" 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2

8.3.4 1 and 5th rows

Code
# Select rows at index 0 (1st row) and 4 (5th row)
mtcars[[0, 4]]
shape: (2, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Mazda RX4" 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4
"Hornet Sportabout" 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2

8.3.5 get 1 to 4 rows

Code
# Select rows from index 0 up to (but not including) index 4
mtcars[0:4]
shape: (4, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Mazda RX4" 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4
"Mazda RX4 Wag" 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4
"Datsun 710" 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1
"Hornet 4 Drive" 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1

8.3.6 get ramdon 5 rows

Code
# Select 5 random rows from the mtcars DataFrame, with a fixed seed for reproducibility
mtcars.sample(n=5, seed=42)
shape: (5, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Hornet Sportabout" 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2
"Ferrari Dino" 19.7 6 145.0 175 3.62 2.77 15.5 0 1 5 6
"Merc 230" 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
"AMC Javelin" 15.2 8 304.0 150 3.15 3.435 17.3 0 0 3 2
"Fiat 128" 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1

9 Append

9.1 append by row

Code
# Select rows from index 0 up to (but not including) index 4
data1 = mtcars[0:4]
# Select rows from index 10 up to (but not including) index 11
data2 = mtcars[10:11]

# Concatenate data1 and data2 DataFrames vertically
pl.concat([data1, data2], how="vertical")
shape: (5, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Mazda RX4" 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4
"Mazda RX4 Wag" 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4
"Datsun 710" 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1
"Hornet 4 Drive" 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
"Merc 280C" 17.8 6 167.6 123 3.92 3.44 18.9 1 0 4 4

9.2 append by column

Code
# Select the 'mpg' column from small_mtcars
data1 = small_mtcars.select("mpg")
# Select the 'cyl' column from small_mtcars
data2 = small_mtcars.select("cyl")

# Concatenate data1 and data2 DataFrames horizontally
pl.concat([data1, data2], how="horizontal")
shape: (5, 2)
mpg cyl
f64 i64
21.0 6
21.0 6
22.8 4
21.4 6
18.7 8

9.3 Dropping NA values

Code
# Drop all rows that contain any null values
mtcars.drop_nulls()
shape: (32, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Mazda RX4" 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4
"Mazda RX4 Wag" 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4
"Datsun 710" 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1
"Hornet 4 Drive" 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
"Hornet Sportabout" 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2
"Lotus Europa" 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
"Ford Pantera L" 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 4
"Ferrari Dino" 19.7 6 145.0 175 3.62 2.77 15.5 0 1 5 6
"Maserati Bora" 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8
"Volvo 142E" 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2

9.4 To drop rows with NAs in specific columns:

Code
# Drop rows that have null values in either the "mpg" or "disp" columns
mtcars.drop_nulls(subset=["mpg", "disp"])
shape: (32, 12)
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
str f64 i64 f64 i64 f64 f64 f64 i64 i64 i64 i64
"Mazda RX4" 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4
"Mazda RX4 Wag" 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4
"Datsun 710" 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1
"Hornet 4 Drive" 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
"Hornet Sportabout" 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2
"Lotus Europa" 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
"Ford Pantera L" 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 4
"Ferrari Dino" 19.7 6 145.0 175 3.62 2.77 15.5 0 1 5 6
"Maserati Bora" 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8
"Volvo 142E" 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2

9.5 keep NA values

10 group by

10.1 average,min,max,sum

Code
# Group the mtcars DataFrame by the 'cyl' column and aggregate various statistics
tbl_query = (
    mtcars
    .group_by("cyl")
    .agg([
        pl.col("hp").mean().alias("avg_hp"),  # Calculate the mean of 'hp' and alias it as 'avg_hp'
        pl.col("hp").min().alias("min_hp"),    # Calculate the minimum of 'hp' and alias it as 'min_hp'
        pl.col("hp").max().alias("max_hp"),    # Calculate the maximum of 'hp' and alias it as 'max_hp'
        pl.col("disp").sum().alias("totol_disp") # Calculate the sum of 'disp' and alias it as 'totol_disp'
    ])
)

# Print the resulting aggregated DataFrame
tbl_query
shape: (3, 5)
cyl avg_hp min_hp max_hp totol_disp
i64 f64 i64 i64 f64
6 122.285714 105 175 1283.2
8 209.214286 150 335 4943.4
4 82.636364 52 113 1156.5

10.2 count record and count distinct record

Code
# Group the mtcars DataFrame by the 'cyl' column and count the number of rows in each group
mtcars.group_by("cyl").agg(pl.count().alias("n"))
shape: (3, 2)
cyl n
i64 u32
6 7
8 14
4 11
Code
# Group the mtcars DataFrame by the 'cyl' column and count the number of unique 'hp' values for each group
mtcars.group_by("cyl").agg(pl.col("hp").n_unique().alias("n"))
shape: (3, 2)
cyl n
i64 u32
4 10
6 4
8 9

11 order rows

Code
# Sort the small_mtcars DataFrame by the 'hp' column in ascending order
small_mtcars.sort("hp")
shape: (5, 3)
cyl mpg hp
i64 f64 i64
4 22.8 93
6 21.0 110
6 21.0 110
6 21.4 110
8 18.7 175

11.1 Sort in descending order

Code
# Sort the small_mtcars DataFrame by the 'hp' column in descending order
small_mtcars.sort("hp", descending=True)
shape: (5, 3)
cyl mpg hp
i64 f64 i64
8 18.7 175
6 21.0 110
6 21.0 110
6 21.4 110
4 22.8 93

11.2 Arrange by multiple variables

Code
# Sort the small_mtcars DataFrame by 'cyl' in ascending order and 'mpg' in descending order
small_mtcars.sort(
    by=["cyl", "mpg"],
    descending=[False, True]  # True means descending
)
shape: (5, 3)
cyl mpg hp
i64 f64 i64
4 22.8 93
6 21.4 110
6 21.0 110
6 21.0 110
8 18.7 175

12 join

Code
# Create a Polars DataFrame named lhs
lhs = pl.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})
# Create a Polars DataFrame named rhs
rhs = pl.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})
Code
# Display the lhs DataFrame
lhs
shape: (3, 2)
id val
i64 str
1 "lhs.1"
2 "lhs.2"
3 "lhs.3"
Code
# Display the rhs DataFrame
rhs
shape: (3, 2)
id val
i64 str
1 "rhs.1"
2 "rhs.2"
4 "rhs.3"

12.1 inner_join

Code
# Perform an inner join of lhs and rhs DataFrames on the 'id' column
result = lhs.join(rhs, on="id", how="inner")
# Display the result
result
shape: (2, 3)
id val val_right
i64 str str
1 "lhs.1" "rhs.1"
2 "lhs.2" "rhs.2"

12.2 full join

Code
# Perform a full outer join of lhs and rhs DataFrames on the 'id' column
result = lhs.join(rhs, on="id", how="outer")
# Display the result
result
shape: (4, 4)
id val id_right val_right
i64 str i64 str
1 "lhs.1" 1 "rhs.1"
2 "lhs.2" 2 "rhs.2"
null null 4 "rhs.3"
3 "lhs.3" null null

12.3 left join

Code
# Perform a left join of lhs and rhs DataFrames on the 'id' column
result = lhs.join(rhs, on="id", how="left")
# Display the result
result
shape: (3, 3)
id val val_right
i64 str str
1 "lhs.1" "rhs.1"
2 "lhs.2" "rhs.2"
3 "lhs.3" null

12.4 anti join

keep data in left which not in right

Code
# Perform an anti-join: keep rows from lhs that do not have a match in rhs based on 'id'
result = lhs.join(rhs, on="id", how="anti")
# Display the result
result
shape: (1, 2)
id val
i64 str
3 "lhs.3"

keep data in right which not in left

Code
# Perform an anti-join: keep rows from rhs that do not have a match in lhs based on 'id'
result = rhs.join(lhs, on="id", how="anti")
# Display the result
result
shape: (1, 2)
id val
i64 str
4 "rhs.3"

13 Reshape tables

Code
# Create a Polars DataFrame named costs
costs = pl.DataFrame({
    "id": [1, 2],
    "price_x": [0.1, 0.2],
    "price_y": [0.4, 0.5],
    "price_z": [0.7, 0.8],
})

# Display the DataFrame
costs
shape: (2, 4)
id price_x price_y price_z
i64 f64 f64 f64
1 0.1 0.4 0.7
2 0.2 0.5 0.8

13.1 Gather data long(wide to long)

Below 3 method will give same result

Code
# Unpivot (melt) the costs DataFrame from wide to long format
costs_long = costs.unpivot(
    index="id",                # Columns to keep as identifiers
    on=["price_x", "price_y", "price_z"],  # Columns to melt
    variable_name="type",     # Name for the melted column names
    value_name="price"        # Name for the melted values
)

# Display the long format DataFrame
costs_long
shape: (6, 3)
id type price
i64 str f64
1 "price_x" 0.1
2 "price_x" 0.2
1 "price_y" 0.4
2 "price_y" 0.5
1 "price_z" 0.7
2 "price_z" 0.8

13.2 Spread data wide(long to wide)

Code
# Pivot the costs_long DataFrame from long to wide format
costs_wide = costs_long.pivot(
    values="price",
    index="id",
    on="type"  
)

# Display the wide format DataFrame
costs_wide
shape: (2, 4)
id price_x price_y price_z
i64 f64 f64 f64
1 0.1 0.4 0.7
2 0.2 0.5 0.8

14 string

Code
# Create a Polars DataFrame named df
df = pl.DataFrame({
    "text": ["abc", "DDD", "1243c", "aeEe"],
    "num": [3, 4, 7, 8]
})
# Display the DataFrame
df
shape: (4, 2)
text num
str i64
"abc" 3
"DDD" 4
"1243c" 7
"aeEe" 8

14.1 upper case

Code
# Add a new column 'text_new' with the uppercase version of the 'text' column
df.with_columns(
    pl.col("text").str.to_uppercase().alias("text_new")
)
shape: (4, 3)
text num text_new
str i64 str
"abc" 3 "ABC"
"DDD" 4 "DDD"
"1243c" 7 "1243C"
"aeEe" 8 "AEEE"

14.2 lower case

Code
# Add a new column 'text_new' with the lowercase version of the 'text' column
df.with_columns(
    pl.col("text").str.to_lowercase().alias("text_new")
)
shape: (4, 3)
text num text_new
str i64 str
"abc" 3 "abc"
"DDD" 4 "ddd"
"1243c" 7 "1243c"
"aeEe" 8 "aeee"

14.3 match

Code
# Add multiple new columns based on string matching conditions
df.with_columns([
    # 'text_new1': 'T' if 'text' is exactly 'abc', else 'F'
    pl.when(pl.col("text") == "abc").then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new1"),
    # 'text_new2': 'T' if 'text' starts with 'a', else 'F'
    pl.when(pl.col("text").str.starts_with("a")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new2"),
    # 'text_new3': 'T' if 'text' ends with 'c', else 'F'
    pl.when(pl.col("text").str.ends_with("c")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new3"),
    # 'text_new4': 'T' if 'text' contains '4', else 'F'
    pl.when(pl.col("text").str.contains("4")).then(pl.lit("T")).otherwise(pl.lit("F")).alias("text_new4"),
])
shape: (4, 6)
text num text_new1 text_new2 text_new3 text_new4
str i64 str str str str
"abc" 3 "T" "T" "T" "F"
"DDD" 4 "F" "F" "F" "F"
"1243c" 7 "F" "F" "T" "T"
"aeEe" 8 "F" "T" "F" "F"

14.4 concatenation

Code
# Add a new column 'text_new1' by concatenating the 'text' column with itself, separated by " is "
df.with_columns(
    (pl.col("text") + " is " + pl.col("text")).alias("text_new1")
)
shape: (4, 3)
text num text_new1
str i64 str
"abc" 3 "abc is abc"
"DDD" 4 "DDD is DDD"
"1243c" 7 "1243c is 1243c"
"aeEe" 8 "aeEe is aeEe"

14.5 replace

Use .str.replace(…, regex=True) with regular expressions to replace patterns in strings.

For example, the code below uses “a.”, where . is called a wildcard–which matches any character.

Code
# Add a new column 'text_new1' by replacing patterns in the 'text' column using a regular expression
df.with_columns(
    pl.col("text").str.replace(r"a.", "XX").alias("text_new1")
)
shape: (4, 3)
text num text_new1
str i64 str
"abc" 3 "XXc"
"DDD" 4 "DDD"
"1243c" 7 "1243c"
"aeEe" 8 "XXEe"

14.6 extract

Use str.extract() with a regular expression to pull out a matching piece of text.

text_new1 get rid of first ‘a’

text_new1 get rid of last ‘c’

Code
# Add new columns by extracting substrings from the 'text' column using regular expressions
df.with_columns([
    # 'text_new1': extract everything after the first 'a'
    pl.col("text").str.extract(r"a(.*)", 1).alias("text_new1"),
    # 'text_new2': extract everything before the last 'c'
    pl.col("text").str.extract(r"(.*)c", 1).alias("text_new2")
])
shape: (4, 4)
text num text_new1 text_new2
str i64 str str
"abc" 3 "bc" "ab"
"DDD" 4 null null
"1243c" 7 null "1243"
"aeEe" 8 "eEe" null

15 date

Polars provides robust functionalities for handling date and time data, including parsing, extracting components, and formatting.

Code
# Create a Polars DataFrame with 'dates' and 'times' columns containing string representations of dates and times
df_dates = pl.DataFrame({
    "dates": ["2023-04-05", "2024-05-06"],
    "times": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"]
})

# Display the DataFrame
df_dates
shape: (2, 2)
dates times
str str
"2023-04-05" "2023-04-05 06:07:08"
"2024-05-06" "2024-05-06 07:08:09"

15.1 Extracting Date Components

You can extract various components like year, month, day, hour, minute, second from datetime objects.

# Convert 'dates' to date type and extract year, month, day
df_dates.with_columns([
    pl.col("dates").str.to_date().dt.year().alias("year"),
    pl.col("dates").str.to_date().dt.month().alias("month"),
    pl.col("dates").str.to_date().dt.day().alias("day"),
])

15.2 Formatting Dates

Dates can be formatted into different string representations using strftime().

# Convert 'times' to datetime type and format it as YYYY-MM-DD HH:MM:SS
df_dates.with_columns(
    pl.col("times").str.to_datetime().dt.strftime("%Y-%m-%d %H:%M:%S").alias("formatted_time")
)
Code
# Add new columns by converting string columns to datetime and date objects
df_dates2 = df_dates.with_columns(
    # Convert 'times' column to datetime objects and alias as 'real_times'
    pl.col("times").str.to_datetime().alias("real_times"),
    # Convert 'dates' column to date objects and alias as 'real_dates'
    pl.col("dates").str.to_date().alias("real_dates")
)
# Display the DataFrame with new columns
df_dates2
shape: (2, 4)
dates times real_times real_dates
str str datetime[μs] date
"2023-04-05" "2023-04-05 06:07:08" 2023-04-05 06:07:08 2023-04-05
"2024-05-06" "2024-05-06 07:08:09" 2024-05-06 07:08:09 2024-05-06
Code
# Generate descriptive statistics for the df_dates2 DataFrame
df_dates2.describe()
shape: (9, 5)
statistic dates times real_times real_dates
str str str str str
"count" "2" "2" "2" "2"
"null_count" "0" "0" "0" "0"
"mean" null null "2023-10-20 18:37:38.500000" "2023-10-20 12:00:00"
"std" null null null null
"min" "2023-04-05" "2023-04-05 06:07:08" "2023-04-05 06:07:08" "2023-04-05"
"25%" null null "2023-04-05 06:07:08" "2023-04-05"
"50%" null null "2024-05-06 07:08:09" "2024-05-06"
"75%" null null "2024-05-06 07:08:09" "2024-05-06"
"max" "2024-05-06" "2024-05-06 07:08:09" "2024-05-06 07:08:09" "2024-05-06"

16 using polars with database

16.1 set up a DuckDB database, with an mtcars table.

Code
# Connect to DuckDB (in-memory by default)
con = duckdb.connect()

# Sample mtcars data as a list of tuples
mtcars_data = [
    (21.0, 6, 160, 110, 3.90, 2.62, 16.46, 0, 1, 4, 4),
    (21.0, 6, 160, 110, 3.90, 2.875, 17.02, 0, 1, 4, 4),
    (22.8, 4, 108, 93, 3.85, 2.320, 18.61, 1, 1, 4, 1),
    (21.4, 6, 258, 110, 3.08, 3.215, 19.44, 1, 0, 3, 1),
]

# Define column names for the DataFrame
columns = [
    "mpg", "cyl", "disp", "hp", "drat", "wt", "qsec",
    "vs", "am", "gear", "carb"
]

# Create Polars DataFrame from the sample data and column names
df = pl.DataFrame(mtcars_data, schema=columns)

16.2 create table in duckdb

Code
# Register the Polars DataFrame 'df' as a table named 'mtcars' in the DuckDB connection
con.register("mtcars", df)
<duckdb.duckdb.DuckDBPyConnection object at 0x12a871470>

16.3 show tables in duckdb

Code
# Execute an SQL query to show all tables and fetch the results
con.execute("SHOW TABLES").fetchall()
[('mtcars',)]

16.4 create query

Code
# Define an SQL query to select 'mpg', 'cyl', and 'hp' from the 'mtcars' table where 'cyl' is 6
query = "SELECT mpg, cyl, hp FROM mtcars WHERE cyl = 6"

16.5 get data

Code
# Execute the SQL query and retrieve the result as a Polars DataFrame
result = con.execute(query).pl()
# Display the result
result
shape: (3, 3)
mpg cyl hp
f64 i64 i64
21.0 6 110
21.0 6 110
21.4 6 110

16.6 LazyFrame

Code
# Execute the SQL query and retrieve the result as a Polars LazyFrame
lazy_df = con.execute(query).pl().lazy()
# Display the LazyFrame
lazy_df
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

DF ["mpg", "cyl", "hp"]; PROJECT */3 COLUMNS

16.7 Collect to DataFrame

because lazy expressions,the collect function is actually running the sql.

Code
# Collect the LazyFrame, executing the query and returning a Polars DataFrame
result_df = lazy_df.collect()
# Display the resulting DataFrame
result_df
shape: (3, 3)
mpg cyl hp
f64 i64 i64
21.0 6 110
21.0 6 110
21.4 6 110

17 reference:

https://docs.pola.rs/api/python/stable/reference/index.html

Back to top