Data manipulation with siuba(not working with current pandas version)

Author

Tony Duan

siuba (小巴) is a port of dplyr and other R libraries with seamless support for pandas and SQL

1 Comparison with different python dataframe package

Using python 3.11

Code

Sys.setenv(RETICULATE_PYTHON = "/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")
library(reticulate)
use_python("/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")

2 load package

Code

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns



from siuba.siu import call
from siuba import _, mutate, filter, group_by, summarize,show_query
from siuba import *

from siuba.data import mtcars,penguins

Code

small_mtcars = mtcars >> select(_.cyl, _.mpg, _.hp)>> head(5)

3 select column

3.1 get column names

Code

list(small_mtcars)

3.2 select columns by name

Code

small_mtcars >> select(_.cyl, _.mpg)

3.3 select columns by name match with ‘p’

Code

small_mtcars >> select(_.contains("p"))

3.4 select columns by index

3.4.1 select first and 3rd columns

Code

small_mtcars >> select(0,2)

3.4.2 select first to 3rd columns

Code

small_mtcars >> select(_[0:3])

4 drop column

Code

small_mtcars >> select(~_.cyl)

5 Renaming column

Code

small_mtcars >> rename(new_name_mpg = _.mpg)

6 Create column

6.1 Mutate

Code

mtcars.head()>> mutate(gear2 = _.gear+1
                      ,gear3=if_else(_.gear > 3, "long", "short")
                       ,qsec2=case_when({
                                          _.qsec <= 17: "short",
                                          _.qsec <= 18: "Medium",
                                          True: "long"
                                                     })
                       )

6.2 Transmute,create column and only keep this column

Code

mtcars.head()>> transmute(gear2 = _.gear+1)

7 Filter rows

Code

mtcars>> filter(_.gear ==4)

7.1 Filters with AND conditions

Code

mtcars >> filter((_.cyl >4) & (_.gear == 5))

7.2 Filters with OR conditions

Code

mtcars >> filter((_.cyl == 6) | (_.gear == 5))

7.3 filter row with index

7.3.1 first 3

Code

small_mtcars>>head(3)

7.3.2 last 3

Code

# not in siuba, in pandas
small_mtcars.tail(3)

7.3.3 5th rows

Code

# not in siuba, in pandas
mtcars.iloc[[4]]

7.3.4 1 and 5th rows

Code

# not in siuba, in pandas
mtcars.iloc[[0,4]]

7.3.5 1 to 5th rows

Code

# not in siuba, in pandas
mtcars.iloc[0:4]

7.3.6 get ramdon 5 rows

Code

mtcars.sample(5, random_state=42)

8 Append

8.1 append by row

Code

# not available in siuba yet
#from siuba import bind_rows

Code

# using pandas

# get 1 to 4 rows
data1=mtcars.iloc[0:4]

# get 9 rows
data2=mtcars.iloc[10:11]

data3=pd.concat([data1, data2], ignore_index = True,axis=0)

data3

8.2 append by column

Code

# not available in siuba yet
#from siuba import bind_columns

Code

# using pandas
data1=small_mtcars>>select(_.mpg)

data2=small_mtcars>>select(_.cyl)

data3=pd.concat([data1, data2],axis=1)

data3

8.3 Dropping NA values

8.4 keep NA values

9 group by

9.1 average,min,max,sum

Code

tbl_query = (mtcars
  >> group_by(_.cyl)
  >> summarize(avg_hp = _.hp.mean()
              ,min_hp=_.hp.min()
              ,max_hp=_.hp.max()
              ,totol_disp=_.disp.sum()
  )
  )

tbl_query

9.2 count record and count distinct record

Code

mtcars >> group_by(_.cyl)  >> summarize(n = _.shape[0])

Code

mtcars >> group_by(_.cyl)  >> summarize(n = _.hp.nunique())

10 order rows

Code

small_mtcars >> arrange(_.hp)

10.1 Sort in descending order

Code

small_mtcars >> arrange(-_.hp)

10.2 Arrange by multiple variables

Code

small_mtcars >> arrange(_.cyl, -_.mpg)

11 join

Code

lhs = pd.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})
rhs = pd.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})

Code

lhs

Code

rhs

11.1 inner_join

Code

result=lhs >> inner_join(_, rhs, on="id")
result

11.2 full join

Code

result=rhs >> full_join(_, lhs, on="id")
result

11.3 left join

Code

result=lhs >> left_join(_, rhs, on="id")
result

11.4 anti join

keep data in left which not in right ::: {.cell}

Code

result=lhs >> anti_join(_, rhs, on="id")
result

:::

keep data in right which not in left ::: {.cell}

Code

result=rhs >> anti_join(_, lhs, on="id")
result

:::

12 Reshape tables

Code

costs = pd.DataFrame({
    'id': [1,2],
    'price_x': [.1, .2],
    'price_y': [.4, .5],
    'price_z': [.7, .8]
})

costs

12.1 Gather data long(wide to long)

Below 3 method will give same result

Code

# selecting each variable manually
costs >> gather('measure', 'value', _.price_x, _.price_y, _.price_z)

other way: ::: {.cell}

Code

# selecting variables using a slice
costs >> gather('measure', 'value', _["price_x":"price_z"])

::: other way: ::: {.cell}

Code

# selecting by excluding id
costs >> gather('measure', 'value', -_.id)

:::

12.2 Spread data wide(long to wide)

Code

costs_long= costs>> gather('measure', 'value', -_.id)
costs_long

Code

costs_long>> spread('measure', 'value')

13 string

Code

df = pd.DataFrame({'text': ['abc', 'DDD','1243c','aeEe'], 'num': [3, 4,7,8]})

df

13.1 upper case

Code

df>> mutate(text_new=_.text.str.upper())

13.2 lower case

Code

df>> mutate(text_new=_.text.str.lower())

13.3 match

Code

df>> mutate(text_new1=if_else(_.text== "abc",'T','F')
            ,text_new2=if_else(_.text.str.startswith("a"),'T','F')
            ,text_new3=if_else(_.text.str.endswith("c"),'T','F')
            ,text_new4=if_else(_.text.str.contains("4"),'T','F')

)

13.4 concatenation

Code

df>> mutate(text_new1=_.text+' is '+_.text
)

13.5 replace

Use .str.replace(…, regex=True) with regular expressions to replace patterns in strings.

For example, the code below uses “p.”, where . is called a wildcard–which matches any character.

Code

df>> mutate(text_new1=_.text.str.replace("a.", "XX", regex=True)
)

13.6 extract

Use str.extract() with a regular expression to pull out a matching piece of text.

For example the regular expression “^(.*) ” contains the following pieces:

a matches the literal letter “a”
.* has a . which matches anything, and * which modifies it to apply 0 or more times.

Code

df>> mutate(text_new1=_.text.str.extract("a(.*)")
            ,text_new2=_.text.str.extract("(.*)c")
)

14 date

Code

df_dates = pd.DataFrame({
    "dates": pd.to_datetime(["2021-01-02", "2021-02-03"]),
    "raw": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"],
})
df_dates

Code

from datetime import datetime

df_date=df_dates>>mutate(month=_.dates.dt.month_name()
                  ,date_format_raw = call(pd.to_datetime, _.raw)
                  ,date_format_raw_year=_.date_format_raw.dt.year

)

df_date

Code

df_date.info()

15 using siuba with database

15.1 set up a sqlite database, with an mtcars table.

Code

from sqlalchemy import create_engine
from siuba.sql import LazyTbl
from siuba import _, group_by, summarize, show_query, collect 
from siuba.data import mtcars

# copy in to sqlite, using the pandas .to_sql() method
engine = create_engine("sqlite:///:memory:")
mtcars.to_sql("mtcars", engine, if_exists = "replace")

15.2 create table

Code

# Create a lazy SQL DataFrame
tbl_mtcars = LazyTbl(engine, "mtcars")
tbl_mtcars

15.3 create query

Code

# connect with siuba

tbl_query = (tbl_mtcars
  >> group_by(_.mpg)
  >> summarize(avg_hp = _.hp.mean())
  )

tbl_query

15.4 show query

Code

 tbl_query >> show_query()

15.5 Collect to DataFrame

because lazy expressions,the collect function is actually running the sql. ::: {.cell}

Code

data=tbl_query >> collect()
print(data)

:::

16 reference:

https://siuba.org/

--- title: "Data manipulation with siuba(not working with current pandas version)" author: "Tony Duan" execute: warning: false error: false eval: false format: html: toc: true toc-location: right code-fold: show code-tools: true number-sections: true code-block-bg: true code-block-border-left: "#31BAE9" --- ![](images/siuba_small.svg){width="434"} siuba (小巴) is a port of dplyr and other R libraries with seamless support for pandas and SQL ## Comparison with different python dataframe package ![](images/2.png){width="656"} Using python 3.11 ```{r} Sys.setenv(RETICULATE_PYTHON = "/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11") library(reticulate) use_python("/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11") ``` ## load package ```{python} import pandas as pd import numpy as np import matplotlib.pylab as plt import seaborn as sns from siuba.siu import call from siuba import _, mutate, filter, group_by, summarize,show_query from siuba import * from siuba.data import mtcars,penguins ``` ```{python} small_mtcars = mtcars >> select(_.cyl, _.mpg, _.hp)>> head(5) ``` ## select column ### get column names ```{python} list(small_mtcars) ``` ### select columns by name ```{python} small_mtcars >> select(_.cyl, _.mpg) ``` ### select columns by name match with 'p' ```{python} small_mtcars >> select(_.contains("p")) ``` ### select columns by index #### select first and 3rd columns ```{python} small_mtcars >> select(0,2) ``` #### select first to 3rd columns ```{python} small_mtcars >> select(_[0:3]) ``` ## drop column ```{python} small_mtcars >> select(~_.cyl) ``` ## Renaming column ```{python} small_mtcars >> rename(new_name_mpg = _.mpg) ``` ## Create column ### Mutate ```{python} mtcars.head()>> mutate(gear2 = _.gear+1 ,gear3=if_else(_.gear > 3, "long", "short") ,qsec2=case_when({ _.qsec <= 17: "short", _.qsec <= 18: "Medium", True: "long" }) ) ``` ### Transmute,create column and only keep this column ```{python} mtcars.head()>> transmute(gear2 = _.gear+1) ``` ## Filter rows ```{python} mtcars>> filter(_.gear ==4) ``` ### Filters with AND conditions ```{python} mtcars >> filter((_.cyl >4) & (_.gear == 5)) ``` ### Filters with OR conditions ```{python} mtcars >> filter((_.cyl == 6) | (_.gear == 5)) ``` ### filter row with index #### first 3 ```{python} small_mtcars>>head(3) ``` #### last 3 ```{python} # not in siuba, in pandas small_mtcars.tail(3) ``` #### 5th rows ```{python} # not in siuba, in pandas mtcars.iloc[[4]] ``` #### 1 and 5th rows ```{python} # not in siuba, in pandas mtcars.iloc[[0,4]] ``` #### 1 to 5th rows ```{python} # not in siuba, in pandas mtcars.iloc[0:4] ``` #### get ramdon 5 rows ```{python} mtcars.sample(5, random_state=42) ``` ## Append ### append by row ```{python} # not available in siuba yet #from siuba import bind_rows ``` ```{python} # using pandas # get 1 to 4 rows data1=mtcars.iloc[0:4] # get 9 rows data2=mtcars.iloc[10:11] data3=pd.concat([data1, data2], ignore_index = True,axis=0) data3 ``` ### append by column ```{python} # not available in siuba yet #from siuba import bind_columns ``` ```{python} # using pandas data1=small_mtcars>>select(_.mpg) data2=small_mtcars>>select(_.cyl) data3=pd.concat([data1, data2],axis=1) data3 ``` ### Dropping NA values ### keep NA values ## group by ### average,min,max,sum ```{python} tbl_query = (mtcars >> group_by(_.cyl) >> summarize(avg_hp = _.hp.mean() ,min_hp=_.hp.min() ,max_hp=_.hp.max() ,totol_disp=_.disp.sum() ) ) tbl_query ``` ### count record and count distinct record ```{python} mtcars >> group_by(_.cyl) >> summarize(n = _.shape[0]) ``` ```{python} mtcars >> group_by(_.cyl) >> summarize(n = _.hp.nunique()) ``` ## order rows ```{python} small_mtcars >> arrange(_.hp) ``` ### Sort in descending order ```{python} small_mtcars >> arrange(-_.hp) ``` ### Arrange by multiple variables ```{python} small_mtcars >> arrange(_.cyl, -_.mpg) ``` ## join ```{python} lhs = pd.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']}) rhs = pd.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']}) ``` ```{python} lhs ``` ```{python} rhs ``` ### inner_join ```{python} result=lhs >> inner_join(_, rhs, on="id") result ``` ### full join ```{python} result=rhs >> full_join(_, lhs, on="id") result ``` ### left join ```{python} result=lhs >> left_join(_, rhs, on="id") result ``` ### anti join keep data in left which not in right ```{python} result=lhs >> anti_join(_, rhs, on="id") result ``` keep data in right which not in left ```{python} result=rhs >> anti_join(_, lhs, on="id") result ``` ## Reshape tables ```{python} costs = pd.DataFrame({ 'id': [1,2], 'price_x': [.1, .2], 'price_y': [.4, .5], 'price_z': [.7, .8] }) costs ``` ### Gather data long(wide to long) Below 3 method will give same result ```{python} # selecting each variable manually costs >> gather('measure', 'value', _.price_x, _.price_y, _.price_z) ``` other way: ```{python} # selecting variables using a slice costs >> gather('measure', 'value', _["price_x":"price_z"]) ``` other way: ```{python} # selecting by excluding id costs >> gather('measure', 'value', -_.id) ``` ### Spread data wide(long to wide) ```{python} costs_long= costs>> gather('measure', 'value', -_.id) costs_long ``` ```{python} costs_long>> spread('measure', 'value') ``` ## string ```{python} df = pd.DataFrame({'text': ['abc', 'DDD','1243c','aeEe'], 'num': [3, 4,7,8]}) df ``` ### upper case ```{python} df>> mutate(text_new=_.text.str.upper()) ``` ### lower case ```{python} df>> mutate(text_new=_.text.str.lower()) ``` ### match ```{python} df>> mutate(text_new1=if_else(_.text== "abc",'T','F') ,text_new2=if_else(_.text.str.startswith("a"),'T','F') ,text_new3=if_else(_.text.str.endswith("c"),'T','F') ,text_new4=if_else(_.text.str.contains("4"),'T','F') ) ``` ### concatenation ```{python} df>> mutate(text_new1=_.text+' is '+_.text ) ``` ### replace Use .str.replace(..., regex=True) with regular expressions to replace patterns in strings. For example, the code below uses "p.", where . is called a wildcard–which matches any character. ```{python} df>> mutate(text_new1=_.text.str.replace("a.", "XX", regex=True) ) ``` ### extract Use str.extract() with a regular expression to pull out a matching piece of text. For example the regular expression “^(.*) ” contains the following pieces: - a matches the literal letter “a” - .* has a . which matches anything, and * which modifies it to apply 0 or more times. ```{python} df>> mutate(text_new1=_.text.str.extract("a(.*)") ,text_new2=_.text.str.extract("(.*)c") ) ``` ## date ```{python} df_dates = pd.DataFrame({ "dates": pd.to_datetime(["2021-01-02", "2021-02-03"]), "raw": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"], }) df_dates ``` ```{python} from datetime import datetime df_date=df_dates>>mutate(month=_.dates.dt.month_name() ,date_format_raw = call(pd.to_datetime, _.raw) ,date_format_raw_year=_.date_format_raw.dt.year ) df_date ``` ```{python} df_date.info() ``` ## using siuba with database ### set up a sqlite database, with an mtcars table. ```{python} from sqlalchemy import create_engine from siuba.sql import LazyTbl from siuba import _, group_by, summarize, show_query, collect from siuba.data import mtcars # copy in to sqlite, using the pandas .to_sql() method engine = create_engine("sqlite:///:memory:") mtcars.to_sql("mtcars", engine, if_exists = "replace") ``` ### create table ```{python} # Create a lazy SQL DataFrame tbl_mtcars = LazyTbl(engine, "mtcars") tbl_mtcars ``` ### create query ```{python} # connect with siuba tbl_query = (tbl_mtcars >> group_by(_.mpg) >> summarize(avg_hp = _.hp.mean()) ) tbl_query ``` ### show query ```{python} tbl_query >> show_query() ``` ### Collect to DataFrame because lazy expressions,the collect function is actually running the sql. ```{python} data=tbl_query >> collect() print(data) ``` ## reference: https://siuba.org/