Data manipulation with siuba(not working with current pandas version)

Author

Tony Duan

siuba (小巴) is a port of dplyr and other R libraries with seamless support for pandas and SQL

1 Comparison with different python dataframe package

2 load package

Code
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns



from siuba.siu import call
from siuba import _, mutate, filter, group_by, summarize,show_query
from siuba import *

from siuba.data import mtcars,penguins
Code
small_mtcars = mtcars >> select(_.cyl, _.mpg, _.hp)>> head(5)

3 select column

3.1 get column names

Code
list(small_mtcars)

3.2 select columns by name

Code
small_mtcars >> select(_.cyl, _.mpg)

3.3 select columns by name match with ‘p’

Code
small_mtcars >> select(_.contains("p"))

3.4 select columns by index

3.4.1 select first and 3rd columns

Code
small_mtcars >> select(0,2)

3.4.2 select first to 3rd columns

Code
small_mtcars >> select(_[0:3])

4 drop column

Code
small_mtcars >> select(~_.cyl)

5 Renaming column

Code
small_mtcars >> rename(new_name_mpg = _.mpg)

6 Create column

6.1 Mutate

Code
mtcars.head()>> mutate(gear2 = _.gear+1
                      ,gear3=if_else(_.gear > 3, "long", "short")
                       ,qsec2=case_when({
                                          _.qsec <= 17: "short",
                                          _.qsec <= 18: "Medium",
                                          True: "long"
                                                     })
                       )

6.2 Transmute,create column and only keep this column

Code
mtcars.head()>> transmute(gear2 = _.gear+1)

7 Filter rows

Code
mtcars>> filter(_.gear ==4)

7.1 Filters with AND conditions

Code
mtcars >> filter((_.cyl >4) & (_.gear == 5))

7.2 Filters with OR conditions

Code
mtcars >> filter((_.cyl == 6) | (_.gear == 5))

7.3 filter row with index

7.3.1 first 3

Code
small_mtcars>>head(3)

7.3.2 last 3

Code
# not in siuba, in pandas
small_mtcars.tail(3)

7.3.3 5th rows

Code
# not in siuba, in pandas
mtcars.iloc[[4]]

7.3.4 1 and 5th rows

Code
# not in siuba, in pandas
mtcars.iloc[[0,4]]

7.3.5 1 to 5th rows

Code
# not in siuba, in pandas
mtcars.iloc[0:4]

7.3.6 get ramdon 5 rows

Code
mtcars.sample(5, random_state=42)

8 Append

8.1 append by row

Code
# not available in siuba yet
#from siuba import bind_rows
Code
# using pandas

# get 1 to 4 rows
data1=mtcars.iloc[0:4]

# get 9 rows
data2=mtcars.iloc[10:11]

data3=pd.concat([data1, data2], ignore_index = True,axis=0)

data3

8.2 append by column

Code
# not available in siuba yet
#from siuba import bind_columns
Code
# using pandas
data1=small_mtcars>>select(_.mpg)

data2=small_mtcars>>select(_.cyl)

data3=pd.concat([data1, data2],axis=1)

data3

8.3 Dropping NA values

8.4 keep NA values

9 group by

9.1 average,min,max,sum

Code
tbl_query = (mtcars
  >> group_by(_.cyl)
  >> summarize(avg_hp = _.hp.mean()
              ,min_hp=_.hp.min()
              ,max_hp=_.hp.max()
              ,totol_disp=_.disp.sum()
  )
  )

tbl_query

9.2 count record and count distinct record

Code
mtcars >> group_by(_.cyl)  >> summarize(n = _.shape[0])
Code
mtcars >> group_by(_.cyl)  >> summarize(n = _.hp.nunique())

10 order rows

Code
small_mtcars >> arrange(_.hp)

10.1 Sort in descending order

Code
small_mtcars >> arrange(-_.hp)

10.2 Arrange by multiple variables

Code
small_mtcars >> arrange(_.cyl, -_.mpg)

11 join

Code
lhs = pd.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})
rhs = pd.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})
Code
lhs
Code
rhs

11.1 inner_join

Code
result=lhs >> inner_join(_, rhs, on="id")
result

11.2 full join

Code
result=rhs >> full_join(_, lhs, on="id")
result

11.3 left join

Code
result=lhs >> left_join(_, rhs, on="id")
result

11.4 anti join

keep data in left which not in right

Code
result=lhs >> anti_join(_, rhs, on="id")
result

keep data in right which not in left

Code
result=rhs >> anti_join(_, lhs, on="id")
result

12 Reshape tables

Code
costs = pd.DataFrame({
    'id': [1,2],
    'price_x': [.1, .2],
    'price_y': [.4, .5],
    'price_z': [.7, .8]
})

costs

12.1 Gather data long(wide to long)

Below 3 method will give same result

Code
# selecting each variable manually
costs >> gather('measure', 'value', _.price_x, _.price_y, _.price_z)

other way:

Code
# selecting variables using a slice
costs >> gather('measure', 'value', _["price_x":"price_z"])

other way:

Code
# selecting by excluding id
costs >> gather('measure', 'value', -_.id)

12.2 Spread data wide(long to wide)

Code
costs_long= costs>> gather('measure', 'value', -_.id)
costs_long
Code
costs_long>> spread('measure', 'value')

13 string

Code
df = pd.DataFrame({'text': ['abc', 'DDD','1243c','aeEe'], 'num': [3, 4,7,8]})

df

13.1 upper case

Code
df>> mutate(text_new=_.text.str.upper())

13.2 lower case

Code
df>> mutate(text_new=_.text.str.lower())

13.3 match

Code
df>> mutate(text_new1=if_else(_.text== "abc",'T','F')
            ,text_new2=if_else(_.text.str.startswith("a"),'T','F')
            ,text_new3=if_else(_.text.str.endswith("c"),'T','F')
            ,text_new4=if_else(_.text.str.contains("4"),'T','F')

)

13.4 concatenation

Code
df>> mutate(text_new1=_.text+' is '+_.text
)

13.5 replace

Use .str.replace(…, regex=True) with regular expressions to replace patterns in strings.

For example, the code below uses “p.”, where . is called a wildcard–which matches any character.

Code
df>> mutate(text_new1=_.text.str.replace("a.", "XX", regex=True)
)

13.6 extract

Use str.extract() with a regular expression to pull out a matching piece of text.

For example the regular expression “^(.*) ” contains the following pieces:

  • a matches the literal letter “a”

  • .* has a . which matches anything, and * which modifies it to apply 0 or more times.

Code
df>> mutate(text_new1=_.text.str.extract("a(.*)")
            ,text_new2=_.text.str.extract("(.*)c")
)

14 date

Code
df_dates = pd.DataFrame({
    "dates": pd.to_datetime(["2021-01-02", "2021-02-03"]),
    "raw": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"],
})
df_dates
Code
from datetime import datetime

df_date=df_dates>>mutate(month=_.dates.dt.month_name()
                  ,date_format_raw = call(pd.to_datetime, _.raw)
                  ,date_format_raw_year=_.date_format_raw.dt.year

)

df_date
Code
df_date.info()

15 using siuba with database

15.1 set up a sqlite database, with an mtcars table.

Code
from sqlalchemy import create_engine
from siuba.sql import LazyTbl
from siuba import _, group_by, summarize, show_query, collect 
from siuba.data import mtcars

# copy in to sqlite, using the pandas .to_sql() method
engine = create_engine("sqlite:///:memory:")
mtcars.to_sql("mtcars", engine, if_exists = "replace")

15.2 create table

Code
# Create a lazy SQL DataFrame
tbl_mtcars = LazyTbl(engine, "mtcars")
tbl_mtcars

15.3 create query

Code
# connect with siuba

tbl_query = (tbl_mtcars
  >> group_by(_.mpg)
  >> summarize(avg_hp = _.hp.mean())
  )

tbl_query

15.4 show query

Code
 tbl_query >> show_query()

15.5 Collect to DataFrame

because lazy expressions,the collect function is actually running the sql.

Code
data=tbl_query >> collect()
print(data)

16 reference:

https://siuba.org/

Back to top