# Import seaborn for loading sample datasetsimport seaborn as sns# Import pandas for data manipulationimport pandas as pd# Import sqlite3 for SQLite database interactionimport sqlite3# Load the 'tips' dataset from seaborntips = sns.load_dataset("tips")# Load the 'planets' dataset from seabornplanets=sns.load_dataset('planets')
1.1 create database file pythonsqlite.db and copy tips data and planets data into database
Code
# Import the sqlite3 moduleimport sqlite3# Create a connection to the SQLite database file 'pythonsqlite.db'db = sqlite3.connect('pythonsqlite.db')# Write the 'tips' DataFrame to a SQL table named 'tips', replacing it if it already existstips.to_sql(name='tips', con=db,if_exists='replace')# Write the 'planets' DataFrame to a SQL table named 'planets', replacing it if it already existsplanets.to_sql(name='planets', con=db,if_exists='replace')# Close the database connectiondb.close()
1.2 check all table in database
Code
# Connect to the SQLite database file 'pythonsqlite.db'db = sqlite3.connect('pythonsqlite.db')# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrametable = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)# Display the DataFrametable
name
0
tips
1
planets
2 SQL
2.1 select
Code
# Connect to the SQLite databasedb = sqlite3.connect('pythonsqlite.db')# Define an SQL query to select all columns from the 'tips' table, limiting to 3 rowssql="select * from tips LIMIT 3;"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the DataFrametable
index
total_bill
tip
sex
smoker
day
time
size
0
0
16.99
1.01
Female
No
Sun
Dinner
2
1
1
10.34
1.66
Male
No
Sun
Dinner
3
2
2
21.01
3.50
Male
No
Sun
Dinner
3
2.2 Renaming column
Code
# Define an SQL query to select 'total_bill' and alias it as 'new_total_bill'sql="select total_bill as new_total_bill from tips"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()
new_total_bill
0
16.99
1
10.34
2
21.01
3
23.68
4
24.59
2.3 create column
Code
# Define an SQL query to select 'total_bill' as 'new_total_bill' and also the original 'total_bill'sql="select total_bill as new_total_bill,total_bill from tips"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()
new_total_bill
total_bill
0
16.99
16.99
1
10.34
10.34
2
21.01
21.01
3
23.68
23.68
4
24.59
24.59
2.4 Filter rows
Code
# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male'sql="select * from tips where sex='Male'"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()
index
total_bill
tip
sex
smoker
day
time
size
0
1
10.34
1.66
Male
No
Sun
Dinner
3
1
2
21.01
3.50
Male
No
Sun
Dinner
3
2
3
23.68
3.31
Male
No
Sun
Dinner
2
3
5
25.29
4.71
Male
No
Sun
Dinner
4
4
6
8.77
2.00
Male
No
Sun
Dinner
2
2.4.1 Filters with AND conditions
Code
# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' AND 'size' is greater than 3sql="select * from tips where sex='Male' and size>3"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()
index
total_bill
tip
sex
smoker
day
time
size
0
5
25.29
4.71
Male
No
Sun
Dinner
4
1
7
26.88
3.12
Male
No
Sun
Dinner
4
2
13
18.43
3.00
Male
No
Sun
Dinner
4
3
23
39.42
7.58
Male
No
Sat
Dinner
4
4
25
17.81
2.34
Male
No
Sat
Dinner
4
2.4.2 Filters with or conditions
Code
# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' OR 'size' is greater than 3sql="select * from tips where sex='Male' or size>3"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the shape of the DataFrame (number of rows, number of columns)table.shape
(169, 8)
2.5 Append
2.5.1 append by row
Code
# Define an SQL query to union all rows from 'tips' with itselfsql="select * from tips UNION all select * from tips"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the shape of the DataFrametable.shape
(488, 8)
2.5.2 append by column
2.5.3 Dropping NA values
2.5.4 keep NA values
2.6 group by
2.6.1 average,min,max,sum
Code
# Define an SQL query to calculate the average, minimum, maximum, and sum of 'total_bill' grouped by 'sex'sql="select AVG(total_bill),min(total_bill),max(total_bill),sum(total_bill) from tips group by sex"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the DataFrametable
AVG(total_bill)
min(total_bill)
max(total_bill)
sum(total_bill)
0
18.056897
3.07
44.30
1570.95
1
20.744076
7.25
50.81
3256.82
2.6.2 count
Code
# Define an SQL query to count records grouped by 'sex'sql="select sex,count(*) from tips group by 1"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the DataFrametable
sex
count(*)
0
Female
87
1
Male
157
2.7 order rows
Code
# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in ascending ordersql="select * from tips order by total_bill"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()
index
total_bill
tip
sex
smoker
day
time
size
0
67
3.07
1.00
Female
Yes
Sat
Dinner
1
1
92
5.75
1.00
Female
Yes
Fri
Dinner
2
2
111
7.25
1.00
Female
No
Sat
Dinner
1
3
172
7.25
5.15
Male
Yes
Sun
Dinner
2
4
149
7.51
2.00
Male
No
Thur
Lunch
2
2.7.1 Sort in descending order
Code
# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in descending ordersql="select * from tips order by total_bill desc"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()
index
total_bill
tip
sex
smoker
day
time
size
0
170
50.81
10.00
Male
Yes
Sat
Dinner
3
1
212
48.33
9.00
Male
No
Sat
Dinner
4
2
59
48.27
6.73
Male
No
Sat
Dinner
4
3
156
48.17
5.00
Male
No
Sun
Dinner
6
4
182
45.35
3.50
Male
Yes
Sun
Dinner
3
2.7.2 Arrange by multiple variables
Code
# Define an SQL query to select all columns from 'tips' and order by 'total_bill' then by 'tip'sql="select * from tips order by total_bill,tip"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()
index
total_bill
tip
sex
smoker
day
time
size
0
67
3.07
1.00
Female
Yes
Sat
Dinner
1
1
92
5.75
1.00
Female
Yes
Fri
Dinner
2
2
111
7.25
1.00
Female
No
Sat
Dinner
1
3
172
7.25
5.15
Male
Yes
Sun
Dinner
2
4
149
7.51
2.00
Male
No
Thur
Lunch
2
2.8 join
Joining tables is a fundamental operation in SQL, allowing you to combine rows from two or more tables based on a related column between them. Here are common types of joins:
2.8.1 inner_join
An INNER JOIN returns only the rows that have matching values in both tables.
A FULL OUTER JOIN returns all rows when there is a match in one of the tables. If there are no matches, it returns NULL values for columns from the table that doesn’t have a match.
# Perform a FULL OUTER JOIN using pandas mergefull_join_result = pd.merge(lhs_df, rhs_df, on='id', how='outer')print("\nFull Outer Join Result:")print(full_join_result)
2.8.3 left join
A LEFT JOIN (or LEFT OUTER JOIN) returns all rows from the left table, and the matching rows from the right table. If there is no match from the right table, NULL values are used.
# Perform a LEFT JOIN using pandas mergeleft_join_result = pd.merge(lhs_df, rhs_df, on='id', how='left')print("\nLeft Join Result:")print(left_join_result)
2.8.4 anti join
An ANTI JOIN returns rows from the left table that have no matching rows in the right table.
# Perform an ANTI JOIN (rows in lhs_df not in rhs_df)anti_join_result = lhs_df[~lhs_df['id'].isin(rhs_df['id'])]print("\nAnti Join Result (lhs_df not in rhs_df):")print(anti_join_result)
2.9 Reshape tables
2.9.1 Gather data long(wide to long)
2.9.2 Spread data wide (long to wide)
2.10 string
2.10.1 upper case
2.10.2 lower case
2.10.3 match
2.10.4 concatenation
2.10.5 replace
2.10.6 extract
2.11 date
2.12 create table into database
Code
# Define an SQL query to create a new table named 'new_tips' if it doesn't already exist, copying data from 'tips'sql="create table if not exists new_tips as select * from tips"# Execute the SQL querydb.execute(sql)# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrametable = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)# Display the DataFrametable
name
0
tips
1
planets
2
new_tips
2.13 delete table in database
Code
# Define an SQL query to drop the table named 'new_tips' if it existssql="drop table if exists new_tips"# Execute the SQL querydb.execute(sql)# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrametable = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)# Display the DataFrametable
name
0
tips
1
planets
2.14 edit table in database
Code
# Close the database connectiondb.close()
3 Using SQL with Pandas
Code
# Import the sqldf function from the pandasql libraryfrom pandasql import sqldf
Code
# Define an SQL query to select the first 5 rows from the 'tips' tablesql="SELECT * FROM tips LIMIT 5;"# Execute the SQL query on the 'tips' DataFrame and store the result in 'data'data=sqldf(sql)# Display the resulting DataFramedata
---title: "SQL database with python"author: "Tony Duan"execute: warning: false error: falseformat: html: toc: true toc-location: right code-fold: show code-tools: true number-sections: true code-block-bg: true code-block-border-left: "#31BAE9"---{width="600"}# Connection with database```{python}# Import seaborn for loading sample datasetsimport seaborn as sns# Import pandas for data manipulationimport pandas as pd# Import sqlite3 for SQLite database interactionimport sqlite3# Load the 'tips' dataset from seaborntips = sns.load_dataset("tips")# Load the 'planets' dataset from seabornplanets=sns.load_dataset('planets')```## create database file pythonsqlite.db and copy tips data and planets data into database```{python}#| eval: false# Import the sqlite3 moduleimport sqlite3# Create a connection to the SQLite database file 'pythonsqlite.db'db = sqlite3.connect('pythonsqlite.db')# Write the 'tips' DataFrame to a SQL table named 'tips', replacing it if it already existstips.to_sql(name='tips', con=db,if_exists='replace')# Write the 'planets' DataFrame to a SQL table named 'planets', replacing it if it already existsplanets.to_sql(name='planets', con=db,if_exists='replace')# Close the database connectiondb.close()```## check all table in database```{python}# Connect to the SQLite database file 'pythonsqlite.db'db = sqlite3.connect('pythonsqlite.db')# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrametable = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)# Display the DataFrametable```# SQL## select```{python}# Connect to the SQLite databasedb = sqlite3.connect('pythonsqlite.db')# Define an SQL query to select all columns from the 'tips' table, limiting to 3 rowssql="select * from tips LIMIT 3;"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the DataFrametable```## Renaming column```{python}# Define an SQL query to select 'total_bill' and alias it as 'new_total_bill'sql="select total_bill as new_total_bill from tips"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()```## create column```{python}# Define an SQL query to select 'total_bill' as 'new_total_bill' and also the original 'total_bill'sql="select total_bill as new_total_bill,total_bill from tips"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()```## Filter rows```{python}# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male'sql="select * from tips where sex='Male'"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()```### Filters with AND conditions```{python}# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' AND 'size' is greater than 3sql="select * from tips where sex='Male' and size>3"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()```### Filters with or conditions```{python}# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' OR 'size' is greater than 3sql="select * from tips where sex='Male' or size>3"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the shape of the DataFrame (number of rows, number of columns)table.shape```## Append### append by row```{python}# Define an SQL query to union all rows from 'tips' with itselfsql="select * from tips UNION all select * from tips"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the shape of the DataFrametable.shape```### append by column### Dropping NA values### keep NA values## group by### average,min,max,sum```{python}# Define an SQL query to calculate the average, minimum, maximum, and sum of 'total_bill' grouped by 'sex'sql="select AVG(total_bill),min(total_bill),max(total_bill),sum(total_bill) from tips group by sex"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the DataFrametable```### count```{python}# Define an SQL query to count records grouped by 'sex'sql="select sex,count(*) from tips group by 1"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the DataFrametable```## order rows```{python}# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in ascending ordersql="select * from tips order by total_bill"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()```### Sort in descending order```{python}# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in descending ordersql="select * from tips order by total_bill desc"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()```### Arrange by multiple variables```{python}# Define an SQL query to select all columns from 'tips' and order by 'total_bill' then by 'tip'sql="select * from tips order by total_bill,tip"# Execute the SQL query and read the result into a pandas DataFrametable = pd.read_sql_query(sql,db)# Display the first 5 rows of the DataFrametable.head()```## joinJoining tables is a fundamental operation in SQL, allowing you to combine rows from two or more tables based on a related column between them. Here are common types of joins:### inner_joinAn INNER JOIN returns only the rows that have matching values in both tables.```python# Create sample DataFrames for demonstrationlhs_df = pd.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})rhs_df = pd.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})# Perform an INNER JOIN using pandas mergeinner_join_result = pd.merge(lhs_df, rhs_df, on='id', how='inner')print("Inner Join Result:")print(inner_join_result)```### full joinA FULL OUTER JOIN returns all rows when there is a match in one of the tables. If there are no matches, it returns NULL values for columns from the table that doesn't have a match.```python# Perform a FULL OUTER JOIN using pandas mergefull_join_result = pd.merge(lhs_df, rhs_df, on='id', how='outer')print("\nFull Outer Join Result:")print(full_join_result)```### left joinA LEFT JOIN (or LEFT OUTER JOIN) returns all rows from the left table, and the matching rows from the right table. If there is no match from the right table, NULL values are used.```python# Perform a LEFT JOIN using pandas mergeleft_join_result = pd.merge(lhs_df, rhs_df, on='id', how='left')print("\nLeft Join Result:")print(left_join_result)```### anti joinAn ANTI JOIN returns rows from the left table that have no matching rows in the right table.```python# Perform an ANTI JOIN (rows in lhs_df not in rhs_df)anti_join_result = lhs_df[~lhs_df['id'].isin(rhs_df['id'])]print("\nAnti Join Result (lhs_df not in rhs_df):")print(anti_join_result)```## Reshape tables### Gather data long(wide to long)### Spread data wide (long to wide)## string### upper case### lower case### match### concatenation### replace### extract## date## create table into database```{python}# Define an SQL query to create a new table named 'new_tips' if it doesn't already exist, copying data from 'tips'sql="create table if not exists new_tips as select * from tips"# Execute the SQL querydb.execute(sql)# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrametable = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)# Display the DataFrametable```## delete table in database```{python}# Define an SQL query to drop the table named 'new_tips' if it existssql="drop table if exists new_tips"# Execute the SQL querydb.execute(sql)# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrametable = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)# Display the DataFrametable```## edit table in database```{python}# Close the database connectiondb.close()```# Using SQL with Pandas```{python}# Import the sqldf function from the pandasql libraryfrom pandasql import sqldf``````{python}# Define an SQL query to select the first 5 rows from the 'tips' tablesql="SELECT * FROM tips LIMIT 5;"# Execute the SQL query on the 'tips' DataFrame and store the result in 'data'data=sqldf(sql)# Display the resulting DataFramedata```# reference: