SQL database with python

Author

Tony Duan

1 Connection with database

Code
# Import seaborn for loading sample datasets
import seaborn as sns
# Import pandas for data manipulation
import pandas as pd
# Import sqlite3 for SQLite database interaction
import sqlite3

# Load the 'tips' dataset from seaborn
tips = sns.load_dataset("tips")
# Load the 'planets' dataset from seaborn
planets=sns.load_dataset('planets')

1.1 create database file pythonsqlite.db and copy tips data and planets data into database

Code
# Import the sqlite3 module
import sqlite3
# Create a connection to the SQLite database file 'pythonsqlite.db'
db = sqlite3.connect('pythonsqlite.db')

# Write the 'tips' DataFrame to a SQL table named 'tips', replacing it if it already exists
tips.to_sql(name='tips', con=db,if_exists='replace')
# Write the 'planets' DataFrame to a SQL table named 'planets', replacing it if it already exists
planets.to_sql(name='planets', con=db,if_exists='replace')

# Close the database connection
db.close()

1.2 check all table in database

Code
# Connect to the SQLite database file 'pythonsqlite.db'
db = sqlite3.connect('pythonsqlite.db')
# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrame
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)
# Display the DataFrame
table
name
0 tips
1 planets

2 SQL

2.1 select

Code
# Connect to the SQLite database
db = sqlite3.connect('pythonsqlite.db')
# Define an SQL query to select all columns from the 'tips' table, limiting to 3 rows
sql="select * from tips LIMIT 3;"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the DataFrame
table
index total_bill tip sex smoker day time size
0 0 16.99 1.01 Female No Sun Dinner 2
1 1 10.34 1.66 Male No Sun Dinner 3
2 2 21.01 3.50 Male No Sun Dinner 3

2.2 Renaming column

Code
# Define an SQL query to select 'total_bill' and alias it as 'new_total_bill'
sql="select total_bill as new_total_bill from tips"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()
new_total_bill
0 16.99
1 10.34
2 21.01
3 23.68
4 24.59

2.3 create column

Code
# Define an SQL query to select 'total_bill' as 'new_total_bill' and also the original 'total_bill'
sql="select total_bill as new_total_bill,total_bill from tips"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()
new_total_bill total_bill
0 16.99 16.99
1 10.34 10.34
2 21.01 21.01
3 23.68 23.68
4 24.59 24.59

2.4 Filter rows

Code
# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male'
sql="select * from tips where sex='Male'"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()
index total_bill tip sex smoker day time size
0 1 10.34 1.66 Male No Sun Dinner 3
1 2 21.01 3.50 Male No Sun Dinner 3
2 3 23.68 3.31 Male No Sun Dinner 2
3 5 25.29 4.71 Male No Sun Dinner 4
4 6 8.77 2.00 Male No Sun Dinner 2

2.4.1 Filters with AND conditions

Code
# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' AND 'size' is greater than 3
sql="select * from tips where sex='Male' and size>3"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()
index total_bill tip sex smoker day time size
0 5 25.29 4.71 Male No Sun Dinner 4
1 7 26.88 3.12 Male No Sun Dinner 4
2 13 18.43 3.00 Male No Sun Dinner 4
3 23 39.42 7.58 Male No Sat Dinner 4
4 25 17.81 2.34 Male No Sat Dinner 4

2.4.2 Filters with or conditions

Code
# Define an SQL query to select all columns from 'tips' where 'sex' is 'Male' OR 'size' is greater than 3
sql="select * from tips where sex='Male' or size>3"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the shape of the DataFrame (number of rows, number of columns)
table.shape
(169, 8)

2.5 Append

2.5.1 append by row

Code
# Define an SQL query to union all rows from 'tips' with itself
sql="select * from tips UNION all select * from tips"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the shape of the DataFrame
table.shape
(488, 8)

2.5.2 append by column

2.5.3 Dropping NA values

2.5.4 keep NA values

2.6 group by

2.6.1 average,min,max,sum

Code
# Define an SQL query to calculate the average, minimum, maximum, and sum of 'total_bill' grouped by 'sex'
sql="select AVG(total_bill),min(total_bill),max(total_bill),sum(total_bill) from tips group by sex"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the DataFrame
table
AVG(total_bill) min(total_bill) max(total_bill) sum(total_bill)
0 18.056897 3.07 44.30 1570.95
1 20.744076 7.25 50.81 3256.82

2.6.2 count

Code
# Define an SQL query to count records grouped by 'sex'
sql="select sex,count(*) from tips group by 1"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the DataFrame
table
sex count(*)
0 Female 87
1 Male 157

2.7 order rows

Code
# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in ascending order
sql="select * from tips order by total_bill"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()
index total_bill tip sex smoker day time size
0 67 3.07 1.00 Female Yes Sat Dinner 1
1 92 5.75 1.00 Female Yes Fri Dinner 2
2 111 7.25 1.00 Female No Sat Dinner 1
3 172 7.25 5.15 Male Yes Sun Dinner 2
4 149 7.51 2.00 Male No Thur Lunch 2

2.7.1 Sort in descending order

Code
# Define an SQL query to select all columns from 'tips' and order by 'total_bill' in descending order
sql="select * from tips order by total_bill desc"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()
index total_bill tip sex smoker day time size
0 170 50.81 10.00 Male Yes Sat Dinner 3
1 212 48.33 9.00 Male No Sat Dinner 4
2 59 48.27 6.73 Male No Sat Dinner 4
3 156 48.17 5.00 Male No Sun Dinner 6
4 182 45.35 3.50 Male Yes Sun Dinner 3

2.7.2 Arrange by multiple variables

Code
# Define an SQL query to select all columns from 'tips' and order by 'total_bill' then by 'tip'
sql="select * from tips order by total_bill,tip"
# Execute the SQL query and read the result into a pandas DataFrame
table = pd.read_sql_query(sql,db)
# Display the first 5 rows of the DataFrame
table.head()
index total_bill tip sex smoker day time size
0 67 3.07 1.00 Female Yes Sat Dinner 1
1 92 5.75 1.00 Female Yes Fri Dinner 2
2 111 7.25 1.00 Female No Sat Dinner 1
3 172 7.25 5.15 Male Yes Sun Dinner 2
4 149 7.51 2.00 Male No Thur Lunch 2

2.8 join

Joining tables is a fundamental operation in SQL, allowing you to combine rows from two or more tables based on a related column between them. Here are common types of joins:

2.8.1 inner_join

An INNER JOIN returns only the rows that have matching values in both tables.

# Create sample DataFrames for demonstration
lhs_df = pd.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})
rhs_df = pd.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})

# Perform an INNER JOIN using pandas merge
inner_join_result = pd.merge(lhs_df, rhs_df, on='id', how='inner')
print("Inner Join Result:")
print(inner_join_result)

2.8.2 full join

A FULL OUTER JOIN returns all rows when there is a match in one of the tables. If there are no matches, it returns NULL values for columns from the table that doesn’t have a match.

# Perform a FULL OUTER JOIN using pandas merge
full_join_result = pd.merge(lhs_df, rhs_df, on='id', how='outer')
print("\nFull Outer Join Result:")
print(full_join_result)

2.8.3 left join

A LEFT JOIN (or LEFT OUTER JOIN) returns all rows from the left table, and the matching rows from the right table. If there is no match from the right table, NULL values are used.

# Perform a LEFT JOIN using pandas merge
left_join_result = pd.merge(lhs_df, rhs_df, on='id', how='left')
print("\nLeft Join Result:")
print(left_join_result)

2.8.4 anti join

An ANTI JOIN returns rows from the left table that have no matching rows in the right table.

# Perform an ANTI JOIN (rows in lhs_df not in rhs_df)
anti_join_result = lhs_df[~lhs_df['id'].isin(rhs_df['id'])]
print("\nAnti Join Result (lhs_df not in rhs_df):")
print(anti_join_result)

2.9 Reshape tables

2.9.1 Gather data long(wide to long)

2.9.2 Spread data wide (long to wide)

2.10 string

2.10.1 upper case

2.10.2 lower case

2.10.3 match

2.10.4 concatenation

2.10.5 replace

2.10.6 extract

2.11 date

2.12 create table into database

Code
# Define an SQL query to create a new table named 'new_tips' if it doesn't already exist, copying data from 'tips'
sql="create table if not exists new_tips as select * from tips"
# Execute the SQL query
db.execute(sql)

# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrame
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)
# Display the DataFrame
table
name
0 tips
1 planets
2 new_tips

2.13 delete table in database

Code
# Define an SQL query to drop the table named 'new_tips' if it exists
sql="drop table if  exists new_tips"
# Execute the SQL query
db.execute(sql)

# Execute an SQL query to select table names from sqlite_master and read into a pandas DataFrame
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", db)
# Display the DataFrame
table
name
0 tips
1 planets

2.14 edit table in database

Code
# Close the database connection
db.close()

3 Using SQL with Pandas

Code
# Import the sqldf function from the pandasql library
from pandasql import sqldf
Code
# Define an SQL query to select the first 5 rows from the 'tips' table
sql="SELECT * FROM tips LIMIT 5;"
# Execute the SQL query on the 'tips' DataFrame and store the result in 'data'
data=sqldf(sql)
# Display the resulting DataFrame
data
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

4 reference:

Back to top