# Report the interpreter version so readers can reproduce this environment.
from platform import python_version

print(python_version())
3.11.11
pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.
Show package’s dependencies
Code
# Show the declared dependencies of the pandas package.
#
# NOTE(review): the original reached into pip's private vendored copy of
# pkg_resources (`pip._vendor`), which is an internal, deprecated API and can
# break on any pip upgrade. importlib.metadata is the supported stdlib
# replacement (Python 3.8+).
from importlib.metadata import requires

# Define the package name.
_package_name = 'pandas'

# requires() returns the raw requirement strings from the installed package
# metadata (it may return None for packages that declare no dependencies).
print([str(r) for r in (requires(_package_name) or [])])
# Import the pandas library, commonly aliased as pdimport pandas as pd# Import sqldf from pandasql for SQL-like queries on pandas DataFramesfrom pandasql import sqldf# Import numpy for numerical operations, commonly aliased as npimport numpy as np# Import matplotlib.pylab for plotting, commonly aliased as pltimport matplotlib.pylab as plt# Import seaborn for statistical data visualizationimport seaborn as sns
Code
# Materialize the built-in R dataset `mtcars` as an object in the R session
# so reticulate can expose it to Python as `r.mtcars`.
mtcars=mtcars
Code
# Pull the R `mtcars` dataset into Python as a pandas DataFrame
# (the `r` object is provided by reticulate).
mtcars = r.mtcars
Code
# Preview the first five rows of mtcars.
mtcars.head(5)
# Promote the row names to a regular column: name the index 'newhead',
# then reset it so the old index becomes the first column of the DataFrame.
mtcars.index.name = 'newhead'
mtcars.reset_index(inplace=True)
Code
# Preview the first five rows again, now with the former row names as a column.
mtcars.head(5)
# Build a small working copy: keep only the 'cyl', 'mpg' and 'hp' columns
# and the first five rows.
small_mtcars = mtcars[["cyl", "mpg", 'hp']].head(5)
small_mtcars
# Derive new columns from 'mpg'.

# mpg2: mpg shifted up by one.
small_mtcars['mpg2'] = small_mtcars['mpg'] + 1

# mpg3: 'long' when mpg is greater than 20, otherwise 'short'.
small_mtcars['mpg3'] = np.where(small_mtcars['mpg'] > 20, "long", "short")

# mpg4: three-way bucket of mpg (< 19 / 19..22 / > 22).
# NOTE(review): the original nested np.where had a trailing "else" branch
# that was unreachable for any non-NaN mpg (the three conditions cover every
# number); np.select expresses the same mapping without the dead branch.
small_mtcars['mpg4'] = np.select(
    [small_mtcars["mpg"] < 19, small_mtcars["mpg"] <= 22],
    ["short", "Medium"],
    default="long",
)

# Display the DataFrame with the new columns.
small_mtcars
cyl mpg hp mpg2 mpg3 mpg4
0 6.0 21.0 110.0 22.0 long Medium
1 6.0 21.0 110.0 22.0 long Medium
2 4.0 22.8 93.0 23.8 long long
3 6.0 21.4 110.0 22.4 long Medium
4 8.0 18.7 175.0 19.7 short short
8 Transmute: Create Column and Only Keep This Column
Code
# Derive mpg2 from mpg, then keep only that column in a new DataFrame
# (the pandas equivalent of dplyr's transmute).
small_mtcars['mpg2'] = small_mtcars['mpg'] + 1
new_data = small_mtcars[['mpg2']]
new_data
mpg2
0 22.0
1 22.0
2 23.8
3 22.4
4 19.7
8.1 Filter Rows
Code
# Keep only the rows whose 'gear' column equals 4.
mtcars[mtcars['gear'] == 4]
# Append by row using pandas.
data1 = mtcars.iloc[0:4]    # first four rows
data2 = mtcars.iloc[10:11]  # the eleventh row
# Stack the two pieces vertically, discarding the original index labels.
data3 = pd.concat([data1, data2], ignore_index=True, axis=0)
data3
# Append by column using pandas.
data1 = small_mtcars[["cyl", "mpg"]]  # left-hand columns
data2 = small_mtcars[['hp']]          # right-hand column
# Concatenate side by side, aligning rows on data2's index.
data3 = pd.concat([data1, data2], axis=1).reindex(data2.index)
data3
Missing values (NaN) can be handled by either removing rows/columns with missing data or by filling them with appropriate values.
8.2.3.1 Drop rows with any NA values
# Build a small DataFrame with scattered NaN values to demonstrate dropna.
df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12],
})
print("Original DataFrame:")
print(df_missing)

# Remove every row that contains at least one NaN.
df_dropped_all = df_missing.dropna()
print("\nDataFrame after dropping rows with any NA:")
print(df_dropped_all)
8.2.3.2 Drop rows with NAs in specific columns
# Drop only the rows whose column 'A' is NaN; NaNs in other columns survive.
df_dropped_col_a = df_missing.dropna(subset=['A'])
print("\nDataFrame after dropping rows with NA in column 'A':")
print(df_dropped_col_a)
8.2.4 Filling NA values
Missing values can be filled with a specific value, the mean, median, or previous/next valid observation.
8.2.4.1 Fill with a specific value
# Replace every NaN in the frame with the constant 0.
df_filled_zero = df_missing.fillna(0)
print("\nDataFrame after filling NA with 0:")
print(df_filled_zero)
8.2.4.2 Fill with mean of the column
# Fill the NaNs in column 'B' with that column's mean, working on a copy
# so the original df_missing is left untouched.
df_filled_mean = df_missing.copy()
df_filled_mean['B'] = df_filled_mean['B'].fillna(df_filled_mean['B'].mean())
print("\nDataFrame after filling NA in column 'B' with its mean:")
print(df_filled_mean)
8.2.4.3 Forward fill (ffill)
# Forward-fill: propagate the last valid observation forward into the NaNs
# that follow it.
# NOTE(review): fillna(method='ffill') is deprecated since pandas 2.1 and
# removed in pandas 3.0 — DataFrame.ffill() is the supported spelling.
df_ffill = df_missing.ffill()
print("\nDataFrame after forward fill:")
print(df_ffill)
8.2.4.4 Backward fill (bfill)
# Backward-fill: propagate the next valid observation backward into the NaNs
# that precede it (the original comment mis-stated this).
# NOTE(review): fillna(method='bfill') is deprecated since pandas 2.1 and
# removed in pandas 3.0 — DataFrame.bfill() is the supported spelling.
df_bfill = df_missing.bfill()
print("\nDataFrame after backward fill:")
print(df_bfill)
8.3 Group By
8.3.1 Average, Min, Max, Sum
Code
# Mean horsepower within each cylinder-count group.
mtcars.groupby("cyl")["hp"].mean()
# Number of non-null 'hp' values within each cylinder-count group.
mtcars.groupby("cyl")["hp"].count()
cyl
4.0 11
6.0 7
8.0 14
Name: hp, dtype: int64
Code
# Number of distinct 'hp' values within each cylinder-count group.
mtcars.groupby("cyl")["hp"].nunique()
cyl
4.0 10
6.0 4
8.0 9
Name: hp, dtype: int64
8.4 Order Rows
Code
# Order rows by horsepower, smallest first (ascending is the default).
small_mtcars.sort_values(by='hp')
cyl mpg hp mpg2 mpg3 mpg4
2 4.0 22.8 93.0 23.8 long long
0 6.0 21.0 110.0 22.0 long Medium
1 6.0 21.0 110.0 22.0 long Medium
3 6.0 21.4 110.0 22.4 long Medium
4 8.0 18.7 175.0 19.7 short short
8.4.1 Sort in descending order
Code
# Order rows by horsepower, largest first.
small_mtcars.sort_values(by='hp', ascending=False)
cyl mpg hp mpg2 mpg3 mpg4
4 8.0 18.7 175.0 19.7 short short
0 6.0 21.0 110.0 22.0 long Medium
1 6.0 21.0 110.0 22.0 long Medium
3 6.0 21.4 110.0 22.4 long Medium
2 4.0 22.8 93.0 23.8 long long
8.4.2 Arrange by multiple variables
Code
# Order rows by cylinder count first, then by mpg within each cylinder group.
small_mtcars.sort_values(by=['cyl', 'mpg'])
cyl mpg hp mpg2 mpg3 mpg4
2 4.0 22.8 93.0 23.8 long long
0 6.0 21.0 110.0 22.0 long Medium
1 6.0 21.0 110.0 22.0 long Medium
3 6.0 21.4 110.0 22.4 long Medium
4 8.0 18.7 175.0 19.7 short short
8.5 Join
Code
# Two small tables sharing an 'id' key, used to demonstrate the join types:
# ids 1 and 2 appear in both, 3 only on the left, 4 only on the right.
lhs = pd.DataFrame({'id': [1, 2, 3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})
rhs = pd.DataFrame({'id': [1, 2, 4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})
Code
# Show the left-hand join table.
lhs
id val
0 1 lhs.1
1 2 lhs.2
2 3 lhs.3
Code
# Show the right-hand join table.
rhs
id val
0 1 rhs.1
1 2 rhs.2
2 4 rhs.3
8.5.1 Inner Join
Code
# Inner join: keep only the ids present in both tables.
result = pd.merge(lhs, rhs, on='id', how='inner')
result
id val_x val_y
0 1 lhs.1 rhs.1
1 2 lhs.2 rhs.2
8.5.2 Full Join
Code
# Full outer join: keep every id from either table, filling gaps with NaN.
result = pd.merge(lhs, rhs, on='id', how='outer')
result
id val_x val_y
0 1 lhs.1 rhs.1
1 2 lhs.2 rhs.2
2 3 lhs.3 NaN
3 4 NaN rhs.3
8.5.3 Left Join
Code
# Left join: keep every id from lhs, filling unmatched rhs values with NaN.
result = pd.merge(lhs, rhs, on='id', how='left')
result
id val_x val_y
0 1 lhs.1 rhs.1
1 2 lhs.2 rhs.2
2 3 lhs.3 NaN
8.6 Reshape Tables
Code
# A wide table: one row per id, one price column per product.
costs = pd.DataFrame({
    'id': [1, 2],
    'price_x': [.1, .2],
    'price_y': [.4, .5],
    'price_z': [.7, .8],
})
costs
# Wide-to-long reshape: melt the three price columns (listed manually)
# into variable/value pairs keyed by 'id'.
long_date = pd.melt(
    costs,
    id_vars=['id'],
    value_vars=['price_x', 'price_y', 'price_z'],
)
long_date
# Sample DataFrame for the string-method examples: mixed-case text plus ints.
df = pd.DataFrame({
    'text': ['abc', 'DDD', '1243c', 'aeEe'],
    'num': [3, 4, 7, 8],
})
df
text num
0 abc 3
1 DDD 4
2 1243c 7
3 aeEe 8
8.7.1 upper case
# Upper-case every entry of the 'text' column.
df['text'].str.upper()
8.7.2 lower case
# Lower-case every entry of the 'text' column.
df['text'].str.lower()
8.7.3 match
# Boolean mask: which 'text' entries contain the letter 'a' (case-sensitive).
df['text'].str.contains('a')
8.7.4 concatenation
# Element-wise concatenation: each 'text' value joined with itself by a space.
df['text'] + " " + df['text']
8.7.5 replace
# Substitute every 'a' with 'X' in each 'text' entry.
df['text'].str.replace('a', 'X')
8.7.6 regular expression
https://regex101.com/
Code
# Regular-expression demos use the stdlib re module.
import re

# Example string shared by the regex examples below.
text = "The rain in Spain falls mainly in the plain. My email is example@email.com"
print(text)
The rain in Spain falls mainly in the plain. My email is example@email.com
Code
# Print a blank line as a visual separator between outputs.
print("")
Code
# Announce the first regex demo (word-ending match).
print("Regular expression to find all words ending with 'ain'")
Regular expression to find all words ending with 'ain'
Code
# \b\w*ain\b — whole words whose final three letters are 'ain'.
pattern1 = r'\b\w*ain\b'
matches1 = re.findall(pattern1, text)
print("Words ending with 'ain':", matches1)
Words ending with 'ain': ['rain', 'Spain', 'plain']
Code
# Print a blank line as a visual separator between outputs.
print("")
Code
# Announce the second regex demo (email extraction).
print("Regular expression to extract an email address")
Regular expression to extract an email address
Code
# [\w\.-]+@[\w\.-]+ — a loose email matcher: word characters, dots or
# hyphens on both sides of a single '@'.
pattern2 = r"[\w\.-]+@[\w\.-]+"
matches2 = re.findall(pattern2, text)
print("Email addresses found:", matches2)
Email addresses found: ['example@email.com']
Code
# Print a blank line as a visual separator between outputs.
print("")
Code
# Announce the third regex demo (substitution).
print("Replace all 'ain' endings with 'ANE'")
Replace all 'ain' endings with 'ANE'
Code
# Rewrite every word-final 'ain' as 'ANE' (e.g. 'rain' -> 'rANE');
# 'mainly' is untouched because its 'ain' is not at a word boundary.
replaced_text = re.sub(r'ain\b', 'ANE', text)
print("Text after replacement:", replaced_text)
Text after replacement: The rANE in SpANE falls mainly in the plANE. My email is example@email.com
Code
# Print a blank line as a visual separator between outputs.
print("")
8.7.7 extract
Use str.extract() with a regular expression to pull out a matching piece of text.
For example the regular expression “^(.*) ” contains the following pieces:
^ anchors the match to the start of the string
.* has a . which matches anything, and * which modifies it to apply 0 or more times; because * is greedy, the trailing space in the pattern makes the capture group stop at the last space in the string.
8.8 date
Pandas provides powerful tools for working with dates and times, including converting to datetime objects, extracting components, and formatting.
Code
# Demo DataFrame: a proper datetime column plus raw date strings.
df_dates = pd.DataFrame({
    "dates": pd.to_datetime(["2021-01-02", "2021-02-03"]),
    "raw": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"],
})
df_dates
dates raw
0 2021-01-02 2023-04-05 06:07:08
1 2021-02-03 2024-05-06 07:08:09
8.8.1 Extracting Date Components
You can extract various components like year, month, day, hour, minute, second from datetime objects.
# Pull individual calendar components out of the datetime column via the
# .dt accessor (in a notebook only the last expression is displayed).
df_dates['dates'].dt.year
df_dates['dates'].dt.month
df_dates['dates'].dt.day
8.8.2 Formatting Dates
Dates can be formatted into different string representations using strftime().
# Render the datetimes as 'YYYY-MM-DD' strings.
df_dates['dates'].dt.strftime('%Y-%m-%d')
Code
# Summarize column dtypes, non-null counts and memory usage for df_dates.
df_dates.info()
---title: "Data manipulation with Pandas"author: "Tony Duan"execute: warning: false error: falseformat: html: toc: true toc-location: right code-fold: show code-tools: true number-sections: true code-block-bg: true code-block-border-left: "#31BAE9"---{width="600"}# Set Up Python:```{r}#Sys.setenv(RETICULATE_PYTHON = "/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")library(reticulate)py_require(c('pandas','pandasql','matplotlib','seaborn'))#use_python("/Library/Frameworks/Python.framework/Versions/3.11/bin/python3.11")``````{r}repl_python()``````{python}from platform import python_versionprint(python_version())```pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,built on top of the Python programming language.Show package's dependencies```{python}# Import the os moduleimport os# Import pkg_resources from pip._vendor to get package informationfrom pip._vendor import pkg_resources# Define the package name_package_name ='pandas'# Get the package object_package = pkg_resources.working_set.by_key[_package_name]# Print the dependencies of the packageprint([str(r) for r in _package.requires()]) # retrieve deps from setup.py```# Install Package```{python}#| eval: false!pip3.11 install seaborn pandasql nbformat nbclient```# Load Package```{python}# Import the pandas library, commonly aliased as pdimport pandas as pd# Import sqldf from pandasql for SQL-like queries on pandas DataFramesfrom pandasql import sqldf# Import numpy for numerical operations, commonly aliased as npimport numpy as np# Import matplotlib.pylab for plotting, commonly aliased as pltimport matplotlib.pylab as plt# Import seaborn for statistical data visualizationimport seaborn as sns``````{r}mtcars=mtcars``````{python}# Assign the R dataset mtcars to a pandas DataFrame named mtcarsmtcars=r.mtcars``````{python}# Display the first 5 rows of the mtcars DataFramemtcars.head(5)```row name to column```{python}# Set the name of the index to 
'newhead'mtcars.index.name ='newhead'# Reset the index of the DataFrame, making the old index a new columnmtcars.reset_index(inplace=True)``````{python}# Display the first 5 rows of the mtcars DataFrame after index manipulationmtcars.head(5)``````{python}# Create a new DataFrame small_mtcars with selected columns and the first 5 rowssmall_mtcars = mtcars[["cyl", "mpg",'hp']]small_mtcars=small_mtcars.head(5)small_mtcars```## Get Info on the Data```{python}# Print a concise summary of the small_mtcars DataFrame, including data types and non-null valuessmall_mtcars.info()```# Select Columns## Get Column Names```{python}# Get a list of column names from the small_mtcars DataFramelist(small_mtcars)```## Select by Name::: panel-tabset### pandas```{python}# Select specific columns ('cyl', 'mpg', 'hp') from the small_mtcars DataFramesmall_mtcars [["cyl", "mpg",'hp']]```other way:```{python}# Select columns by providing a list of items to keepsmall_mtcars.filter(items=['cyl', 'mpg','hp'])```:::### Select columns by name matching with 'p'```{python}# Select columns where the column name contains the letter 'p'small_mtcars.loc[:,small_mtcars.columns.str.contains("p")]```other way:```{python}# Select columns whose names match the regular expression 'p.*' (starts with 'p')small_mtcars.filter(regex='p.*', axis=1)```### Select columns by index#### Select first and 3rd columns```{python}# Select rows at index 0 and 2 (first and third rows)small_mtcars.iloc[[0,2]]```#### Select first to 3rd columns```{python}# Select rows from index 0 up to (but not including) index 3small_mtcars[0:3]```# Drop Column::: panel-tabset## pandas```{python}# Drop the column named 'cyl' from the DataFrame (axis=1 indicates column)small_mtcars.drop('cyl', axis=1)```::: # Renaming Columns::: panel-tabset## pandas```{python}# Rename columns 'mpg' to 'new_name_mpg' and 'cyl' to 'new_name_cyl'small_mtcars.rename(columns={'mpg':"new_name_mpg", 'cyl':'new_name_cyl'})```::: # Create Column::: panel-tabset## 
pandas```{python}# Create a new column 'mpg2' by adding 1 to the 'mpg' columnsmall_mtcars['mpg2'] = small_mtcars['mpg']+1# Create a new column 'mpg3'. If 'mpg' is greater than 20, assign 'long', otherwise 'short'small_mtcars['mpg3'] = np.where(small_mtcars['mpg']>20, "long", "short")# Create a new column 'mpg4' with nested np.where conditions for categorical assignmentsmall_mtcars['mpg4'] =np.where(small_mtcars["mpg"]<19, "short", np.where(small_mtcars["mpg"]<=22, "Medium", np.where(small_mtcars["mpg"]>22, "long","else")))# Display the DataFrame with the new columnssmall_mtcars```:::# Transmute: Create Column and Only Keep This Column```{python}# Create a new column 'mpg2' by adding 1 to the 'mpg' columnsmall_mtcars['mpg2'] = small_mtcars['mpg']+1# Create a new DataFrame containing only the 'mpg2' columnnew_data=small_mtcars[['mpg2']]# Display the new DataFramenew_data```## Filter Rows```{python}# Filter rows where the 'gear' column is equal to 4mtcars[(mtcars['gear'] ==4)]```other way:```{python}# Filter rows where the 'gear' column is equal to 4 using the query methodmtcars.query('gear==4')```### Filters with AND conditions```{python}# Filter rows where 'cyl' is greater than 4 AND 'gear' is equal to 5mtcars[(mtcars['cyl'] >4)&(mtcars['gear'] ==5) ]```other way:```{python}# Filter rows where 'cyl' is greater than 4 AND 'gear' is equal to 5 using the query methodmtcars.query('cyl>4 and gear==5')```### Filters with OR conditions```{python}# Filter rows where 'cyl' is equal to 6 OR 'gear' is equal to 5mtcars[(mtcars['cyl'] ==6) |(mtcars['gear'] ==5) ]```other way:```{python}# Filter rows where 'cyl' is equal to 6 OR 'gear' is equal to 5 using the query methodmtcars.query('cyl==6 or gear==5')```### Filter rows with index#### 5th rows```{python}# Select the row at index 4 (which is the 5th row)mtcars.iloc[[4]]```#### 1st and 5th rows```{python}# Select rows at index 0 (1st row) and 4 (5th row)mtcars.iloc[[0,4]]```#### 1st to 5th rows```{python}# Select rows from index 
0 up to (but not including) index 4mtcars.iloc[0:4]```#### Get random 5 rows```{python}# Select 5 random rows from the mtcars DataFrame, with a fixed random_state for reproducibilitymtcars.sample(5, random_state=42)```## Append### Append by row```{python}# using pandas# get 1 to 4 rowsdata1=mtcars.iloc[0:4]# get 9 rowsdata2=mtcars.iloc[10:11]# Concatenate data1 and data2 DataFrames by row, ignoring the original indexdata3=pd.concat([data1, data2], ignore_index =True,axis=0)# Display the concatenated DataFramedata3```### Append by column```{python}# using pandas# Select 'cyl' and 'mpg' columns from small_mtcarsdata1=small_mtcars[["cyl", "mpg"]]# Select 'hp' column from small_mtcarsdata2=small_mtcars[['hp']]# Concatenate data1 and data2 DataFrames by column, and reindex to match data2's indexdata3=pd.concat([data1, data2], axis=1).reindex(data2.index)# Display the concatenated DataFramedata3```### Dropping NA valuesMissing values (NaN) can be handled by either removing rows/columns with missing data or by filling them with appropriate values.#### Drop rows with any NA values```python# Create a sample DataFrame with missing valuesdf_missing = pd.DataFrame({'A': [1, 2, np.nan, 4],'B': [5, np.nan, np.nan, 8],'C': [9, 10, 11, 12]})print("Original DataFrame:")print(df_missing)# Drop rows that contain any NaN valuesdf_dropped_all = df_missing.dropna()print("\nDataFrame after dropping rows with any NA:")print(df_dropped_all)```#### Drop rows with NAs in specific columns```python# Drop rows that have NaN values in column 'A'df_dropped_col_a = df_missing.dropna(subset=['A'])print("\nDataFrame after dropping rows with NA in column 'A':")print(df_dropped_col_a)```### Filling NA valuesMissing values can be filled with a specific value, the mean, median, or previous/next valid observation.#### Fill with a specific value```python# Fill all NaN values with 0df_filled_zero = df_missing.fillna(0)print("\nDataFrame after filling NA with 0:")print(df_filled_zero)```#### Fill with mean 
of the column```python# Fill NaN values in column 'B' with the mean of column 'B'df_filled_mean = df_missing.copy()df_filled_mean['B'] = df_filled_mean['B'].fillna(df_filled_mean['B'].mean())print("\nDataFrame after filling NA in column 'B' with its mean:")print(df_filled_mean)```#### Forward fill (ffill)```python# Forward fill NaN values (propagate last valid observation forward to next valid observation)df_ffill = df_missing.fillna(method='ffill')print("\nDataFrame after forward fill:")print(df_ffill)```#### Backward fill (bfill)```python# Backward fill NaN values (propagate next valid observation backward to next valid observation)df_bfill = df_missing.fillna(method='bfill')print("\nDataFrame after backward fill:")print(df_bfill)```## Group By### Average, Min, Max, Sum```{python}# Group the mtcars DataFrame by the 'cyl' column and calculate the mean of the 'hp' column for each groupmtcars.groupby("cyl")["hp"].mean()``````{python}# Group the mtcars DataFrame by the 'cyl' column and find the minimum value of the 'hp' column for each groupmtcars.groupby("cyl")["hp"].min()``````{python}# Group the mtcars DataFrame by the 'cyl' column and find the maximum value of the 'hp' column for each groupmtcars.groupby("cyl")["hp"].max()``````{python}# Group the mtcars DataFrame by the 'cyl' column and calculate the sum of the 'hp' column for each groupmtcars.groupby("cyl")["hp"].sum()```### Count Record and Count Distinct Record```{python}# Group the mtcars DataFrame by the 'cyl' column and count the number of non-null 'hp' values for each groupmtcars.groupby("cyl")["hp"].count()``````{python}# Group the mtcars DataFrame by the 'cyl' column and count the number of unique 'hp' values for each groupmtcars.groupby("cyl")["hp"].nunique()```## Order Rows```{python}# Sort the small_mtcars DataFrame by the 'hp' column in ascending ordersmall_mtcars.sort_values('hp')```### Sort in descending order```{python}# Sort the small_mtcars DataFrame by the 'hp' column in descending 
ordersmall_mtcars.sort_values('hp',ascending=False)```### Arrange by multiple variables```{python}# Sort the small_mtcars DataFrame by 'cyl' then by 'mpg' in ascending ordersmall_mtcars.sort_values(by=['cyl','mpg'])```## Join```{python}# Create a DataFrame named lhslhs = pd.DataFrame({'id': [1,2,3], 'val': ['lhs.1', 'lhs.2', 'lhs.3']})# Create a DataFrame named rhsrhs = pd.DataFrame({'id': [1,2,4], 'val': ['rhs.1', 'rhs.2', 'rhs.3']})``````{python}# Display the lhs DataFramelhs``````{python}# Display the rhs DataFramerhs```### Inner Join```{python}# Perform an inner merge of lhs and rhs DataFrames on the 'id' columnresult=pd.merge(lhs, rhs, on='id', how='inner')# Display the resultresult```### Full Join```{python}# Perform a full outer merge of lhs and rhs DataFrames on the 'id' columnresult=pd.merge(lhs, rhs, on='id', how='outer')# Display the resultresult```### Left Join ```{python}# Perform a left merge of lhs and rhs DataFrames on the 'id' columnresult=pd.merge(lhs, rhs, on='id', how='left')# Display the resultresult```## Reshape Tables```{python}# Create a DataFrame named costscosts = pd.DataFrame({'id': [1,2],'price_x': [.1, .2],'price_y': [.4, .5],'price_z': [.7, .8]})# Display the DataFramecosts```### Gather data long(wide to long)```{python}# selecting each variable manually# Melt the costs DataFrame from wide to long formatlong_date=pd.melt(costs,id_vars=['id'], value_vars=['price_x', 'price_y','price_z'])# Display the long format DataFramelong_date#costs >> gather('measure', 'value', _.price_x, _.price_y, _.price_z)```### Spread data wide (long to wide)```{python}# Pivot the long_date DataFrame from long to wide formatlong_date.pivot(index="id", columns="variable", values="value")```## string```{python}# Create a DataFrame named df with 'text' and 'num' columnsdf = pd.DataFrame({'text': ['abc', 'DDD','1243c','aeEe'], 'num': [3, 4,7,8]})# Display the DataFramedf```### upper case```python# Convert the 'text' column to uppercasedf['text'].str.upper()```### 
lower case```python# Convert the 'text' column to lowercasedf['text'].str.lower()```### match```python# Check if the 'text' column contains 'a'df['text'].str.contains('a')```### concatenation```python# Concatenate the 'text' column with itself, separated by a spacedf['text'] +" "+ df['text']```### replace```python# Replace 'a' with 'X' in the 'text' columndf['text'].str.replace('a', 'X')```### regular expressionhttps://regex101.com/```{python}# Import the re module for regular expressionsimport re# Example stringtext ="The rain in Spain falls mainly in the plain. My email is example@email.com"print(text)print("")print("Regular expression to find all words ending with 'ain'")# Define a regex pattern to find words ending with 'ain'pattern1 =r'\b\w*ain\b'# Find all matches of pattern1 in the textmatches1 = re.findall(pattern1, text)print("Words ending with 'ain':", matches1)print("")print("Regular expression to extract an email address")# Define a regex pattern to extract email addressespattern2 =r"[\w\.-]+@[\w\.-]+"# Find all matches of pattern2 in the textmatches2 = re.findall(pattern2, text)print("Email addresses found:", matches2)print("")print("Replace all 'ain' endings with 'ANE'")# Replace all occurrences of 'ain' at the end of a word with 'ANE'replaced_text = re.sub(r'ain\b', 'ANE', text)print("Text after replacement:", replaced_text)print("")```### extractUse str.extract() with a regular expression to pull out a matching piece of text.For example the regular expression “^(.*) ” contains the following pieces:- a matches the literal letter “a”- .* has a . 
which matches anything, and * which modifies it to apply 0 or more times.## datePandas provides powerful tools for working with dates and times, including converting to datetime objects, extracting components, and formatting.```{python}# Create a DataFrame with 'dates' and 'raw' columnsdf_dates = pd.DataFrame({"dates": pd.to_datetime(["2021-01-02", "2021-02-03"]),"raw": ["2023-04-05 06:07:08", "2024-05-06 07:08:09"],})# Display the DataFramedf_dates```### Extracting Date ComponentsYou can extract various components like year, month, day, hour, minute, second from datetime objects.```python# Extract year, month, and day from the 'dates' columndf_dates['dates'].dt.yeardf_dates['dates'].dt.monthdf_dates['dates'].dt.day```### Formatting DatesDates can be formatted into different string representations using `strftime()`.```python# Format the 'dates' column as YYYY-MM-DDdf_dates['dates'].dt.strftime('%Y-%m-%d')``````{python}# Print a concise summary of the df_dates DataFramedf_dates.info()```## dataframe to numpy array```{python}# Convert the df_dates DataFrame to a NumPy arraydf_dates.to_numpy()```# Reference:https://pandas.pydata.org/docs/user_guide```{python}#| eval: false#| include: falsethis is hidding code chunk```