Pandas
This page provides an introduction to pandas. These are my notes from YouTube Video.
Overview
Pandas is a powerful Python library for data manipulation and analysis, offering data structures like DataFrames and Series to handle and analyze structured data easily. It provides tools for data cleaning, transformation, aggregation, and visualization, making it essential for data science and analytics.
# install pandas
pip3 install pandas
python3
# importing pandas
import pandas as pd
pd.set_option('display.max_columns', None)
DataFrame
Think of it as table with extra functionality.
# create a dataframe
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df.head()
# o/p:
# 0 1 2
# 0 1 2 3
# 1 4 5 6
# 2 7 8 9
df_with_columns = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
df_with_columns.head()
# o/p:
# A B C
# 0 1 2 3
# 1 4 5 6
# look at the bottom 1 row
df_with_columns.tail(1)
# o/p:
# A B C
# 1 4 5 6
# print headers
df_with_columns.columns
# o/p: Index(['A', 'B', 'C'], dtype='object')
# print indexes
df_with_columns.index
# o/p: RangeIndex(start=0, stop=2, step=1)
df_with_columns.index.tolist()
# o/p: [0, 1]
# specify indexes
df_with_indexes = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index = ["x", "y"])
# o/p:
# 0 1 2
# x 1 2 3
# y 4 5 6
df.info()
# o/p:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 3 entries, 0 to 2
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 0 3 non-null int64
# 1 1 3 non-null int64
# 2 2 3 non-null int64
# dtypes: int64(3)
df.describe()
# o/p:
# 0 1 2
# count 3.0 3.0 3.0
# mean 4.0 5.0 6.0
# std 3.0 3.0 3.0
# min 1.0 2.0 3.0
# 25% 2.5 3.5 4.5
# 50% 4.0 5.0 6.0
# 75% 5.5 6.5 7.5
# max 7.0 8.0 9.0
# count number of unique values
df.nunique()
# o/p:
# 0 3
# 1 3
# 2 3
# dtype: int64
# count number of unique values in specific columns
df_with_columns['A'].unique()
# o/p: array([1, 4])
# shape of df
df.shape
# o/p: (3, 3)
# total number of bytes
df.size
# o/p: 9
Loading DataFrame from File
coffee = pd.read_csv("static/data/coffee.csv")
coffee.columns
# o/p: Index(['Day', 'Coffee Type', 'Units Sold'], dtype='object')
bios = pd.read_csv("static/data/bios.csv")
bios.columns
# o/p:
# Index(['athlete_id', 'name', 'born_date', 'born_city', 'born_region',
# 'born_country', 'NOC', 'height_cm', 'weight_kg', 'died_date'],
# dtype='object')
nocs = pd.read_csv("static/data/noc_regions.csv")
# o/p:
# Index(['NOC', 'region', 'notes'], dtype='object')
# reading parquet file
results = pd.read_parquet("static/data/results.parquet")
results.columns
# o/p:
# Index(['year', 'type', 'discipline', 'event', 'as', 'athlete_id', 'noc',
# 'team', 'place', 'tied', 'medal'],
# dtype='object')
Convert DataFrame Format
# convert from one format to another
bios.to_csv
bios.to_parquet
bios.to_excel
Accessing DataFrame
# get sample data
coffee.sample()
# o/p:
# Day Coffee Type Units Sold
# 11 Saturday Latte 35
# loc
coffee.loc[[0, 1, 2]]
coffee.loc[0:3]
# o/p:
# Day Coffee Type Units Sold
# 0 Monday Espresso 25
# 1 Monday Latte 15
# 2 Tuesday Espresso 30
# only select specific columns
coffee.loc[1:2, ["Day", "Units Sold"]]
# o/p:
# Day Units Sold
# 1 Monday 15
# 2 Tuesday 30
# assign an column values to index
coffee.index = coffee["Day"]
# reset
coffee = pd.read_csv("static/data/coffee.csv")
# access specific value
coffee.at[0, "Units Sold"]
# o/p: np.int64(25)
Sorting DataFrame
# sorting
coffee.sort_values("Units Sold")
# sorting by Units Sold in ascending order and by Coffee Type in descending order
coffee.sort_values(["Units Sold", "Coffee Type"], ascending=[True, False])
Filtering DataFrame
# filtering
bios.info()
# RangeIndex: 145500 entries, 0 to 145499
# Data columns (total 10 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 athlete_id 145500 non-null int64
# 1 name 145500 non-null object
# 2 born_date 143693 non-null object
# 3 born_city 110908 non-null object
# 4 born_region 110908 non-null object
# 5 born_country 110908 non-null object
# 6 NOC 145499 non-null object
# 7 height_cm 106651 non-null float64
# 8 weight_kg 102070 non-null float64
# 9 died_date 33940 non-null object
# dtypes: float64(2), int64(1), object(7)
# filter all the atheletes who are taller than 225
bios.loc[bios["height_cm"] > 225, ["athlete_id", "name", "height_cm", "weight_kg"]]
# o/p:
# athlete_id name height_cm weight_kg
# 89070 89782 Yao Ming 226.0 141.0
bios[bios["height_cm"] > 225][["athlete_id", "name", "height_cm", "weight_kg"]]
# o/p:
# athlete_id name height_cm weight_kg
# 89070 89782 Yao Ming 226.0 141.0
# filter all the atheletes who are taller than 215 and born in USA
bios[
(bios["height_cm"] > 215) & (bios["born_country"]=="USA")
][["name", "height_cm", "born_country"]]
# o/p:
# name height_cm born_country
# 5781 Tommy Burleson 223.0 USA
# 6722 Shaquille O'Neal 216.0 USA
# 6937 David Robinson 216.0 USA
# 123850 Tyson Chandler 216.0 USA
# filter using contains
bios[bios["name"].str.contains("Keith")][["name", "born_country"]].head(2)
# o/p:
# name born_country
# 1897 Keith Hanlon NaN
# 3505 Keith Wallace GBR
# filter using contains without case
bios[
bios["name"].str.contains("keith", case=False)
][["name", "born_country"]].head(2)
# o/p:
# name born_country
# 1897 Keith Hanlon NaN
# 3505 Keith Wallace GBR
# filter using regex
bios[
bios["name"].str.contains("keith|patrick", case=False)
][["name", "born_country"]].head(2)
# o/p:
# name born_country
# 6 Patrick Chila FRA
# 119 Patrick Wheatley RSA
# turn off the regex
bios[
bios["name"].str.contains("keith|patrick", case=False, regex=False)
][["name", "born_country"]].head(2)
# o/p:
# Empty DataFrame
# Columns: [name, born_country]
# Index: []
# query
bios.query("born_country == 'USA' and born_city == 'Seattle'")[["name"]].head(5)
# o/p:
# name
# 11030 David Halpern
# 12800 Todd Trewin
# 15476 Scott McKinley
# 29079 Joyce Tanac
# 31135 Bill Kuhlemeier
Modifying Columns in DataFrame
# adding column
coffee["price"] = 4.99
coffee.head(2)
# o/p:
# Day Coffee Type Units Sold price
# 0 Monday Espresso 25 4.99
# 1 Monday Latte 15 4.99
# adding columns with condition
import numpy as np
coffee["New Price"] = np.where(coffee['Coffee Type'] == 'Espresso', 3.99, 5.99)
# o/p:
# Day Coffee Type Units Sold price New Price
# 0 Monday Espresso 25 4.99 3.99
# 1 Monday Latte 15 4.99 5.99
# adding column with mathematical operation
coffee["Revenue"] = coffee["New Price"] * coffee["Units Sold"]
# o/p:
# Day Coffee Type Units Sold New Price Revenue
# 0 Monday Espresso 25 3.99 99.75
# 1 Monday Latte 15 5.99 89.85
# renaming column
coffee.rename(columns={"price": "Price"}, inplace=True)
coffee.head(2)
# o/p:
# Day Coffee Type Units Sold New Price Revenue
# 0 Monday Espresso 25 3.99 99.75
# 1 Monday Latte 15 5.99 89.85
# ...
# remove price column
coffee.drop(columns=['Price'])
# o/p:
# Day Coffee Type Units Sold New Price
# 0 Monday Espresso 25 3.99
# 1 Monday Latte 15 5.99
# 2 Tuesday Espresso 30 3.99
# 3 Tuesday Latte 20 5.99
# ....
coffee.head(2)
# o/p:
# Day Coffee Type Units Sold price New Price
# 0 Monday Espresso 25 4.99 3.99
# 1 Monday Latte 15 4.99 5.99
# remove column from original dataframe
coffee.drop(columns=["Price"], inplace=True)
coffee.head()
# o/p:
# Day Coffee Type Units Sold New Price
# 0 Monday Espresso 25 3.99
# 1 Monday Latte 15 5.99
# 2 Tuesday Espresso 30 3.99
# 3 Tuesday Latte 20 5.99
# 4 Wednesday Espresso 35 3.99
info
Drop doesn't remove column from the original dataframe if inplace = False. Same is the case for rename function.
Cloning DataFrame
coffee_new = coffee.copy()
coffee_new.head(2)
# o/p:
# Day Coffee Type Units Sold New Price
# 0 Monday Espresso 25 3.99
# 1 Monday Latte 15 5.99
Saving DataFrame
coffee.to_csv('static/data/coffee_new.csv', index=False)
Apply Lambda
bios['height_category'] = bios['height_cm'].apply(
lambda x: 'Short' if x < 165 else ('Average' if x < 185 else 'Tall')
)
bios[['name', 'height_category']]
# o/p:
# name height_category
# 0 Jean-François Blanchy Tall
# 1 Arnaud Boetsch Average
Merging DataFrame
# merging bios with nocs to get region based on born_country
bios.head(2)[["athlete_id", "name", "born_country"]]
# o/p:
# athlete_id name born_country
# 0 1 Jean-François Blanchy FRA
# 1 2 Arnaud Boetsch FRA
nocs.head(2)
# o/p:
# NOC region notes
# 0 AFG Afghanistan NaN
# 1 AHO Curacao Netherlands Antilles
bios_new = pd.merge(bios, nocs, left_on='born_country', right_on='NOC', how='left')
bios_new.rename(columns={'region': 'born_country_full'}, inplace=True)
bios_new[['athlete_id', 'name', 'born_country', 'born_country_full']].head(4)
# o/p:
# athlete_id name born_country born_country_full
# 0 1 Jean-François Blanchy FRA France
# 1 2 Arnaud Boetsch FRA France
# 2 3 Jean Borotra FRA France
# 3 4 Jacques Brugnon FRA France
info
When merging tables, pandas appends suffixes ('_x' and '_y') to columns with identical names in both left and right tables. To override this suffixes you can specifiy suffixes=[] array.
Concatenating DataFrame
usa = bios[bios['born_country'] == 'USA'].copy()
usa.head(2)[["name", "born_country"]]
# o/p:
# name born_country
# 54 Monique Javer USA
# 960 Xóchitl Escobedo USA
gbr = bios[bios['born_country'] == 'GBR'].copy()
gbr.head(2)[["name", "born_country"]]
# o/p:
# name born_country
# 4 Albert Canet GBR
# 37 Helen Aitchison GBR
only_usa_gbr = pd.concat([usa, gbr])
only_usa_gbr[["name", "born_country"]]
# o/p:
# name born_country
# 54 Monique Javer USA
# 960 Xóchitl Escobedo USA
# ...
# 9068 Charles Brahm GBR
# 9097 Clement Spiette GBR
Handling Null Values
coffee.loc[[0, 1], 'Units Sold'] = np.nan
# o/p:
# Day Coffee Type Units Sold New Price Revenue
# 0 Monday Espresso NaN 3.99 99.75
# 1 Monday Latte NaN 5.99 89.85
# fillna
coffee.fillna(coffee['Units Sold'].mean())
# o/p:
# Day Coffee Type Units Sold New Price Revenue
# 0 Monday Espresso 35.0 3.99 99.75
# 1 Monday Latte 35.0 5.99 89.85
# dropna
coffee.loc[[0, 1], 'Units Sold'] = np.nan
coffee.dropna(subset=['Units Sold'], inplace=False)
# isna
coffee[coffee['Units Sold'].isna()]
# o/p:
# Day Coffee Type Units Sold New Price Revenue
# 0 Monday Espresso NaN 3.99 99.75
# 1 Monday Latte NaN 5.99 89.85
# notna
coffee[coffee['Units Sold'].notna()]
Aggregating Data
# count athelets by born_city
bios['born_city'].value_counts()
# o/p:
# born_city
# Budapest 1378
# Moskva (Moscow) 883
# ...
# count athelets by region only in USA
bios[bios['born_country']=='USA']['born_region'].value_counts()
# o/p:
# born_region
# California 1634
# New York 990
# Illinois 585
# Massachusetts 530
# ...
# sum the units sold by coffee type
coffee.groupby(['Coffee Type'])['Units Sold'].sum()
# o/p:
# Coffee Type
# Espresso 240.0
# Latte 180.0
# Name: Units Sold, dtype: float64
# multiple agg
coffee.groupby(['Coffee Type']).agg({'Units Sold': 'sum', 'New Price': 'mean'})
# o/p:
# Units Sold New Price
# Coffee Type
# Espresso 240.0 3.99
# Latte 180.0 5.99
Pivot
coffee.head()
# o/p:
# Day Coffee Type Units Sold New Price Revenue
# 0 Monday Espresso NaN 3.99 99.75
# 1 Monday Latte NaN 5.99 89.85
# 2 Tuesday Espresso 30.0 3.99 119.70
# 3 Tuesday Latte 20.0 5.99 119.80
# 4 Wednesday Espresso 35.0 3.99 139.65
pivot_by_day = coffee.pivot(columns='Coffee Type', index='Day', values='Revenue')
# o/p:
# Coffee Type Espresso Latte
# Day
# Friday 179.55 209.65
# Monday 99.75 89.85
# Saturday 179.55 209.65
# Sunday 179.55 209.65
# Thursday 159.60 179.70
# Tuesday 119.70 119.80
# Wednesday 139.65 149.75
Shift
# calculate yesterday's revenue for each row
coffee['Yesterday Revenue'] = coffee['Revenue'].shift(1)
# o/p:
# Day Coffee Type Units Sold New Price Revenue Yesterday Revenue
# 0 Monday Espresso NaN 3.99 99.75 NaN
# 1 Monday Latte NaN 5.99 89.85 99.75
# 2 Tuesday Espresso 30.0 3.99 119.70 89.85
# 3 Tuesday Latte 20.0 5.99 119.80 119.70
# 4 Wednesday Espresso 35.0 3.99 139.65 119.80
Rank
# calculate rank by height
bios['height_rank'] = bios['height_cm'].rank(ascending=False)
bios[['name', 'height_rank']].sort_values(['height_rank'])
# o/p:
# name height_rank
# 89070 Yao Ming 1.0
# 6978 Arvydas Sabonis 2.5
# 5781 Tommy Burleson 2.5
# 5673 Gunther Behnke 5.0
# 89075 Roberto Dueñas 5.0
# ...
Rolling
# cummulative sum of revenue
coffee['Cummulative Revenue'] = coffee['Revenue'].cumsum()
coffee[['Day', 'Coffee Type', 'Revenue', 'Cummulative Revenue']]
# o/p:
# Day Coffee Type Revenue Cummulative Revenue
# 0 Monday Espresso 99.75 99.75
# 1 Monday Latte 89.85 189.60
# 2 Tuesday Espresso 119.70 309.30
# 3 Tuesday Latte 119.80 429.10
# 4 Wednesday Espresso 139.65 568.75
# 5 Wednesday Latte 149.75 718.50
# 6 Thursday Espresso 159.60 878.10
# rolling window sum of revenue over 3 days window
coffee['3 Days'] = coffee['Revenue'].rolling(3).sum()
coffee[['Day', 'Coffee Type', 'Revenue', '3 Days']]
# o/p:
# Day Coffee Type Revenue 3 Days
# 0 Monday Espresso 99.75 NaN
# 1 Monday Latte 89.85 NaN
# 2 Tuesday Espresso 119.70 309.30
# 3 Tuesday Latte 119.80 329.35
# 4 Wednesday Espresso 139.65 379.15
# 5 Wednesday Latte 149.75 409.20
# 6 Thursday Espresso 159.60 449.00
# 7 Thursday Latte 179.70 489.05
# ...