Pandas: Data Analysis in Python
Pandas is the go-to library for data analysis and manipulation in Python. Built on NumPy, it provides Series and DataFrame structures with labeled indices, native missing-value handling, and an expressive API for transforming data of every shape and size.
Installation
pip install pandas
pip install openpyxl # for reading/writing .xlsx
import pandas as pd
import numpy as np
Series
A Series is a labeled 1D array — like a spreadsheet column.
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s['b']) # 20
print(s[1]) # 20 — positional access also works
s * 2
s[s > 15] # Boolean filter: b=20, c=30, d=40
# From dictionary
countries = pd.Series({'Spain': 47_000_000, 'Mexico': 130_000_000, 'Argentina': 45_000_000})
countries.sort_values(ascending=False)
DataFrame creation
# From a dict of lists
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Carol', 'Dave'],
'age': [28, 35, 22, 41],
'city': ['New York', 'London', 'Paris', 'New York'],
'salary': [75000, 92000, 61000, 110000],
})
# From a list of dicts
records = [
{'id': 1, 'value': 3.14, 'label': 'pi'},
{'id': 2, 'value': 2.71, 'label': 'e'},
]
df2 = pd.DataFrame(records)
# From CSV / Excel
df_csv = pd.read_csv('data.csv', encoding='utf-8', parse_dates=['date'])
df_excel = pd.read_excel('report.xlsx', sheet_name='Sales')
Initial inspection
df.head(3) # first 3 rows (default 5)
df.tail(3) # last 3 rows
df.info() # dtypes, non-null counts, memory usage
df.describe() # numeric stats (count, mean, std, min, quartiles, max)
df.describe(include='all') # includes categoricals too
df.shape # (rows, columns)
df.dtypes
df.nunique() # unique value count per column
df.isnull().sum() # NaN count per column
df['city'].value_counts()
Selection and indexing
# Select columns
df['name'] # Series
df[['name', 'salary']] # DataFrame
# .loc — label-based
df.loc[0] # row with index 0
df.loc[1:3, 'name':'city'] # rows 1-3, cols name→city
df.loc[df['age'] > 30] # Boolean filter
# .iloc — position-based
df.iloc[0] # first row
df.iloc[-1] # last row
df.iloc[1:3, 0:2] # rows 1-2, cols 0-1
# .at / .iat — single cell (fastest)
df.at[0, 'name'] # 'Alice'
df.iat[0, 0] # 'Alice'
# Compound filters
mask = (df['age'] > 25) & (df['city'] == 'New York')
df.loc[mask]
# isin
df[df['city'].isin(['New York', 'London'])]
# query — readable syntax
df.query("age > 25 and city == 'New York'")
Creating and transforming columns
# Calculated column
df['monthly_salary'] = df['salary'] / 12
# Vectorized string operations (fast)
df['name_upper'] = df['name'].str.upper()
df['city_lower'] = df['city'].str.lower()
# pd.cut — bin a continuous variable
df['age_group'] = pd.cut(
df['age'],
bins=[0, 25, 35, 100],
labels=['junior', 'mid', 'senior']
)
# np.select — multiple conditions
conditions = [
df['salary'] < 65000,
df['salary'].between(65000, 95000),
df['salary'] > 95000,
]
choices = ['low', 'medium', 'high']
df['salary_band'] = np.select(conditions, choices, default='unknown')
# assign — chainable (returns a new DataFrame)
df = (df
.assign(bonus=lambda d: d['salary'] * 0.10)
.assign(total=lambda d: d['salary'] + d['bonus']))
Grouping — groupby
# Basic groupby
by_city = df.groupby('city')['salary'].mean()
# Multiple aggregations
summary = df.groupby('city').agg(
mean_salary=('salary', 'mean'),
max_salary=('salary', 'max'),
headcount=('name', 'count'),
)
# Several functions on one column
df.groupby('city')['salary'].agg(['mean', 'std', 'count'])
# transform — keeps the original index (useful for enriching a DataFrame)
df['city_avg_salary'] = df.groupby('city')['salary'].transform('mean')
# filter — keep groups satisfying a condition
large_cities = df.groupby('city').filter(lambda g: len(g) > 1)
Merge and join
employees = pd.DataFrame({
'id': [1, 2, 3, 4],
'name': ['Alice', 'Bob', 'Carol', 'Dave'],
'dept_id': [10, 20, 10, 30],
})
departments = pd.DataFrame({
'dept_id': [10, 20, 30],
'department': ['Engineering', 'Marketing', 'Finance'],
'budget': [500_000, 300_000, 400_000],
})
# Inner join (only matching rows)
inner = pd.merge(employees, departments, on='dept_id', how='inner')
# Left join (all employees, NaN where no dept match)
left = pd.merge(employees, departments, on='dept_id', how='left')
# Different column names
# pd.merge(df1, df2, left_on='emp_dept', right_on='dept_id')
# join — merges on index
df_a = employees.set_index('id')
df_b = pd.DataFrame({'bonus': [1000, 2000, 1500, 3000]}, index=[1, 2, 3, 4])
result = df_a.join(df_b)
Time series
# Date range index
dates = pd.date_range('2024-01-01', periods=365, freq='D')
sales = pd.Series(np.random.randint(100, 500, 365), index=dates)
# Date-based slicing
january = sales['2024-01']
q1 = sales['2024-01':'2024-03']
# Resampling — aggregate by period
weekly = sales.resample('W').sum() # weekly
monthly = sales.resample('ME').mean() # month end
quarterly = sales.resample('QE').sum() # quarter end
# Rolling statistics
ma7 = sales.rolling(window=7).mean() # 7-day moving average
ma30 = sales.rolling(window=30, min_periods=1).mean()
# Shift, diff, pct_change
yesterday = sales.shift(1) # lag by 1 period
daily_chg = sales.diff() # day-over-day change
pct_chg = sales.pct_change() # percentage change
# Parse dates from string column
df['date_dt'] = pd.to_datetime(df['date_str'], format='%d/%m/%Y')
df['year'] = df['date_dt'].dt.year
df['month'] = df['date_dt'].dt.month
df['day_name'] = df['date_dt'].dt.day_name()
Pivot tables and crosstab
# pivot_table — multi-dimensional summary
table = pd.pivot_table(
df,
values='salary',
index='city',
columns='age_group',
aggfunc='mean',
fill_value=0,
margins=True, # adds 'All' row and column
)
# melt — wide → long format
df_wide = pd.DataFrame({
'name': ['Alice', 'Bob'],
'january': [5000, 6000],
'february': [5200, 5800],
'march': [5400, 6200],
})
df_long = df_wide.melt(
id_vars=['name'],
value_vars=['january', 'february', 'march'],
var_name='month',
value_name='sales'
)
# crosstab — contingency table
ct = pd.crosstab(df['city'], df['age_group'], normalize='index') # row %
Missing data
df_na = pd.DataFrame({
'a': [1, np.nan, 3, np.nan],
'b': [np.nan, 2, 3, 4],
'c': ['x', 'y', None, 'z'],
})
# Detect and count
df_na.isnull().sum()
# Drop rows/columns
df_na.dropna() # drop rows with any NaN
df_na.dropna(subset=['a']) # only if 'a' is NaN
df_na.dropna(thresh=2) # keep if at least 2 non-NaN
# Fill
df_na.fillna(0)
df_na.fillna({'a': df_na['a'].mean(), 'c': 'unknown'})
# Interpolation (good for time series)
df_na['a'] = df_na['a'].interpolate(method='linear')
# Forward / backward fill
df_na['b'] = df_na['b'].ffill() # propagate last valid value forward
df_na['b'] = df_na['b'].bfill() # propagate next valid value backward
Exporting results
# CSV
df.to_csv('output.csv', index=False, encoding='utf-8-sig') # BOM for Excel compat
# Excel (requires openpyxl)
with pd.ExcelWriter('report.xlsx', engine='openpyxl') as writer:
df.to_excel(writer, sheet_name='Employees', index=False)
summary.to_excel(writer, sheet_name='Summary')
# JSON
df.to_json('data.json', orient='records', force_ascii=False, indent=2)
# SQL (requires sqlalchemy)
# from sqlalchemy import create_engine
# engine = create_engine('sqlite:///database.db')
# df.to_sql('employees', engine, if_exists='replace', index=False)
# df2 = pd.read_sql('SELECT * FROM employees WHERE city="New York"', engine)
# Parquet — columnar, fast, compressed (requires pyarrow)
# df.to_parquet('data.parquet', compression='snappy')
# df2 = pd.read_parquet('data.parquet')
Best practices
- Prefer vectorized operations over
applyand loops —df['col'].str.upper()is 10–100× faster thandf['col'].apply(str.upper). - Avoid
inplace=True— it doesn't save memory and makes debugging harder; reassign the variable instead. - Use
df.copy()when modifying a subset to avoidSettingWithCopyWarningand unexpected mutations of the original. - Chain
query()andassign()to build readable, functional-style transformation pipelines. - Specify
dtypewhen reading CSVs for large datasets:pd.read_csv(..., dtype={'id': int, 'price': float})can halve memory usage. - Use Parquet instead of CSV for datasets > 100 MB that you read frequently — faster parsing and better compression.
- Set
parse_datesinread_csvto avoid manualpd.to_datetime()calls on date columns.
Related conversions
Frequent conversions across the catalogue: