๐ผ
Pandas
Pandas data manipulation: DataFrames, Series, indexing, cleaning, groupby, merging, reshaping, time series, and I/O
Creating & Inspecting DataFrames
Create DataFrames from dicts, lists, CSV, and inspect shape/dtypes/stats
pythonยทCreate DataFrames
import pandas as pd
import numpy as np
# From dict
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"age": [25, 30, 35],
"score": [88.5, 92.0, 78.3],
})
# From list of dicts
df = pd.DataFrame([
{"name": "Alice", "age": 25},
{"name": "Bob", "age": 30},
])
# From NumPy array
df = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])
# Series
s = pd.Series([10, 20, 30], index=["a", "b", "c"], name="values")
# Range of dates
dates = pd.date_range("2024-01-01", periods=6, freq="D")pythonยทInspect
df.shape # (rows, cols) df.dtypes # column data types df.info() # dtypes + non-null counts + memory df.describe() # count/mean/std/min/quartiles/max df.describe(include="all") # include object columns too df.head(5) # first 5 rows df.tail(5) # last 5 rows df.sample(5) # random 5 rows df.columns.tolist() # column names as list df.index # row index df.values # underlying NumPy array df.memory_usage(deep=True)
Selecting & Indexing
loc, iloc, boolean masks, query, and MultiIndex
pythonยทColumn & row selection
# Single column โ Series df["name"] df.name # attribute access (avoid if col name clashes) # Multiple columns โ DataFrame df[["name", "age"]] # Rows by label โ loc (inclusive on both ends) df.loc[2] # row with label 2 df.loc[0:3] # rows 0,1,2,3 df.loc[0:3, "name":"score"] # rows and cols by label df.loc[:, ["name", "age"]] # all rows, specific cols # Rows by position โ iloc (exclusive end) df.iloc[0] # first row df.iloc[0:3] # rows 0,1,2 df.iloc[0:3, 0:2] # rows 0-2, cols 0-1 df.iloc[-1] # last row
pythonยทBoolean filtering & query
# Boolean mask
df[df["age"] > 28]
df[df["name"] == "Alice"]
df[(df["age"] > 25) & (df["score"] >= 85)] # AND
df[(df["age"] < 25) | (df["score"] > 90)] # OR
df[~(df["age"] > 30)] # NOT
# isin
df[df["name"].isin(["Alice", "Charlie"])]
# query โ readable string syntax
df.query("age > 28")
df.query("age > 28 and score >= 85")
df.query("name in ['Alice', 'Bob']")
threshold = 85
df.query("score > @threshold") # use local variable with @pythonยทIndex operations
# Set a column as index
df = df.set_index("name")
df.reset_index() # move index back to column
# Rename index / columns
df.index.name = "person"
df.columns = ["col1", "col2"]
df.rename(columns={"age": "years", "score": "pts"}, inplace=True)
# MultiIndex
arrays = [["bar", "bar", "baz"], ["one", "two", "one"]]
idx = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])
df.loc["bar"] # select first level
df.loc[("bar", "one")] # select both levels
df.xs("one", level="second") # cross-sectionData Cleaning
Handle missing values, duplicates, type casting, and string cleaning
pythonยทMissing values
df.isnull().sum() # count NaN per column
df.isnull().sum() / len(df) # fraction missing
# Drop
df.dropna() # drop rows with any NaN
df.dropna(how="all") # drop rows where ALL values are NaN
df.dropna(subset=["age"]) # drop rows where 'age' is NaN
df.dropna(axis=1) # drop columns with any NaN
# Fill
df.fillna(0)
df.fillna({"age": 0, "score": df["score"].mean()})
df["score"].fillna(df["score"].median(), inplace=True)
df.ffill() # forward fill
df.bfill() # backward fill
# Interpolate
df["score"].interpolate(method="linear")pythonยทDuplicates & type casting
# Duplicates
df.duplicated().sum()
df.duplicated(subset=["name"]).sum()
df.drop_duplicates()
df.drop_duplicates(subset=["name"], keep="first")
# Type casting
df["age"] = df["age"].astype(int)
df["score"] = df["score"].astype(float)
df["date"] = pd.to_datetime(df["date"])
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["category"] = df["category"].astype("category")
# Check types
df.dtypes
df["age"].dtypepythonยทString cleaning (str accessor)
s = df["name"]
s.str.lower()
s.str.upper()
s.str.strip() # trim whitespace
s.str.replace(r"\s+", "_", regex=True)
s.str.contains("ali", case=False) # boolean mask
s.str.startswith("A")
s.str.split(",", expand=True) # split into multiple columns
s.str.extract(r"(\d+)") # capture group โ column
s.str.len() # length of each string
s.str.get(0) # first characterTransforming Data
apply, map, assign, cut, clip, sort, and column operations
pythonยทapply, map & assign
# apply โ row or column-wise function
df["score_norm"] = df["score"].apply(lambda x: x / 100)
df.apply(lambda col: col.max() - col.min()) # column-wise
df.apply(lambda row: row["score"] * 2, axis=1) # row-wise
# map โ element-wise on Series (rename/encode)
df["grade"] = df["score"].map(lambda x: "A" if x >= 90 else "B")
df["name"] = df["name"].map({"Alice": "A", "Bob": "B"})
# assign โ add new columns (chainable)
df = (df
.assign(score_pct=df["score"] / 100)
.assign(pass_fail=lambda d: d["score"].ge(60).map({True: "Pass", False: "Fail"}))
)
# applymap / map (DataFrame element-wise, pandas 2.1+)
df.map(lambda x: round(x, 2) if isinstance(x, float) else x)pythonยทBinning, clipping & sorting
# Bin continuous values
df["grade"] = pd.cut(df["score"], bins=[0, 60, 80, 100], labels=["C", "B", "A"])
df["quartile"] = pd.qcut(df["score"], q=4, labels=["Q1","Q2","Q3","Q4"])
# Clip outliers
df["score"] = df["score"].clip(lower=0, upper=100)
# Sort
df.sort_values("score")
df.sort_values("score", ascending=False)
df.sort_values(["age", "score"], ascending=[True, False])
df.sort_index()
# Rank
df["rank"] = df["score"].rank(method="dense", ascending=False)
# Replace values
df["score"].replace({-1: np.nan, 999: np.nan})GroupBy & Aggregation
groupby, agg, transform, filter, pivot_table, and crosstab
pythonยทgroupby basics
g = df.groupby("category")
g["score"].mean()
g["score"].agg(["mean", "std", "count"])
g.size() # group sizes
g.ngroups # number of groups
# Multiple columns
df.groupby(["category", "region"])["revenue"].sum()
# Named aggregations (pandas 0.25+)
df.groupby("category").agg(
avg_score=("score", "mean"),
total=("score", "sum"),
n=("score", "count"),
)
# Apply custom function
df.groupby("category")["score"].agg(lambda s: s.quantile(0.9))pythonยทtransform & filter
# transform โ returns same-shape result (great for new columns)
df["group_mean"] = df.groupby("category")["score"].transform("mean")
df["z_score"] = df.groupby("category")["score"].transform(
lambda s: (s - s.mean()) / s.std()
)
# filter โ keep only groups matching a condition
df.groupby("category").filter(lambda g: g["score"].mean() > 80)pythonยทpivot_table & crosstab
# pivot_table
pd.pivot_table(
df,
values="score",
index="category",
columns="region",
aggfunc="mean",
fill_value=0,
margins=True, # row/col totals
)
# crosstab โ frequency table
pd.crosstab(df["category"], df["region"])
pd.crosstab(df["category"], df["region"], normalize="index") # row %
# pivot (no aggregation โ requires unique index/col pairs)
df.pivot(index="date", columns="metric", values="value")Merging & Joining
merge, join, concat, and combine DataFrames
pythonยทmerge (SQL-style joins)
# Inner join (default) pd.merge(df_left, df_right, on="id") # Join types pd.merge(df_left, df_right, on="id", how="left") pd.merge(df_left, df_right, on="id", how="right") pd.merge(df_left, df_right, on="id", how="outer") pd.merge(df_left, df_right, on="id", how="cross") # Different column names pd.merge(df_left, df_right, left_on="user_id", right_on="id") # On index pd.merge(df_left, df_right, left_index=True, right_index=True) # Indicator column โ shows which table each row came from pd.merge(df_left, df_right, on="id", how="outer", indicator=True)
pythonยทconcat & join
# Stack DataFrames vertically pd.concat([df1, df2]) pd.concat([df1, df2], ignore_index=True) # reset index pd.concat([df1, df2], keys=["2023", "2024"]) # hierarchical index # Stack DataFrames horizontally pd.concat([df1, df2], axis=1) # join โ merge on index (shortcut) df_left.join(df_right) df_left.join(df_right, how="outer") df_left.join(df_right, on="key") # join on column in left # combine_first โ fill NaN from another DataFrame df1.combine_first(df2)
Reshaping
melt, stack, unstack, wide_to_long, and explode
pythonยทmelt & wide_to_long
# melt โ wide โ long (unpivot)
pd.melt(df, id_vars=["name"], value_vars=["q1", "q2", "q3"],
var_name="quarter", value_name="revenue")
# wide_to_long โ multiple variable groups
pd.wide_to_long(df, stubnames=["score", "rank"],
i="id", j="year")pythonยทstack, unstack & explode
# stack โ move columns to rows
stacked = df.stack() # columns โ innermost row index level
df.unstack() # innermost row level โ columns
df.unstack(level=0) # specific level
# explode โ list values into separate rows
df["tags"] = df["tags"].str.split(",")
df.explode("tags")
# Get dummies โ one-hot encode
pd.get_dummies(df["category"])
pd.get_dummies(df, columns=["category", "region"])
pd.get_dummies(df["category"], drop_first=True) # avoid dummy trapTime Series
date parsing, resampling, rolling windows, and time zone handling
pythonยทDates & dt accessor
# Parse dates
df["date"] = pd.to_datetime(df["date"])
df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y")
# DatetimeIndex
df = df.set_index("date")
df["2024"] # all of 2024
df["2024-03"] # March 2024
df["2024-01-01":"2024-03-31"] # date range slice
# dt accessor
df["date"].dt.year
df["date"].dt.month
df["date"].dt.day
df["date"].dt.dayofweek # 0=Monday
df["date"].dt.is_month_end
df["date"].dt.strftime("%b %Y")pythonยทResample & rolling
# Resample โ aggregate by time period
df["revenue"].resample("ME").sum() # month-end
df["revenue"].resample("QE").mean() # quarter-end
df["revenue"].resample("W").agg({"revenue": "sum", "visits": "mean"})
# Upsampling + fill
df.resample("D").ffill()
# Rolling window
df["revenue"].rolling(window=7).mean() # 7-day moving average
df["revenue"].rolling(window=30).std()
df["revenue"].rolling("7D").sum() # time-based window
# Expanding window (cumulative)
df["revenue"].expanding().mean()
df["revenue"].cumsum()
df["revenue"].cumprod()pythonยทTime zones & offsets
# Localize and convert
df.index = df.index.tz_localize("UTC")
df.index = df.index.tz_convert("America/New_York")
# Date offsets
from pandas.tseries.offsets import BDay, MonthEnd
pd.Timestamp("2024-01-01") + BDay(5) # 5 business days
pd.Timestamp("2024-01-15") + MonthEnd() # next month end
# Period
p = pd.Period("2024-Q1", freq="Q")
p.start_time
p.end_time
# Business days range
pd.bdate_range("2024-01-01", "2024-01-31")I/O โ Reading & Writing
CSV, Excel, JSON, Parquet, SQL, and clipboard
pythonยทCSV & Excel
# CSV
df = pd.read_csv("data.csv")
df = pd.read_csv("data.csv",
sep=";",
header=0,
index_col="id",
usecols=["name", "age", "score"],
dtype={"age": int},
parse_dates=["date"],
na_values=["N/A", "NULL", "-"],
nrows=1000, # read only first 1000 rows
skiprows=[1, 2],
)
df.to_csv("out.csv", index=False)
# Excel
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df.to_excel("out.xlsx", sheet_name="Results", index=False)
# Multiple sheets
with pd.ExcelWriter("out.xlsx") as w:
df1.to_excel(w, sheet_name="Summary")
df2.to_excel(w, sheet_name="Detail")pythonยทJSON, Parquet & SQL
# JSON
df = pd.read_json("data.json")
df = pd.read_json("data.json", orient="records")
df.to_json("out.json", orient="records", indent=2)
# Parquet (requires pyarrow or fastparquet)
df = pd.read_parquet("data.parquet")
df = pd.read_parquet("data.parquet", columns=["name", "score"])
df.to_parquet("out.parquet", compression="snappy", index=False)
# SQL
import sqlalchemy as sa
engine = sa.create_engine("postgresql://user:pass@host/db")
df = pd.read_sql("SELECT * FROM users WHERE active = true", engine)
df = pd.read_sql_table("users", engine)
df.to_sql("users_backup", engine, if_exists="replace", index=False)
# Chunked reading (large files)
for chunk in pd.read_csv("big.csv", chunksize=10_000):
process(chunk)Performance & Useful Patterns
Efficient operations, method chaining, vectorisation, and memory tips
pythonยทMethod chaining
result = (
pd.read_csv("sales.csv", parse_dates=["date"])
.rename(columns=str.lower)
.dropna(subset=["revenue"])
.query("revenue > 0")
.assign(
month=lambda d: d["date"].dt.to_period("M"),
revenue_k=lambda d: d["revenue"] / 1000,
)
.groupby("month")
.agg(total=("revenue_k", "sum"), n=("revenue_k", "count"))
.reset_index()
.sort_values("month")
)pythonยทVectorisation over loops
# Bad โ Python loop
for i, row in df.iterrows():
df.at[i, "tax"] = row["revenue"] * 0.2 # very slow
# Good โ vectorised
df["tax"] = df["revenue"] * 0.2
# Good โ np.where for conditionals
df["label"] = np.where(df["score"] >= 60, "Pass", "Fail")
# Good โ np.select for multiple conditions
conditions = [df["score"] >= 90, df["score"] >= 70, df["score"] >= 50]
choices = ["A", "B", "C"]
df["grade"] = np.select(conditions, choices, default="D")
# itertuples is faster than iterrows if you must loop
for row in df.itertuples():
print(row.name, row.score)pythonยทMemory & type optimisation
# Downcast numeric types
df["age"] = pd.to_numeric(df["age"], downcast="integer")
df["score"] = pd.to_numeric(df["score"], downcast="float")
# Use categorical for low-cardinality strings
df["status"] = df["status"].astype("category")
# Check memory
df.memory_usage(deep=True).sum() / 1024**2 # MB
# Copy vs view
df2 = df.copy() # explicit copy, avoids SettingWithCopyWarning
df2 = df[["name"]].copy()
# Useful diagnostics
pd.set_option("display.max_columns", 50)
pd.set_option("display.float_format", "{:.2f}".format)
pd.reset_option("all")