This is the fifth installment in a series of blog posts where we reproduce plots from Claus Wilke’s book, Fundamentals of Data Visualization.
This notebook demonstrates how to recreate the boxplots and ridgeline plots found in the “visualizing many distributions at once” chapter of the book.
We will use the vbar(), scatter(), harea(), and patch() glyphs to recreate the plots.
from bokeh.io import output_notebook
import pandas as pd
output_notebook()  # configure Bokeh so that show() renders plots inline in the notebook
The plot in this sub-section represents the mean daily temperatures in Lincoln, Nebraska in 2016. The line in the middle of each box represents the median, and the box encloses the middle 50% of the data.
The top and bottom whiskers extend to the maximum and minimum values that fall within 1.5 times the height of the box; observations beyond the whiskers are drawn individually as outliers.
# Load the Lincoln weather data and derive the columns used by the plots.
file = "../data/csv_files/lincoln.csv"
df = pd.read_csv(file)
df = df.assign(
    DATE=pd.to_datetime(df["DATE"]),
    # daily average is taken as the midpoint of the daily max and min
    TAVG=(df["TMAX"] + df["TMIN"]) / 2,
)
# abbreviated month name ("Jan", "Feb", ...) is the grouping key for every plot
df = df.assign(MONTH=df["DATE"].dt.strftime("%b"))
df = df[["MONTH", "TMIN", "TMAX", "TAVG"]]

# Per-month quartiles of the average temperature, one row per month.
qs = (
    df.groupby("MONTH")["TAVG"]
    .quantile([0.25, 0.5, 0.75])
    .unstack()
    .reset_index()
    .rename(columns={0.25: "Q1", 0.5: "Q2", 0.75: "Q3"})
)

# Outlier fences sit 1.5 interquartile ranges beyond the box edges.
iqr = qs["Q3"] - qs["Q1"]
qs = qs.assign(
    upper=qs["Q3"] + 1.5 * iqr,
    lower=qs["Q1"] - 1.5 * iqr,
)

# Attach the monthly statistics to every daily row (left join keeps day order).
df = df.merge(qs, on="MONTH", how="left")
df.head()
| | MONTH | TMIN | TMAX | TAVG | Q1 | Q2 | Q3 | upper | lower |
---|---|---|---|---|---|---|---|---|---|
0 | Jan | 15.0 | 36.0 | 25.5 | 23.0 | 27.5 | 31.5 | 44.25 | 10.25 |
1 | Jan | 18.0 | 39.0 | 28.5 | 23.0 | 27.5 | 31.5 | 44.25 | 10.25 |
2 | Jan | 15.0 | 32.0 | 23.5 | 23.0 | 27.5 | 31.5 | 44.25 | 10.25 |
3 | Jan | 15.0 | 27.0 | 21.0 | 23.0 | 27.5 | 31.5 | 44.25 | 10.25 |
4 | Jan | 21.0 | 40.0 | 30.5 | 23.0 | 27.5 | 31.5 | 44.25 | 10.25 |
from bokeh.models import ColumnDataSource, Whisker
from bokeh.plotting import figure, show
# create figure object
p = figure(
    title="Figure 9.3",
    x_range=df.MONTH.unique(),
    toolbar_location=None,
    height=400,
    width=600,
    x_axis_label="month",
    y_axis_label="mean temperature (F)",
)

# The box statistics are identical for every day of a month, so keep a single
# row per month; otherwise each box and whisker is drawn once per daily row.
box_stats = df[["MONTH", "Q1", "Q2", "Q3"]].drop_duplicates()

# Whiskers end at the most extreme observations that lie inside the
# 1.5 * IQR fences ("lower"/"upper"), not at the fences themselves.
inside_fences = df[df.TAVG.between(df.lower, df.upper)]
whisker_ends = (
    inside_fences.groupby("MONTH")
    .TAVG.agg(["min", "max"])
    .rename(columns={"min": "whisker_lower", "max": "whisker_upper"})
    .reset_index()
)
box_stats = box_stats.merge(whisker_ends, on="MONTH", how="left")

# create column data source object from the per-month statistics
source = ColumnDataSource(box_stats)

# create whisker object and add it to figure
whisker = Whisker(
    base="MONTH",
    upper="whisker_upper",
    lower="whisker_lower",
    source=source,
)
whisker.upper_head.size = whisker.lower_head.size = 20
p.add_layout(whisker)

# create boxplot using two vbar() glyphs: the shared edge at Q2 draws the
# median line, and together the two bars span Q1..Q3
p.vbar(
    x="MONTH",
    top="Q2",
    bottom="Q1",
    width=0.8,
    source=source,
    color="#E0E0E0",
    line_color="black",
)
p.vbar(
    x="MONTH",
    top="Q3",
    bottom="Q2",
    width=0.8,
    source=source,
    color="#E0E0E0",
    line_color="black",
)

# plot outliers using scatter() glyph; rows with a missing TAVG are not
# observations, so they are excluded rather than counted as outliers
outliers = df[df.TAVG.notna() & ~df.TAVG.between(df.lower, df.upper)]
p.scatter("MONTH", "TAVG", source=outliers, size=5, color="black")

# customize plot
p.y_range.start = -10
p.yaxis.ticker = [0, 25, 50, 75]
p.grid.grid_line_color = None
show(p)
For more information about the scatter() glyph, you can visit the reference guide.
The plot in this sub-section represents the mean daily temperatures in Lincoln, Nebraska in 2016. We use the same file as the boxplot.
import numpy as np
from sklearn.neighbors import KernelDensity

# month labels in order of first appearance; they become the categorical x-axis
months = df.MONTH.unique().tolist()

# empty canvas for Figure 9.8 with one x-axis category per month
p = figure(
    title="Figure 9.8",
    x_axis_label="month",
    y_axis_label="mean temperature (F)",
    x_range=months,
    width=500,
    height=400,
    toolbar_location=None,
)
# create an offset for each category in the data
def offset(category, data, scale=7):
    """
    Pair a category label with scaled numeric offsets.

    Parameters:
    category: Label repeated in every returned pair (a month name in this file).
    data: Sequence of numbers (NumPy array, list, ...) to use as offsets.
    scale: Factor applied to every value in ``data`` (default 7).

    Returns:
    list: ``(category, scale * value)`` tuples, one per element of ``data``.
    In this file they are passed as x-coordinates to glyphs on the
    categorical month axis.
    """
    # Coerce to an array first: ``scale * data`` on a plain list would repeat
    # the list ``scale`` times instead of multiplying its values.
    scaled = scale * np.asarray(data)
    return [(category, value) for value in scaled]
for month in months:
    # daily average temperatures for this month, rows with missing values removed
    month_df = df[df.MONTH == month].dropna()
    tavg = month_df.TAVG.values
    tavg_column = tavg[:, np.newaxis]  # scikit-learn expects a 2-D sample array

    # fit a Gaussian KDE to the month and evaluate it on an even grid that
    # spans the observed temperature range
    kde = KernelDensity(kernel="gaussian", bandwidth=3).fit(tavg_column)
    temps = np.linspace(tavg.min(), tavg.max(), 50)
    density = np.exp(kde.score_samples(temps[:, np.newaxis]))

    # mirror the density around the month's position to form a symmetric shape
    p.harea(
        x1=offset(month, density),
        x2=offset(month, -density),
        y=temps,
        alpha=0.8,
        color="#E0E0E0",
    )

    # horizontal jitter is uniform in [-1, 1) scaled by the local density, so
    # every point lands inside the shaded shape
    tavg_density = np.exp(kde.score_samples(tavg_column))
    jitter = (2 * np.random.random(len(tavg)) - 1) * tavg_density

    # draw the individual daily averages on top of the shape
    p.scatter(x=offset(month, jitter), y=tavg, color="black")

# customize plot
p.grid.grid_line_color = None
p.yaxis.ticker = [0, 25, 50, 75]
p.y_range.start = -10
show(p)
For more information about the harea() glyph, you can visit the reference guide.
The plot in this sub-section represents voting patterns in the U.S. House of Representatives over the years. DW-NOMINATE scores are frequently used to compare voting patterns between parties and over time. Here, score distributions are shown for each Congress from 1963 to 2013, separately for Democrats and Republicans. Each Congress is represented by its first year (the year column computed below), and the scores themselves come from the dim_1 column in the dataframe.
The patch() glyph is used for the plot.
file = "../data/csv_files/dw_nominate_house.csv"
df = pd.read_csv(file)

# first year of each Congress: two years per Congress, counted from 1787
df["year"] = 1787 + 2 * df["congress"]

# keep only the relevant rows:
#   - year: 1962 or later (for whole-number congress values every year is odd,
#     so this is the same as "from 1963 onwards")
#   - parties: party_code 100 or 200 (split into dems/repubs below)
#   - dn: cd is not 0, 98 or 99 (presumably placeholder district codes --
#     confirm against the data dictionary)
year = df["year"] >= 1962
parties = df["party_code"].isin([100, 200])
dn = ~df["cd"].isin([0, 98, 99])
df = df.loc[year & parties & dn].reset_index(drop=True)

# one dataframe per political party
dems = df.loc[df["party_code"] == 100]
repubs = df.loc[df["party_code"] == 200]
from bokeh.models import ColumnDataSource
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.neighbors import KernelDensity
# create a column data source containing the KDE data for each year
def tweak_df(df):
    """
    Calculate Kernel Density Estimates (KDE) of "dim_1" for each year.

    The result holds a shared "scores" column (500 evenly spaced points from
    -1.5 to 1.5) plus one column per year, named by the year as a string. Each
    year column is a list of ``(year, density)`` tuples evaluated at "scores".
    Three placeholder columns ("1957", "1959", "1961") are added before the
    real years to create blank space in the plot.

    Parameters:
    df (pd.DataFrame): Input DataFrame with columns "year" and "dim_1".
        Rows whose "dim_1" is missing are ignored.

    Returns:
    bokeh.models.ColumnDataSource: A ColumnDataSource containing KDE data for each year.

    Raises:
    ValueError: If the input is not a DataFrame, lacks the required columns,
        or the KDE calculation fails. In the last case the original exception
        is chained as ``__cause__``.
    """
    # Validate input DataFrame
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    if "year" not in df.columns or "dim_1" not in df.columns:
        raise ValueError("Input DataFrame must have columns 'year' and 'dim_1'.")

    grouped = df.groupby("year").dim_1

    # create Column Data Source object to hold the KDE data
    scores = np.linspace(-1.5, 1.5, 500)
    blank = np.linspace(100, 200, 500)
    source = ColumnDataSource(data=dict(scores=scores))

    # add extra columns to create blank space in the KDE plot
    for blank_year in ("1957", "1959", "1961"):
        source.add(blank, blank_year)

    def ridge(category, data):
        # pair every density value with its year label
        return list(zip([category] * len(data), data))

    # Calculate KDE for each year and create plotting ridge
    try:
        for year, vote in grouped:
            year = str(year)
            # drop missing scores so they cannot poison the density estimate
            pdf = gaussian_kde(vote.dropna())
            source.add(ridge(year, pdf(scores)), year)
    except Exception as e:
        # keep the documented ValueError contract but preserve the root cause
        raise ValueError(f"Error occurred during KDE calculation: {e}") from e

    return source
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
def plot_ridges(df1, df2=None):
    """
    Plot ridges for one or two ColumnDataSources on a shared figure.

    Every column of ``df1`` except "scores" is treated as one category (year)
    and drawn as a patch whose x-values come from the "scores" column.
    Categories are placed on the y-axis in reverse column order.

    Parameters:
    df1: ColumnDataSource containing data for the first category
        (drawn in blue, legend "Democrats").
    df2 (optional): ColumnDataSource containing data for the second category
        (drawn in red, legend "Republicans"). Must have the same keys as df1.

    Returns:
    bokeh.plotting.figure: A Bokeh figure showing the ridges for both categories.

    Raises:
    ValueError: If df1 or df2 is not a ColumnDataSource, or if df2 does not
        have the same keys as df1.
    """
    # Input validation, done once and before any figure is built
    if not isinstance(df1, ColumnDataSource):
        raise ValueError(f"{df1} must be a valid ColumnDataSource")
    if df2 is not None:
        if not isinstance(df2, ColumnDataSource):
            raise ValueError(f"{df2} must be a valid ColumnDataSource")
        if set(df2.data.keys()) != set(df1.data.keys()):
            raise ValueError(f"{df2} must have the same keys as {df1}")

    # Get the list of categories from the data keys: everything except the
    # shared x column, in reverse column order
    years = [key for key in reversed(list(df1.data.keys())) if key != "scores"]

    # Create a figure object
    p = figure(
        title="Figure 9.12",
        x_axis_label="DW-NOMINATE score",
        y_axis_label="year",
        toolbar_location=None,
        x_range=(-0.75, 1.5),
        y_range=years,
        height=400,
        width=600,
    )

    # Plot ridges for each category
    for year in years:
        # Plot ridges for the first category
        p.patch(
            x="scores",
            y=year,
            source=df1,
            fill_color="blue",
            line_color="white",
            legend_label="Democrats",
            fill_alpha=0.5,
        )
        # Plot ridges for the second category if df2 is provided
        if df2 is not None:
            p.patch(
                x="scores",
                y=year,
                source=df2,
                fill_color="red",
                line_color="white",
                legend_label="Republicans",
                alpha=0.4,
            )

    # Customize plot
    p.xaxis.ticker = [-0.75, -0.5, -0.25, 0.00, 0.25, 0.5, 0.75, 1.00]
    p.ygrid.grid_line_color = None
    p.legend.orientation = "horizontal"
    p.legend.border_line_color = None
    return p
# build one KDE data source per party, then draw both on a single ridgeline figure
dem_cds = tweak_df(dems)
rep_cds = tweak_df(repubs)
ridgeplot = plot_ridges(df1=dem_cds, df2=rep_cds)
show(ridgeplot)