This is the fourth installment in a series of blog posts where we reproduce plots from Claus Wilke’s book, Fundamentals of Data Visualization.
This notebook demonstrates how to recreate the multiple-distribution histograms and density plots found in the “visualizing distributions” chapter of the book. We will use the varea() and hbar() glyphs to recreate the density plots and histograms.
from bokeh.io import output_notebook
# import the relevant libraries
import pandas as pd
# load the Titanic passenger data
file = "../data/csv_files/titanic_all.csv"
titanic = pd.read_csv(file)
# discard the columns that the plots below do not use
titanic = titanic.drop(columns=["name", "class", "survived"])
# split the passengers into one dataframe per sex
female = titanic.loc[titanic["sex"] == "female"]
male = titanic.loc[titanic["sex"] == "male"]
# ages with missing entries removed, as NumPy arrays for the density plots
f_values = female["age"].dropna().to_numpy()
m_values = male["age"].dropna().to_numpy()
t_values = titanic["age"].dropna().to_numpy()
from bokeh.plotting import figure, show
from sklearn.neighbors import KernelDensity
import numpy as np
# create function to plot the multiple density estimates
def plot_kde(data_dict, title, kernel="gaussian", bandwidth=2, line_color=None):
    """
    Create a density plot using Kernel Density Estimation (KDE) for multiple datasets.

    Parameters:
        data_dict (list of dicts): A list of dictionaries, where each dictionary contains the following keys:
            - 'data': The data to be plotted (a one-dimensional sequence of numbers).
            - 'color': The color of the filled area.
            - 'legend_label': The legend label for the dataset.
        title (str): The title of the plot.
        kernel (str, optional): The type of kernel to use in creating the plot. Default is gaussian.
        bandwidth (float, optional): The bandwidth of the KDE. Higher values result in smoother
            but less accurate density plots. Default is 2.
        line_color (str, optional): The color of the outline drawn along the top of each filled area.
            Default is None, which means no lines will be drawn.

    Returns:
        bokeh.plotting.figure: The Bokeh figure containing the density plot.
    """
    # evaluation grid; it starts below zero, but the x-axis is clipped at 0 further down
    positions = np.linspace(-10, 80, 1000)
    # score_samples is given a 2-D array with a single column
    grid = positions[:, np.newaxis]

    # create figure object
    p = figure(
        title=title,  # plot title
        height=300,  # plot height
        width=500,  # plot width
        toolbar_location=None,  # remove toolbars
        x_axis_label="age (years)",
        y_axis_label="scaled density",
    )

    # loop through each entry of data_dict and plot a density plot for each
    for info in data_dict:
        # accept any 1-D sequence (list, array, Series) and reshape it into the
        # single-column 2-D array that is passed to KernelDensity.fit
        data = np.asarray(info["data"], dtype=float).reshape(-1, 1)
        color = info["color"]
        legend_label = info["legend_label"]

        # fit the kernel density estimator to the data
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(data)
        # score_samples returns log-densities, so exponentiate to get densities
        density = np.exp(kde.score_samples(grid))
        # scale the density estimation to correspond to the number of data values
        scaled_density = density * len(data)

        p.varea(
            x=positions,  # x-axis coordinates
            y1=scaled_density,  # y-axis coordinates of one side of the area
            y2=0,  # y-axis coordinates of the other side of the area
            fill_alpha=0.9,  # transparency of the filled area
            fill_color=color,  # color of the filled area
            legend_label=legend_label,
        )

        # varea only fills the area, so the requested outline is drawn as a
        # separate line glyph along the top of the area
        if line_color is not None:
            p.line(x=positions, y=scaled_density, line_color=line_color)

    # customize the x-axis
    p.x_range.start = 0
    p.xaxis.ticker = [0, 20, 40, 60]
    p.xgrid.grid_line_color = None
    p.xaxis.axis_line_color = None
    p.xaxis.major_tick_line_color = "gray"
    p.xaxis.major_tick_out = 2

    # customize the y-axis
    p.y_range.start = 0
    p.yaxis.minor_tick_out = 0
    p.yaxis.axis_line_color = None
    p.yaxis.major_tick_line_color = "gray"
    p.yaxis.major_tick_out = 0
    p.yaxis.major_tick_in = 0

    p.legend.location = "top_right"
    return p
# Figure 7.8: overlay the male and female age densities in one plot
data_dict = [
    dict(data=m_values, color="#5BA4DB", legend_label="male"),
    dict(data=f_values, color="#D0771E", legend_label="female"),
]
single = plot_kde(data_dict, title="Figure 7.8", line_color="black")
show(single)
# generate two multiple density plots and display them in a grid
from bokeh.layouts import gridplot
# each panel overlays one sex on top of the distribution of all passengers
male_data = [
    {"data": t_values, "color": "#D5D4D3", "legend_label": "all passengers"},
    {"data": m_values, "color": "#055BB2", "legend_label": "male"},
]
female_data = [
    {"data": t_values, "color": "#D5D4D3", "legend_label": "all passengers"},
    {"data": f_values, "color": "#CB6805", "legend_label": "female"},
]
# use dedicated names for the figures so that the `male` / `female`
# dataframes created earlier are not overwritten by plot objects
male_plot = plot_kde(male_data, "Figure 7.9 male passengers")
female_plot = plot_kde(female_data, "Figure 7.9 female passengers")
# place the two figures side by side
layout = gridplot([male_plot, female_plot], ncols=2)
show(layout)
The plot in this sub-section represents the butterfat percentage in the milk of four cattle breeds.
The varea() glyph is also used to create the density plots.
# load the butterfat measurements
file = "../data/csv_files/cows.csv"
df = pd.read_csv(file)
# one dataframe per cattle breed
jersey = df.loc[df["breed"] == "Jersey"]
holstein = df.loc[df["breed"] == "Holstein-Friesian"]
guernsey = df.loc[df["breed"] == "Guernsey"]
ayrshire = df.loc[df["breed"] == "Ayrshire"]
# butterfat values of each breed as NumPy arrays
j_values = jersey["butterfat"].to_numpy()
h_values = holstein["butterfat"].to_numpy()
g_values = guernsey["butterfat"].to_numpy()
a_values = ayrshire["butterfat"].to_numpy()
# x-coordinates at which the density estimates are evaluated
positions = np.linspace(2, 8, num=1000)
from bokeh.models import Label, CustomJSTickFormatter
# gather the per-breed plotting inputs (one row per breed) in a pandas DataFrame
data_dict = {
    "values": [a_values, g_values, h_values, j_values],
    "bandwidths": [0.125, 0.25, 0.1, 0.3],
    "colors": ["#409DFA", "#AC5703", "#9E5205", "green"],
    "labels": ["Ayrshire", "Guernsey", "Holstein-Friesian", "Jersey"],
}
df = pd.DataFrame(data_dict)

# set up the figure
p = figure(
    title="figure 7.11",
    height=300,
    width=600,
    x_axis_label="butterfat contents",
    y_axis_label="density",
)

# the evaluation grid as a single-column 2-D array, built once for all breeds
grid = positions.reshape(-1, 1)

# estimate and draw one density area per breed
for data, bandwidth, color, label in zip(
    df["values"], df["bandwidths"], df["colors"], df["labels"]
):
    kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth)
    kde.fit(data.reshape(-1, 1))
    # score_samples returns log-densities; exponentiate once and reuse the result
    log_dens = kde.score_samples(grid)
    density = np.exp(log_dens)

    p.varea(x=positions, y1=density, y2=0, fill_alpha=0.3, fill_color=color)

    # put the breed name next to the peak of its density curve
    max_idx = np.argmax(density)
    highest_point_label = Label(
        x=positions[max_idx],
        y=density[max_idx],
        text=label,
        text_font_size="10pt",
        x_offset=20,
        y_offset=-5,
        text_color=color,
    )
    p.add_layout(highest_point_label)

# show the listed x-axis ticks as percentages and leave every other tick unlabeled;
# the Python dict is embedded in the JavaScript code as an object literal
x_axis_labels = {3: "3%", 4: "4%", 5: "5%", 6: "6%", 7: "7%"}
p.xaxis.formatter = CustomJSTickFormatter(
    code=f"return tick in {x_axis_labels} ? {x_axis_labels}[tick] : '';"
)

# x-axis styling
p.x_range.start = 3
p.xgrid.grid_line_color = None
p.xaxis.axis_line_color = None
p.xaxis.major_tick_line_color = "gray"
p.xaxis.major_tick_out = 2
p.xaxis.minor_tick_out = 0

# y-axis styling
p.yaxis.ticker = [0, 0.5, 1, 1.5]
p.y_range.start = 0
p.yaxis.minor_tick_out = 0
p.yaxis.axis_line_color = None
p.yaxis.major_tick_line_color = "gray"
p.yaxis.major_tick_out = 0
p.yaxis.major_tick_in = 0

show(p)
For more information about the varea() glyph, read our reference guide here.
The plot in this sub-section represents the age distributions of male and female Titanic passengers.
The hbar() glyph is used to create the histogram plot.
We will use the same Titanic data from the density plots.
# rebuild the per-sex dataframes from the Titanic data
female = titanic.loc[titanic["sex"] == "female"]
male = titanic.loc[titanic["sex"] == "male"]
# ages with missing entries removed
m_age = male["age"].dropna()
f_age = female["age"].dropna()
# count passengers per 3-year age bin; both histograms share the same bin edges
bins = np.arange(0, 80, 3)
m_hist, edges = np.histogram(m_age, bins=bins)
f_hist = np.histogram(f_age, bins=bins)[0]
from bokeh.models import Label, CustomJSTickFormatter
# create figure object
p = figure(
    title="Figure 7.10",  # plot title
    height=400,  # plot height
    width=600,  # plot width
    x_range=(-60, 40),  # range of x-axis values to display
    toolbar_location=None,  # remove toolbars
    x_axis_label="count",
    y_axis_label="age (years)",
)

# hbar's `y` is the vertical centre of each bar, so the bars are placed at the
# midpoints of the age bins; using the upper bin edges would draw every bar
# half a bin width too high on the age axis
bin_centers = (edges[:-1] + edges[1:]) / 2

# plot male histogram; the counts are negated so the bars extend to the left of zero
p.hbar(
    right=m_hist * -1,  # right endpoints of bars
    y=bin_centers,  # y-axis values (bar centres)
    height=2,  # bar height
    color="#055BB2",
)

# plot female histogram; the bars extend to the right of zero
p.hbar(
    right=f_hist,
    y=bin_centers,
    height=2,
    color="#CB6805",
)

# customise x-axis and y-axis
p.xaxis.ticker = [-40, -20, 0, 20, 40]
p.yaxis.ticker = [0, 20, 40, 60]
# start the age axis at 0 so the lowest bar is shown in full
p.y_range.start = 0
# create custom formatter function to make all tick labels positive
def positive_labels():
    """Return the JavaScript snippet that yields the absolute value of a tick."""
    js_code = "return Math.abs(tick);"
    return js_code
# show the x-axis tick labels as absolute values, since the male counts are drawn as negatives
p.xaxis.formatter = CustomJSTickFormatter(code=positive_labels())

# name the two halves of the plot
m_label = Label(x=-40, y=70, text="male", text_font_size="15pt", x_offset=5)
f_label = Label(x=20, y=70, text="female", text_font_size="15pt", x_offset=5)
for annotation in (m_label, f_label):
    p.add_layout(annotation)

show(p)
For more information about the hbar() glyph, read our reference guide here.