This is the third installment in a series of blog posts where we reproduce plots from Claus Wilke’s book, Fundamentals of Data Visualization.
This notebook demonstrates how to recreate the single distribution histograms and density plots found in the visualizing distributions chapter of the book. We will use the Bokeh `quad()` and `patch()` glyphs to recreate the histograms and density plots.
from bokeh.io import output_notebook, export_png
# Configure Bokeh to render plots inline in the notebook.
output_notebook()
The plots in this sub-section represent the age distribution of the Titanic passengers.
The `quad()` glyph is used to create the histograms.
# import the relevant library
import pandas as pd
# Read the Titanic passenger table from disk.
file = "../data/csv_files/titanic_all.csv"
df = pd.read_csv(file)

# Keep only the passengers whose age is recorded (drop missing values).
ages = df["age"].dropna()

# Display the resulting series as the cell output.
ages
0 29.00 1 2.00 2 30.00 3 25.00 4 0.92 ... 1308 27.00 1309 26.00 1310 22.00 1311 24.00 1312 29.00 Name: age, Length: 756, dtype: float64
from bokeh.plotting import figure, show
import numpy as np
# create a function to plot a histogram
def plot_hist(df, step, title):
    """
    Create a histogram plot using Bokeh.

    Values are counted in bins of width `step` whose edges are
    ``np.arange(0, 80, step)``; values that fall outside those edges are
    not counted by ``np.histogram``.

    Args:
        df (array-like): The data to be plotted as a histogram. Must be
            non-empty and contain only real numbers (Python or NumPy
            integers and floats).
        step (float): The width of each bin in the histogram. Must be positive.
        title (str): The title of the plot.

    Returns:
        The Bokeh figure object representing the histogram plot.

    Raises:
        ValueError: If the input data is empty or contains invalid values,
            or if `step` is not a positive value.
    """
    # data validation
    if len(df) == 0:
        raise ValueError("Input data is empty.")

    # NumPy integer scalars (e.g. np.int64) are not subclasses of Python's
    # int, so they must be listed explicitly or integer arrays are rejected.
    numeric_types = (int, float, np.integer, np.floating)
    if not all(isinstance(val, numeric_types) for val in df):
        raise ValueError(
            "Input data contains invalid values. Expected numerical values."
        )

    # A zero step makes np.arange fail and a negative step yields no bins,
    # so reject both up front with a clear message.
    if step <= 0:
        raise ValueError("Step must be a positive value.")

    # function implementation
    bins = np.arange(0, 80, step)  # create bins
    hist, edges = np.histogram(df, bins=bins)  # create histogram and bin edges

    # create figure object
    p = figure(
        title=title,  # plot title
        height=300,  # plot height
        width=400,  # plot width
        toolbar_location=None,  # remove toolbars
        x_axis_label="age (years)",
        y_axis_label="count",
    )

    # create histogram plot: one rectangle per bin
    p.quad(
        top=hist,  # frequency value of each bin
        bottom=0,  # start of rectangle on y-axis
        left=edges[:-1],  # left edges of bin
        right=edges[1:],  # right edges of bin
        fill_color="#52A9EB",  # rectangle color
        line_color="white",  # line color of rectangle
    )

    # plot customization
    # customise x-axis
    p.x_range.start = 0
    p.xaxis.ticker = [0, 20, 40, 60]
    p.xgrid.grid_line_color = None
    p.xaxis.axis_line_color = None
    p.xaxis.major_tick_line_color = "gray"
    p.xaxis.major_tick_out = 2

    # customise y-axis
    p.y_range.start = 0
    p.yaxis.minor_tick_out = 0
    p.yaxis.axis_line_color = None
    p.yaxis.major_tick_line_color = "gray"
    p.yaxis.major_tick_out = 0
    p.yaxis.major_tick_in = 0

    return p
# draw one age histogram with 5-year-wide bins and display it
single = plot_hist(ages, step=5, title="figure 7.1")
show(single)
from bokeh.layouts import gridplot
# build the same histogram at four bin widths (1, 3, 5 and 15 years),
# labelled "a" through "d"
one, three, five, fifteen = (
    plot_hist(ages, step, label)
    for step, label in [(1, "a"), (3, "b"), (5, "c"), (15, "d")]
)

# arrange the four figures in a 2x2 grid and display it
layout = gridplot([[one, three], [five, fifteen]])
show(layout)

# write the grid to disk as a png file
export_png(layout, filename="../images/grid_histogram.png");
Unable to obtain driver using Selenium Manager: C:\Users\SimpcyClassy\anaconda3\lib\site-packages\selenium\webdriver\common\windows\selenium-manager.exe is missing. Please open an issue on https://github.com/SeleniumHQ/selenium/issues
For more information on the `quad()` glyph, check our reference guide here.
The plots in this sub-section represent the kernel density estimate of the age distribution of passengers on the Titanic.
The `patch()` glyph is used to create the density plots.
We use the same data as the histogram plot.
# raw age observations as a NumPy array; the density estimate is fitted to these
values = ages.to_numpy()
# 1000 evenly spaced ages from -10 to 80 at which the density is evaluated
positions = np.linspace(start=-10, stop=80, num=1000)
from sklearn.neighbors import KernelDensity
# create a function to plot a Kernel density estimate
def plot_kde(kernel, bandwidth, title):
    """
    Build a Bokeh figure showing a kernel density estimate (KDE).

    The estimate is fitted to the module-level `values` array and evaluated
    at the module-level `positions` grid.

    Args:
        kernel (str): Name of the kernel used for the density estimate.
        bandwidth (float): Bandwidth of the kernel; must be greater than zero.
        title (str): The title of the plot.

    Returns:
        The Bokeh figure object representing the KDE plot.

    Raises:
        ValueError: If `kernel` is not one of the supported kernels or
            `bandwidth` is not positive.
    """
    # argument checks
    allowed = [
        "gaussian",
        "tophat",
        "epanechnikov",
        "exponential",
        "linear",
        "cosine",
    ]
    if kernel not in allowed:
        raise ValueError(
            f"Invalid kernel: '{kernel}'. Supported kernels are: {allowed}"
        )
    if bandwidth <= 0:
        raise ValueError("Bandwidth must be a positive value.")

    # fit the estimator; the data is passed as a single-column 2-D array
    estimator = KernelDensity(kernel=kernel, bandwidth=bandwidth)
    estimator.fit(values[:, np.newaxis])

    # score_samples returns log-densities, so exponentiate to get densities
    density = np.exp(estimator.score_samples(positions[:, np.newaxis]))

    fig = figure(
        title=title,
        height=300,
        width=600,
        x_axis_label="age (years)",
        y_axis_label="density",
    )

    # filled area under the density curve
    fig.patch(
        positions,
        density,
        fill_alpha=0.9,
        fill_color="#52A9EB",
        line_color="black",
    )

    # both axes start at zero
    fig.x_range.start = 0
    fig.y_range.start = 0

    # fixed tick positions
    fig.xaxis.ticker = [0, 20, 40, 60]
    fig.yaxis.ticker = [0, 0.01, 0.02, 0.03, 0.04]

    # x-axis styling: no grid or axis line, short gray ticks
    fig.xgrid.grid_line_color = None
    fig.xaxis.axis_line_color = None
    fig.xaxis.major_tick_line_color = "gray"
    fig.xaxis.major_tick_out = 2

    # y-axis styling: no axis line, tick marks reduced to zero length
    fig.yaxis.axis_line_color = None
    fig.yaxis.major_tick_line_color = "gray"
    fig.yaxis.minor_tick_out = 0
    fig.yaxis.major_tick_out = 0
    fig.yaxis.major_tick_in = 0

    return fig
# draw one density plot (Gaussian kernel, bandwidth 2) and display it
single = plot_kde(kernel="gaussian", bandwidth=2, title="figure 7.3")
show(single)
# build four density plots: Gaussian kernels with bandwidths 0.5, 2 and 5,
# plus a tophat kernel with bandwidth 2, labelled "a" through "d"
half, two, five, rect = (
    plot_kde(kernel, bandwidth, label)
    for kernel, bandwidth, label in [
        ("gaussian", 0.5, "a"),
        ("gaussian", 2, "b"),
        ("gaussian", 5, "c"),
        ("tophat", 2, "d"),
    ]
)

# arrange the four figures in a 2x2 grid and display it
layout = gridplot([[half, two], [five, rect]])
show(layout)

# write the grid to disk as a png file
export_png(layout, filename="../images/grid_density.png");
For more information on the `patch()` glyph, check our reference guide here.