This is the sixth installment in a series of blog posts where we reproduce plots from Claus Wilke’s book, Fundamentals of Data Visualization.
This notebook demonstrates how to recreate the scatter plots, correlogram, and slopegraph found in the “visualizing associations” chapter of the book.
We will use the `scatter()` and `segment()` glyphs to recreate the plots.
from bokeh.io import output_notebook
import pandas as pd
import numpy as np
output_notebook() # render plots inline in the notebook
The plots in this sub-section represent the head length versus body mass for 123 blue jays. The birds' sex is indicated by color, and the birds' skull size by the circle size.
file = "../data/csv_files/blue_jays.csv"
df = pd.read_csv(file)
# Add a "skull_size" column used as the circle size in the plot. The raw
# "Skull" measurement is rescaled as 1.5 * 2 ** (Skull / 10), computed as a
# single vectorized expression on the column rather than a Python-level loop.
df["skull_size"] = 1.5 * 2 ** (df["Skull"] / 10)
df.head()
BirdID | KnownSex | BillDepth | BillWidth | BillLength | Head | Mass | Skull | Sex | skull_size | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0000-00000 | M | 8.26 | 9.21 | 25.92 | 56.58 | 73.30 | 30.66 | 1 | 12.561723 |
1 | 1142-05901 | M | 8.54 | 8.76 | 24.99 | 56.36 | 75.10 | 31.38 | 1 | 13.204543 |
2 | 1142-05905 | M | 8.39 | 8.78 | 26.07 | 57.32 | 70.25 | 31.25 | 1 | 13.086093 |
3 | 1142-05907 | F | 7.78 | 9.30 | 23.48 | 53.77 | 65.50 | 30.29 | 0 | 12.243656 |
4 | 1142-05909 | M | 8.71 | 9.84 | 25.47 | 57.32 | 74.90 | 31.85 | 1 | 13.641804 |
from bokeh.plotting import figure, show
def plot_scatter(df, fill_color=None, title=None, yaxis=False):
    """
    Create a head-length vs. body-mass scatter plot using Bokeh.

    Each row of ``df`` is drawn as one marker positioned by the "Mass" (x)
    and "Head" (y) columns and sized by the "skull_size" column.

    Parameters:
        df (pandas.DataFrame): Data to plot; must provide the "Mass",
            "Head" and "skull_size" columns.
        fill_color (str, optional): Color used to fill the markers.
        title (str, optional): Title text for the plot.
        yaxis (bool, optional): Whether to display the y-axis or not.

    Returns:
        bokeh.plotting.figure: The configured scatter plot figure.
    """
    p = figure(
        title=title,
        height=400,
        width=400,
        y_range=(50, 62),
        x_axis_label="body mass (g)",
        y_axis_label="head length (mm)",
        toolbar_location=None,
    )
    p.scatter(
        x="Mass",  # column name for x-axis of plot
        y="Head",  # column name for y-axis of plot
        size="skull_size",  # column name to use for circle size
        source=df,  # data source
        fill_color=fill_color,
        line_color="white",
    )

    # Fixed y-axis ticks; the axis itself is shown only when requested.
    p.yaxis.ticker = [52, 54, 56, 58, 60]
    p.yaxis.visible = yaxis

    # With the default ``title=None`` the figure may carry no title object
    # (``p.title`` can be None), so only style the title when one exists.
    if p.title is not None:
        p.title.text_font_size = "12px"
        p.title.align = "center"
        p.title.background_fill_color = "darkgrey"
        p.title.background_fill_alpha = 0.4
    return p
from bokeh.layouts import gridplot
# One dataframe per sex ("Sex" is 0 for female birds and 1 for male birds).
female = df.loc[df["Sex"] == 0]
male = df.loc[df["Sex"] == 1]
# Draw one panel per sex; only the left-hand (female) panel shows its y-axis.
f = plot_scatter(female, fill_color="#DF7307", title="female birds", yaxis=True)
m = plot_scatter(male, fill_color="#2769AB", title="male birds")
# Display both panels side by side in a single-row grid layout.
panels = [[f, m]]
layout = gridplot(panels, sizing_mode="scale_both")
show(layout)
The same data from the previous sub-section is used here. In this sub-section, each individual plot shows two data dimensions plotted against each other.
from itertools import product
from bokeh.io import show
from bokeh.layouts import gridplot
from bokeh.models import (
BasicTicker,
Scatter,
ColumnDataSource,
DataRange1d,
Grid,
LinearAxis,
Plot,
SaveTool,
)
from bokeh.transform import factor_cmap
# Distinct KnownSex values; they are the factors of the color map below.
sex = df.KnownSex.unique()
VARIABLES = ("Head", "Mass", "Skull")
LABELS = ("head length (mm)", "body mass (g)", "skull size (mm)")
N = len(VARIABLES)

source = ColumnDataSource(data=df)

# One range object per grid column (x) and per grid row (y); every panel in
# the same column / row is given the same range object.
xdrs = [DataRange1d(bounds=None) for _ in range(N)]
ydrs = [DataRange1d(bounds=None) for _ in range(N)]

plots = []
for i, (y, x) in enumerate(product(VARIABLES, VARIABLES)):
    # Panels are generated row by row: y is fixed along a row, x varies.
    row, col = divmod(i, N)

    p = Plot(
        x_range=xdrs[col],
        y_range=ydrs[row],
        background_fill_color="white",
        border_fill_color="white",
        width=200,
        height=200,
    )

    if col == 0:
        # First column: widen the panel and add a labelled y-axis whose
        # ticker is reused for the grid lines of the y dimension.
        p.min_border_left = p.min_border + 4
        p.width += 40
        yaxis = LinearAxis(axis_label=LABELS[row])
        yaxis.major_label_orientation = "vertical"
        p.add_layout(yaxis, "left")
        y_ticker = yaxis.ticker
    else:
        # No axis in this panel: the grid gets a ticker of its own.
        y_ticker = BasicTicker()
    p.add_layout(Grid(dimension=1, ticker=y_ticker))

    if row == N - 1:
        # Last row: heighten the panel and add a labelled x-axis whose
        # ticker is reused for the grid lines of the x dimension.
        p.min_border_bottom = p.min_border + 40
        p.height += 40
        xaxis = LinearAxis(axis_label=LABELS[col])
        p.add_layout(xaxis, "below")
        x_ticker = xaxis.ticker
    else:
        x_ticker = BasicTicker()
    p.add_layout(Grid(dimension=0, ticker=x_ticker))

    # Markers colored by the bird's known sex.
    glyph = Scatter(
        x=x,
        y=y,
        size=6,
        line_color="white",
        fill_color=factor_cmap("KnownSex", ["#2769AB", "#DF7307"], sex),
    )
    renderer = p.add_glyph(source, glyph)

    # Register the renderer with both ranges of this panel.
    p.x_range.renderers.append(renderer)
    p.y_range.renderers.append(renderer)

    # Remove the minor ticks on both axes of the panel.
    p.yaxis.minor_tick_out = 0
    p.xaxis.minor_tick_out = 0

    p.add_tools(SaveTool())
    plots.append(p)

show(gridplot(plots, ncols=N))
The plots in this sub-section represent the correlations in mineral content for 214 samples of glass fragments obtained during forensic work.
The dataset contains seven variables measuring the amounts of magnesium (Mg), calcium (Ca), iron (Fe), potassium (K), sodium (Na), aluminum (Al), and barium (Ba) found in each glass fragment.
The magnitude of each correlation is also encoded in the size of the colored circles.
file = "../data/csv_files/forensic_glass.csv"
df = pd.read_csv(file)

# Column pairs to correlate, written as "<column 1>-<column 2>".
pairs = [
    "Mg-Ba",
    "Mg-Al",
    "Mg-Na",
    "Mg-K",
    "Mg-Fe",
    "Mg-Ca",
    "Ca-Ba",
    "Ca-Al",
    "Ca-Na",
    "Ca-K",
    "Ca-Fe",
    "Fe-Ba",
    "Fe-Al",
    "Fe-Na",
    "Fe-K",
    "K-Ba",
    "K-Al",
    "K-Na",
    "Na-Ba",
    "Na-Al",
    "Al-Ba",
]
# Split every pair name into its two column names.
pair_split = [pair.split("-") for pair in pairs]

# Correlation coefficient of each pair: the off-diagonal entry of the matrix
# returned by np.corrcoef for the two columns.
correlations = [
    np.corrcoef(df[first], df[second])[0, 1] for first, second in pair_split
]

# One row per pair; the dot size grows linearly with the magnitude of the
# correlation, regardless of its sign.
new_df = pd.DataFrame(
    {
        "oxide_1": [first for first, _ in pair_split],
        "oxide_2": [second for _, second in pair_split],
        "correlation": correlations,
        "dot_size": [120 * abs(value) for value in correlations],
    }
)
new_df.head()
oxide_1 | oxide_2 | correlation | dot_size | |
---|---|---|---|---|
0 | Mg | Ba | -0.492262 | 59.071454 |
1 | Mg | Al | -0.481799 | 57.815821 |
2 | Mg | Na | -0.273732 | 32.847835 |
3 | Mg | K | 0.005396 | 0.647480 |
4 | Mg | Fe | 0.083060 | 9.967143 |
from bokeh.transform import transform
from bokeh.models import ColorBar, LinearColorMapper, FixedTicker
# Categorical axis factors taken from the pair table; the y factors are
# listed in reverse order of their first appearance.
x_range = new_df["oxide_1"].unique()
y_range = list(new_df["oxide_2"].unique()[::-1])

# Color mapper: the palette is spread linearly over correlations in
# [-0.5, 0.5].
colors = [
    "#8B4513",
    "#D8AF85",
    "#CD853F",
    "#E2CAB7",
    "#DEB887",
    "#A4D2D2",
    "#8ABDBD",
    "#ADD8E6",
    "#009999",
    "#188A8A",
]
mapper = LinearColorMapper(palette=colors, low=-0.5, high=0.5)

p = figure(
    title="Figure 12.7",
    x_range=x_range,
    y_range=y_range,
    x_axis_location="above",
    sizing_mode="scale_both",
    toolbar_location=None,
)

# One dot per pair: sized by |correlation|, colored by the signed correlation.
p.scatter(
    x="oxide_1",
    y="oxide_2",
    size="dot_size",
    source=new_df,
    fill_color=transform("correlation", mapper),
    line_color=None,
)

# Color bar legend placed below the plot.
color_bar = ColorBar(
    color_mapper=mapper,
    location=(200, 0),
    ticker=FixedTicker(ticks=[-0.5, 0.0, 0.5]),
    title="correlation",
    title_text_align="center",
    title_text_font_style="normal",
    major_tick_line_color=None,
    width=150,
    height=20,
)
p.add_layout(color_bar, "below")

# Strip ticks and axis lines from both axes, leaving only the labels.
for axis in (p.xaxis, p.yaxis):
    axis.major_tick_line_color = None
    axis.major_tick_out = 0
    axis.axis_line_color = None

# No grid lines and no plot outline.
p.grid.grid_line_color = None
p.outline_line_color = None
show(p)
The plot in this sub-section represents the carbon dioxide emissions per person in 2000 and 2010, for the ten countries with the largest difference between these two years.
file = "../data/csv_files/CO2_emissions.csv"
df = pd.read_csv(file)
countries = [
    "Trinidad and Tobago",
    "Qatar",
    "United Arab Emirates",
    "Oman",
    "Bahrain",
    "Singapore",
    "Netherlands Antilles",
    "Kazakhstan",
    "Equatorial Guinea",
    "Kuwait",
]

# Keep only the rows of the selected countries for the years 2000 and 2010.
wanted = df["country"].isin(countries) & df["year"].isin([2000.0, 2010.0])
new_df = df[wanted].reset_index(drop=True)

# Store the year as a string ("2000" / "2010") so that it can be used as a
# categorical value on the x-axis of the plot.
new_df["year"] = new_df["year"].astype(int).astype(str)

# Put the two years of each country side by side: after the merge, the 2000
# values carry the "_x" suffix and the 2010 values the "_y" suffix.
emissions_2000 = new_df[new_df["year"] == "2000"]
emissions_2010 = new_df[new_df["year"] == "2010"]
plot_df = emissions_2000.merge(emissions_2010, on="country")
plot_df.head()
country | year_x | emissions_x | year_y | emissions_y | |
---|---|---|---|---|---|
0 | Bahrain | 2000 | 29.209555 | 2010 | 19.178419 |
1 | Equatorial Guinea | 2000 | 0.873720 | 2010 | 6.679983 |
2 | Kazakhstan | 2000 | 8.541839 | 2010 | 15.518572 |
3 | Kuwait | 2000 | 28.429719 | 2010 | 34.233166 |
4 | Netherlands Antilles | 2000 | 31.227112 | 2010 | 23.550535 |
from bokeh.models import LabelSet, ColumnDataSource
source = ColumnDataSource(plot_df)

p = figure(
    x_range=("2000", "2010"),
    y_range=(0, 60),
    x_axis_location="above",
    y_axis_label="CO2 emissions (tons / person)",
    sizing_mode="scale_both",
    toolbar_location=None,
)

# One marker per country at each end of its slope: first the 2000 values,
# then the 2010 values.
endpoints = (("year_x", "emissions_x"), ("year_y", "emissions_y"))
for year_col, emissions_col in endpoints:
    p.scatter(x=year_col, y=emissions_col, source=source, size=7)

# One line segment per country, joining its 2000 and 2010 values.
p.segment(
    x0="year_x",
    y0="emissions_x",
    x1="year_y",
    y1="emissions_y",
    source=source,
    color="black",
)

# Country names, anchored at the 2010 end of each segment and nudged
# slightly right and down.
label = LabelSet(
    x="year_y",
    y="emissions_y",
    text="country",
    source=source,
    text_font_size="11px",
    x_offset=8,
    y_offset=-7,
)
p.add_layout(label)

# x-axis: keep only the year labels (no ticks, no axis line).
x_axes = p.xaxis
x_axes.major_tick_line_color = None
x_axes.major_tick_out = 0
x_axes.axis_line_color = None

# y-axis: four fixed ticks, no minor ticks, nothing drawn inside the frame.
y_axes = p.yaxis
y_axes.minor_tick_out = 0
y_axes.major_tick_in = 0
y_axes.ticker = [0, 20, 40, 60]

# No grid lines and no plot outline.
p.grid.grid_line_color = None
p.outline_line_color = None
show(p)