This is the first installment in a series of blog posts where we reproduce plots from Claus Wilke’s book, Fundamentals of Data Visualization.
This page demonstrates how to recreate the horizontal, vertical, grouped, and stacked bar plots found in the "Visualising amounts" chapter of the book. We will use the Bokeh vbar(), hbar(), and vbar_stack() glyph methods to create the bar plots.
# import the relevant libraries
import pandas as pd
from bokeh.io import output_notebook
These plots represent the highest grossing movies for the weekend of December 22-24, 2017.
The vbar() and hbar() glyphs are used to create vertical and horizontal bar plots, respectively.
file = "../data/csv_files/movies.csv"
df = pd.read_csv(file)

# keep only the part of each title that precedes the first colon
df["Title"] = [title.split(":")[0] for title in df["Title"]]

# take the digits that follow the "$" sign and rescale them to millions of
# USD so the axis tick labels stay short (the resulting values are floats)
df["Weekend gross"] = [
    int(amount.split("$")[1]) / 1_000_000 for amount in df["Weekend gross"]
]
df
| | Rank | Title | Weekend gross |
---|---|---|---|
0 | 1 | Star Wars | 71.565498 |
1 | 2 | Jumanji | 36.169328 |
2 | 3 | Pitch Perfect 3 | 19.928525 |
3 | 4 | The Greatest Showman | 8.805843 |
4 | 5 | Ferdinand | 7.316746 |
from bokeh.plotting import figure, show
# vertical bar chart: one bar per movie title
p = figure(
    title="Figure 6.1 movie gross",
    y_axis_label="weekend gross (million USD)",
    x_range=df["Title"],  # the titles become the categorical x-axis factors
    width=600,
    height=300,
)

# column names below are resolved against the DataFrame passed as `source`
p.vbar(
    source=df,
    x="Title",  # bar position on the categorical axis
    top="Weekend gross",  # bar height
    width=0.7,
    color="#66B2FF",
)

# bars should grow from a zero baseline
p.y_range.start = 0

show(p)
# horizontal bar chart: sort ascending by gross first, since the sorted
# titles are used as the categorical y-axis factors
df = df.sort_values("Weekend gross")

p = figure(
    title="Figure 6.3 weekend gross",
    x_axis_label="weekend gross (million USD)",
    y_range=df["Title"],
    height=300,
    sizing_mode="stretch_width",  # width follows the available screen width
)

# column names below are resolved against the DataFrame passed as `source`
p.hbar(
    source=df,
    y="Title",  # bar position on the categorical axis
    right="Weekend gross",  # right end of each bar
    height=0.9,
    color="#66B2FF",
)

# bars should grow from a zero baseline
p.x_range.start = 0

show(p)
Other optional parameters you can use to further customise the plots include:
- fill_color
- alpha
- line_width
file = "../data/csv_files/income_by_age.csv"
df = pd.read_csv(file)

# Build a wide table with one row per age group and one column per race.
age_group = (
    df.sort_values(["race", "age"])
    .reset_index(drop=True)
    # positional rows 7-34 of the sorted table; per the original note these
    # are the rows for the individual races
    .iloc[7:35, :]
    .reset_index(drop=True)
    # total median_income for every (age, race) pair
    .groupby(["age", "race"])[["median_income"]]
    .sum()
    # move "race" from the row index into the columns
    .unstack()
    .reset_index()
    # drop the outer column level; this leaves the age column with an
    # empty label, which is then renamed to "age"
    .droplevel(level=0, axis=1)
    .rename(columns={"": "age"})
)
age_group
race | age | asian | black | hispanic | white |
---|---|---|---|---|---|
0 | 15 to 24 | 45809 | 30267 | 45080 | 44588 |
1 | 25 to 34 | 80098 | 39176 | 45876 | 65389 |
2 | 35 to 44 | 100443 | 49336 | 50245 | 78093 |
3 | 45 to 54 | 98925 | 50103 | 58103 | 82289 |
4 | 55 to 64 | 91193 | 40363 | 51996 | 69387 |
5 | 65 to 74 | 56646 | 28697 | 36704 | 52219 |
6 | > 74 | 26487 | 22302 | 23797 | 32203 |
from bokeh.models import NumeralTickFormatter as NTF
from bokeh.palettes import Blues5
from bokeh.transform import dodge
# plot grouped bars: within every age group, one bar per race
p = figure(
    title="Figure 6.7 median income by age group",
    height=350,
    sizing_mode="stretch_width",
    x_range=age_group.age,
)

bar_width = 0.2

# (column in age_group, legend label), listed left to right within a group
series = [
    ("asian", "Asian"),
    ("white", "White"),
    ("hispanic", "Hispanic"),
    ("black", "Black"),
]
# horizontal shift of each bar relative to the centre of its age group
offsets = (-0.3, -0.1, 0.1, 0.3)

# zip stops at the shortest input, so only the first four palette colors of
# Blues5 are used
for (column, label), offset, color in zip(series, offsets, Blues5):
    p.vbar(
        x=dodge("age", offset, range=p.x_range),  # dodge creates the bar offset
        top=column,
        source=age_group,
        width=bar_width,
        color=color,
        legend_label=label,
    )

# plot customization
# remove x-axis line, grid, and tick marks
p.xaxis.axis_line_color = None
p.xaxis.major_tick_line_color = None
p.xaxis.major_tick_out = 0
p.xgrid.grid_line_color = None

# remove y-axis line and ticks
p.yaxis.minor_tick_out = 0
p.yaxis.axis_line_color = None
p.yaxis.major_tick_out = 0

# start and end points of the y-axis; the end leaves headroom above the
# tallest bar (asian, 35 to 44: 100,443), which an end of 100,000 would clip
p.y_range.start = 0
p.y_range.end = 110_000

# format y-axis labels with a $ sign and thousands separators
p.yaxis.formatter = NTF(format="$0,0")

show(p)
The dodge() function offsets categorical coordinates along a given dimension (the x-axis here) so that the bars within each group sit side by side instead of overlapping.
def _race_frame(rows, label):
    """Return the given positional rows of df without 'year'/'race', tagged with a name."""
    frame = df.iloc[rows, :].drop(columns=["year", "race"])
    # the label is stored as a plain attribute on the frame and read back
    # later when the plot title is built
    frame.name = label
    return frame


# one frame per race, selected by positional row ranges of the income table
asian = _race_frame(slice(21, 28), "Asian")
black = _race_frame(slice(28, 35), "Black")
hispanic = _race_frame(slice(35, None), "Hispanic")
white = _race_frame(slice(7, 14), "White")

# order in which the individual plots are drawn
races = (asian, white, hispanic, black)
import pandas as pd
from bokeh.plotting import figure
from bokeh.models import FactorRange
from bokeh.models.formatters import NumeralTickFormatter as NTF
# create a function to plot individual bar plots
def plot_bars(df: pd.DataFrame) -> figure:
    """
    Create a Bokeh bar chart of median income by age group.

    The input DataFrame is left untouched; plotting works on a private copy.

    Parameters:
        df (pd.DataFrame): The pandas DataFrame containing the data.
            It should have the following columns:
            - age: String or numeric values representing the age groups.
              Values are converted to strings and used as the categorical
              x-axis factors, in order of first appearance.
            - median_income: Numeric values representing the median income
              for each age group.
            If the DataFrame carries a ``name`` attribute (or a ``"name"``
            entry in ``df.attrs``), it is appended to the plot title.

    Returns:
        figure: A Bokeh figure object representing the bar chart.

    Raises:
        ValueError: If the required columns (age and median_income) are not
            present in the DataFrame.
        TypeError: If the median_income column is not numeric.

    Example:
        df = pd.DataFrame({'age': ['18-24', '25-34', '35-44'], 'median_income': [50000, 60000, 70000]})
        plot = plot_bars(df)
        show(plot)
    """
    # Data validation
    if "age" not in df.columns or "median_income" not in df.columns:
        raise ValueError("The DataFrame must have 'age' and 'median_income' columns.")
    if not pd.api.types.is_numeric_dtype(df["median_income"]):
        raise TypeError("The 'median_income' column must contain numeric values.")

    # Read the label before copying: ad-hoc attributes such as ``name`` are
    # not carried over by DataFrame.copy(). A plain DataFrame without a name
    # falls back to df.attrs instead of raising AttributeError.
    label = getattr(df, "name", None)
    if label is None:
        label = df.attrs.get("name", "")

    # Work on a copy so the caller's DataFrame is never modified.
    data = df.copy()
    # A categorical axis needs string factors, so numeric ages are converted.
    data["age"] = data["age"].astype(str)
    # unique() keeps first-appearance order, which fixes the bar order.
    factors = data["age"].unique().tolist()

    # Function implementation
    p = figure(
        title=f"Figure 6.9: {label}",
        height=300,
        width=400,
        x_range=FactorRange(factors=factors),
        toolbar_location=None,
    )
    p.vbar(x="age", top="median_income", color="#99CCFF", source=data, width=0.9)

    # x-axis: no grid lines, no outward ticks
    p.xgrid.grid_line_color = None
    p.xaxis.major_tick_out = 0
    p.xaxis.axis_label = "age (years)"

    # y-axis: dollar-formatted labels, no outward ticks
    p.yaxis.formatter = NTF(format="$0,0")
    p.yaxis.axis_label = "median income (USD)"
    p.yaxis.minor_tick_out = 0
    p.yaxis.major_tick_out = 0

    # fixed y-range, so every plot made by this function shares one scale
    p.y_range.start = 0
    p.y_range.end = 110_000
    return p
from bokeh.layouts import gridplot
# build one bar chart per race
plots = [plot_bars(race) for race in races]
# ncols=2 wraps the flat list into rows of two, however many plots there
# are; slicing the first and last two is only correct for exactly four
layout = gridplot(plots, ncols=2)
show(layout)
file = "../data/csv_files/titanic_all.csv"
df = pd.read_csv(file)

# passenger counts per class, with one column per sex
sex_counts = df.groupby("class")["sex"].value_counts().unstack()
# discard the "*" class row, then turn the class index back into a column
by_class = sex_counts.drop(index="*").reset_index()

# relabel the remaining classes to match the labels used in the book
by_class["class"] = ["1st class", "2nd class", "3rd class"]
by_class
sex | class | female | male |
---|---|---|---|
0 | 1st class | 143.0 | 179.0 |
1 | 2nd class | 107.0 | 172.0 |
2 | 3rd class | 212.0 | 499.0 |
# stacked bar chart: male and female passenger counts per class
p = figure(
    x_range=by_class["class"],
    title="Figure 6.10 titanic passengers",
    width=500,
    height=300,
    toolbar_location=None,
)

# stack the "male" and "female" columns, in that order, for each class
p.vbar_stack(
    ["male", "female"],
    x="class",
    source=by_class,
    width=0.9,
    line_width=2.5,  # wide white outline visually separates the bars
    line_color="white",
    color=["#0000CC", "#CC6600"],  # one color per stacked column
    legend_label=["male passengers", "female passengers"],
)

# Count labels. Both are anchored at the "male" value of each class and
# differ only in the column they print and in their vertical offset.
label_style = dict(
    x="class",
    y="male",
    source=by_class,
    text_color="#FFFFFF",
    text_align="center",  # centre the text on the bar
)
p.text(text="male", y_offset=40, **label_style)
p.text(text="female", y_offset=-10, **label_style)

# plot customization
# x-axis: hide the axis line and the outward ticks
p.xaxis.axis_line_color = None
p.xaxis.axis_line_width = 0
p.xaxis.major_tick_out = 0

# y-axis: start at zero and hide the axis entirely
p.y_range.start = 0
p.yaxis.visible = False

# no grid lines and no frame around the plot
p.grid.grid_line_color = None
p.outline_line_color = None

# legend placement and orientation
p.legend.location = "top_left"
p.legend.orientation = "vertical"

show(p)