This is the second installment in a series of blog posts where we reproduce plots from Claus Wilke’s book, Fundamentals of Data Visualization.
This page demonstrates how to recreate the dot plots and heatmaps found in the Visualizing amounts
chapter of the book. We will use the Bokeh scatter() and rect()
glyphs to create the dot plots and heatmaps.
# configure Bokeh to display its output inline in the notebook
from bokeh.io import output_notebook
output_notebook() # render plots inline in notebook
The plots in this sub-section represent the life expectancies of countries in the Americas, for the year 2007.
The scatter()
glyph is used to create the dot plots.
# import the relevant libraries
import pandas as pd
# load the life-expectancy dataset; it must provide a "country" column and a
# "2007" column, which are the only two used below
file = "../data/csv_files/life_expectancy.csv"
df = pd.read_csv(file)

# select only the relevant columns
df = df.loc[:, ["country", "2007"]]

# countries of the Americas to include in the plots.
# isin() below only keeps exact matches and silently ignores any name that is
# missing from the "country" column, so the spelling here has to agree with
# the CSV.
# NOTE(review): presumably the CSV spells "El Salvador" with a space, like the
# other multi-word names in this tuple -- confirm against the data file.
americas = (
    "Argentina",
    "Bolivia",
    "Brazil",
    "Canada",
    "Chile",
    "Colombia",
    "Costa Rica",
    "Cuba",
    "Dominican Republic",
    "Ecuador",
    "El Salvador",
    "Guatemala",
    "Haiti",
    "Honduras",
    "Jamaica",
    "Mexico",
    "Nicaragua",
    "Panama",
    "Paraguay",
    "Peru",
    "Puerto Rico",
    "Trinidad and Tobago",
    "United States",
    "Uruguay",
    "Venezuela",
)

# create a new dataframe consisting of only countries in the Americas
df = df[df["country"].isin(americas)].reset_index(drop=True)

# rename the "2007" column to "years" and round it to whole years
df = df.rename(columns={"2007": "years"})
df["years"] = df["years"].round()
df
country | years | |
---|---|---|
0 | Argentina | 75.0 |
1 | Bolivia | 70.0 |
2 | Brazil | 73.0 |
3 | Canada | 81.0 |
4 | Chile | 78.0 |
5 | Colombia | 77.0 |
6 | Costa Rica | 80.0 |
7 | Cuba | 78.0 |
8 | Dominican Republic | 73.0 |
9 | Ecuador | 74.0 |
10 | Guatemala | 70.0 |
11 | Honduras | 71.0 |
12 | Haiti | 60.0 |
13 | Jamaica | 76.0 |
14 | Mexico | 76.0 |
15 | Nicaragua | 74.0 |
16 | Panama | 78.0 |
17 | Peru | 78.0 |
18 | Paraguay | 76.0 |
19 | Trinidad and Tobago | 73.0 |
20 | Uruguay | 76.0 |
21 | United States | 78.0 |
22 | Venezuela | 75.0 |
from bokeh.plotting import figure, show
# plot the countries in alphabetical order
# (the dataframe is sorted on "country" in descending order to get that
# ordering on the categorical y-axis)
df = df.sort_values(by="country", ascending=False)

# figure with one categorical row per country
p = figure(
    title="Figure 6.13 Life expectancy",
    x_axis_label="life expectancy (years)",
    y_range=df["country"],  # country names become the y-axis categories
    height=400,
    sizing_mode="stretch_width",  # width follows the available screen width
)

# dot plot: one marker of size 8 per country, placed at its "years" value
p.scatter(x="years", y="country", size=8, source=df)

# plot customization: hide both axis lines and the x-axis minor ticks
p.yaxis.axis_line_color = None
p.xaxis.axis_line_color = None
p.xaxis.minor_tick_out = 0

show(p)  # display plot
# plot the countries ranked by life expectancy, in descending order
# (the dataframe itself is sorted on "years" in ascending order)
df = df.sort_values(by="years", ascending=True)

# figure with one categorical row per country, in the sorted order
p = figure(
    title="Figure 6.11 Life expectancy",
    x_axis_label="life expectancy (years)",
    y_range=df["country"],
    height=400,
    sizing_mode="stretch_width",
)

# dot plot: one marker of size 8 per country
p.scatter(x="years", y="country", size=8, source=df)

# plot customization: hide both axis lines and the x-axis minor ticks
p.yaxis.axis_line_color = None
p.xaxis.axis_line_color = None
p.xaxis.minor_tick_out = 0

show(p)  # display plot
You can further customise your scatter()
plot by using additional parameters such as:
alpha
color
legend_field
For more information on the scatter()
glyph, check our user guide here.
The plot in this sub-section represents Internet adoption over time for selected countries.
The rect()
glyph is used to create the heatmap.
# load the internet-usage dataset, decoding the file as ISO-8859-1
file = "../data/csv_files/Internet_user.csv"
df = pd.read_csv(file, encoding="ISO-8859-1")

# countries to show in the heatmap
countries = (
    "Iceland",
    "Norway",
    "United Kingdom",
    "Japan",
    "Canada",
    "Germany",
    "New Zealand",
    "France",
    "Israel",
    "United States",
    "Argentina",
    "Chile",
    "Italy",
    "Brazil",
    "Mexico",
    "South Africa",
    "China",
    "Algeria",
    "India",
    "Kenya",
)

# keep only the rows for the selected countries and replace missing values
# with 0
df = df[df["country"].isin(countries)]
df = df.reset_index(drop=True)
df = df.fillna(0)

# discard the descriptive columns that are not plotted
df = df.drop(columns=["country_code", "indicator", "indicator_code"])

# reshape from wide to long: stack the remaining columns so that every
# (country, column) pair becomes one row holding its "percentage" value
df = df.set_index("country").stack().to_frame(name="percentage")

# turn the index levels back into columns; the stacked column labels arrive
# as "level_1" and are renamed to "year"
df = df.reset_index().rename(columns={"level_1": "year"}).fillna(0)

# convert "year" column to integer type
df["year"] = df["year"].astype(int)
df
country | year | percentage | |
---|---|---|---|
0 | Argentina | 1990 | 0.000000 |
1 | Argentina | 1991 | 0.000000 |
2 | Argentina | 1992 | 0.002993 |
3 | Argentina | 1993 | 0.029527 |
4 | Argentina | 1994 | 0.043706 |
... | ... | ... | ... |
535 | South Africa | 2012 | 41.000000 |
536 | South Africa | 2013 | 46.500000 |
537 | South Africa | 2014 | 49.000000 |
538 | South Africa | 2015 | 51.919116 |
539 | South Africa | 2016 | 54.000000 |
540 rows × 3 columns
# import relevant libraries
from bokeh.transform import transform
from bokeh.models import ColorBar, LinearColorMapper, FixedTicker
# plot heatmap
# create figure object
p = figure(
    title="Figure 6.15 Internet adoption over time",  # plot title
    height=400,  # plot height
    toolbar_location=None,  # remove toolbars
    y_axis_location="right",  # display y axis on the right of plot
    y_range=countries[::-1],  # categorical range of y-axis in reverse order
)

# create color mapper object spanning the observed range of "percentage";
# Series.min()/max() compute this in one vectorized pass instead of iterating
# the column element by element with the builtins
mapper = LinearColorMapper(
    palette="Magma256",
    low=df["percentage"].min(),
    high=df["percentage"].max(),
)

# create rectangle glyph: one rectangle per (year, country) row of df
# NOTE(review): the "year" values are 1 apart, so width=2 makes neighbouring
# rectangles overlap -- confirm this is intended before changing it.
p.rect(
    x="year",  # x-axis column name
    y="country",  # y-axis column name
    width=2,  # rectangle width
    height=1,  # rectangle height
    source=df,  # data source for x and y axis columns
    # map percentage values to color mapper object using transform
    fill_color=transform("percentage", mapper),
    line_color="white",  # rectangle line color
)

# plot customization
# configure x-axis ticks to show only specified tick labels
p.xaxis.ticker = [1995, 2000, 2005, 2010, 2015]

# start and end x-axis at the specified years
p.x_range.start = 1993
p.x_range.end = 2016

# remove x-axis major ticks
p.xaxis.major_tick_line_color = None
p.xaxis.major_tick_out = 0

# remove y-axis lines and ticks
p.yaxis.minor_tick_out = 0
p.yaxis.major_tick_out = 0
p.yaxis.major_tick_line_color = None
p.yaxis.axis_line_color = None

# create color bar object that shares the mapper used for the rectangles
color_bar = ColorBar(
    color_mapper=mapper,
    location=(0, 0),
    ticker=FixedTicker(ticks=[0, 25, 50, 75, 100]),  # fixed tick positions
    title="internet users / 100 people",
    title_text_font_style="normal",
    major_tick_line_color=None,
    width=300,
    height=20,
)

# add color bar above the plot
p.add_layout(color_bar, "above")

show(p)  # display plot
The transform
function is used to apply the color to the rectangles using the fill_color
parameter. It takes a column name and a transform object (here, the color mapper) and applies that transform to the values in the column. For more information about transform
, visit our reference section here.
For more information on the rect()
glyph, check our user guide here.