from bokeh.io import output_notebook
import pandas as pd
import numpy as np

output_notebook()  # render plots inline in the notebook


file = "../data/csv_files/blue_jays.csv"

df = pd.read_csv(file)

# Derive the marker size used by the bubble chart from the "Skull" column.
# The mapping 1.5 * 2 ** (skull / 10) is exponential, so differences between
# birds show up as clearly different circle sizes. It is computed on the
# whole column at once instead of looping over the rows in Python.
df["skull_size"] = 1.5 * (2 ** (df["Skull"] / 10))

df.head()


from bokeh.plotting import figure, show


def plot_scatter(df, fill_color=None, title=None, yaxis=False):
    """
    Create a bubble chart of head length against body mass using Bokeh.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the data to be plotted. The
        columns "Mass" (x), "Head" (y) and "skull_size" (marker size) are used.
    fill_color (str, optional): Color to fill the scatter points.
    title (str, optional): Title text for the plot. When no title is given,
        the title styling is skipped.
    yaxis (bool, optional): Whether to display the y-axis or not.

    Returns:
    bokeh.plotting.figure: Scatter plot figure.
    """
    p = figure(
        title=title,
        height=400,
        width=400,
        y_range=(50, 62),
        x_axis_label="body mass (g)",
        y_axis_label="head length (mm)",
        toolbar_location=None,
    )

    p.scatter(
        x="Mass",  # column name for x-axis of plot
        y="Head",  # column name for y-axis of plot
        size="skull_size",  # column name to use for circle size
        source=df,  # data source
        fill_color=fill_color,
        line_color="white",
    )

    # Customize the y-axis: fixed tick positions, optionally hidden
    p.yaxis.ticker = [52, 54, 56, 58, 60]
    p.yaxis.visible = yaxis

    # Customize the title. Passing title=None to figure() can leave the plot
    # without a Title object, so only style the title when one exists;
    # otherwise the attribute assignments below would raise AttributeError.
    if p.title is not None:
        p.title.text_font_size = "12px"
        p.title.align = "center"
        p.title.background_fill_color = "darkgrey"
        p.title.background_fill_alpha = 0.4

    return p


from bokeh.layouts import gridplot

# split the data by the "Sex" column (1 for the male frame, 0 for the female)
is_male = df["Sex"].eq(1)
is_female = df["Sex"].eq(0)
male = df[is_male]
female = df[is_female]

# build one bubble chart per sex; only the female chart shows its y-axis
female_plot = plot_scatter(female, "#DF7307", "female birds", yaxis=True)
male_plot = plot_scatter(male, "#2769AB", "male birds")

# display both plots side by side in a grid layout
layout = gridplot([[female_plot, male_plot]], sizing_mode="scale_both")
show(layout)


from itertools import product

from bokeh.io import show
from bokeh.layouts import gridplot
from bokeh.models import (
    BasicTicker,
    Scatter,
    ColumnDataSource,
    DataRange1d,
    Grid,
    LinearAxis,
    Plot,
    SaveTool,
)
from bokeh.transform import factor_cmap

sex = df.KnownSex.unique()
VARIABLES = ("Head", "Mass", "Skull")
LABELS = ("head length (mm)", "body mass (g)", "skull size (mm)")
N = len(VARIABLES)

# axis label to show for each plotted column
axis_labels = dict(zip(VARIABLES, LABELS))

source = ColumnDataSource(data=df)

# one shared x-range per grid column and one shared y-range per grid row
xdrs = [DataRange1d(bounds=None) for _ in range(N)]
ydrs = [DataRange1d(bounds=None) for _ in range(N)]

plots = []

# walk the N x N grid row by row; y varies with the row, x with the column
for i, (y, x) in enumerate(product(VARIABLES, VARIABLES)):
    row, col = divmod(i, N)
    in_first_column = col == 0
    in_last_row = row == N - 1

    # create Plot object bound to the ranges of its column and row
    p = Plot(
        x_range=xdrs[col],
        y_range=ydrs[row],
        background_fill_color="white",
        border_fill_color="white",
        width=200,
        height=200,
    )

    # only the first column carries a labelled y-axis; widen those plots so
    # the axis fits, and let the horizontal grid lines follow the axis ticks
    if in_first_column:
        p.min_border_left = p.min_border + 4
        p.width += 40
        y_axis = LinearAxis(axis_label=axis_labels[y])
        y_axis.major_label_orientation = "vertical"
        p.add_layout(y_axis, "left")
        y_ticker = y_axis.ticker
    else:
        y_ticker = BasicTicker()
    p.add_layout(Grid(dimension=1, ticker=y_ticker))

    # only the last row carries a labelled x-axis; make those plots taller so
    # the axis fits, and let the vertical grid lines follow the axis ticks
    if in_last_row:
        p.min_border_bottom = p.min_border + 40
        p.height += 40
        x_axis = LinearAxis(axis_label=axis_labels[x])
        p.add_layout(x_axis, "below")
        x_ticker = x_axis.ticker
    else:
        x_ticker = BasicTicker()
    p.add_layout(Grid(dimension=0, ticker=x_ticker))

    # scatter glyph colored by the "KnownSex" column
    glyph = Scatter(
        x=x,
        y=y,
        size=6,
        line_color="white",
        fill_color=factor_cmap("KnownSex", ["#2769AB", "#DF7307"], sex),
    )

    # add the glyph and register its renderer with both shared data ranges
    renderer = p.add_glyph(source, glyph)
    p.x_range.renderers.append(renderer)
    p.y_range.renderers.append(renderer)

    # remove minor ticks in y and x axis of each plot
    for axes in (p.yaxis, p.xaxis):
        axes.minor_tick_out = 0

    p.add_tools(SaveTool())

    plots.append(p)

show(gridplot(plots, ncols=N))


file = "../data/csv_files/forensic_glass.csv"

df = pd.read_csv(file)


# column pairs to correlate, written as "<first>-<second>" and grouped by
# the first column name
pairs = [
    "Mg-Ba", "Mg-Al", "Mg-Na", "Mg-K", "Mg-Fe", "Mg-Ca",
    "Ca-Ba", "Ca-Al", "Ca-Na", "Ca-K", "Ca-Fe",
    "Fe-Ba", "Fe-Al", "Fe-Na", "Fe-K",
    "K-Ba", "K-Al", "K-Na",
    "Na-Ba", "Na-Al",
    "Al-Ba",
]

# each entry becomes a two-item list: [first column, second column]
pair_split = [name.split("-") for name in pairs]

# np.corrcoef returns a 2x2 correlation matrix; the off-diagonal entry is the
# correlation coefficient between the two columns
correlations = [
    np.corrcoef(df[first], df[second])[0, 1] for first, second in pair_split
]

# one row per pair; the dot size grows with the absolute correlation
new_df = pd.DataFrame(
    {
        "oxide_1": [first for first, _ in pair_split],
        "oxide_2": [second for _, second in pair_split],
        "correlation": correlations,
        "dot_size": [abs(value) * 120 for value in correlations],
    }
)

new_df.head()


from bokeh.transform import transform
from bokeh.models import ColorBar, LinearColorMapper, FixedTicker


# categorical axis factors in order of first appearance; the y-axis order is
# reversed
x_range = new_df["oxide_1"].unique()
y_range = list(new_df["oxide_2"].unique()[::-1])

# palette for the color mapper, applied across correlations in [-0.5, 0.5]
colors = [
    "#8B4513",
    "#D8AF85",
    "#CD853F",
    "#E2CAB7",
    "#DEB887",
    "#A4D2D2",
    "#8ABDBD",
    "#ADD8E6",
    "#009999",
    "#188A8A",
]
mapper = LinearColorMapper(palette=colors, low=-0.5, high=0.5)

p = figure(
    title="Figure 12.7",
    sizing_mode="scale_both",
    x_axis_location="above",
    toolbar_location=None,
    x_range=x_range,
    y_range=y_range,
)

# one dot per pair: size from "dot_size", fill color from "correlation"
p.scatter(
    x="oxide_1",
    y="oxide_2",
    size="dot_size",
    source=new_df,
    fill_color=transform("correlation", mapper),
    line_color=None,
)

# color bar legend for the correlation values, placed below the plot
color_bar = ColorBar(
    color_mapper=mapper,
    location=(200, 0),
    ticker=FixedTicker(ticks=[-0.5, 0.0, 0.5]),
    title="correlation",
    title_text_align="center",
    title_text_font_style="normal",
    major_tick_line_color=None,
    width=150,
    height=20,
)
p.add_layout(color_bar, "below")

# strip tick marks and axis lines from both axes, keeping only the labels
p.axis.major_tick_line_color = None
p.axis.major_tick_out = 0
p.axis.axis_line_color = None

# no grid lines and no plot outline
p.grid.grid_line_color = None
p.outline_line_color = None

show(p)


file = "../data/csv_files/CO2_emissions.csv"
df = pd.read_csv(file)

countries = [
    "Trinidad and Tobago",
    "Qatar",
    "United Arab Emirates",
    "Oman",
    "Bahrain",
    "Singapore",
    "Netherlands Antilles",
    "Kazakhstan",
    "Equatorial Guinea",
    "Kuwait",
]

# keep only the rows for the selected countries in the two compared years
is_selected_country = df["country"].isin(countries)
is_selected_year = df["year"].isin([2000.0, 2010.0])
new_df = df[is_selected_country & is_selected_year].reset_index(drop=True)

# store the year as a string ("2000" / "2010") rather than a number
new_df["year"] = new_df["year"].astype(int).astype(str)


# put both years of a country on one row: columns from the 2000 rows get the
# "_x" suffix and columns from the 2010 rows get the "_y" suffix
rows_2000 = new_df[new_df["year"] == "2000"]
rows_2010 = new_df[new_df["year"] == "2010"]
plot_df = rows_2000.merge(rows_2010, on="country", suffixes=("_x", "_y"))

plot_df.head()


from bokeh.models import LabelSet, ColumnDataSource

source = ColumnDataSource(plot_df)

p = figure(
    y_range=(0, 60),
    x_range=("2000", "2010"),
    sizing_mode="scale_both",
    toolbar_location=None,
    x_axis_location="above",
    y_axis_label="CO2 emissions (tons / person)",
)

# mark both ends of every slope: first the "_x" columns, then the "_y" columns
endpoint_columns = (("year_x", "emissions_x"), ("year_y", "emissions_y"))
for year_column, emissions_column in endpoint_columns:
    p.scatter(x=year_column, y=emissions_column, source=source, size=7)

# connect the two points of each country with a line segment
p.segment(
    x0="year_x",
    y0="emissions_x",
    x1="year_y",
    y1="emissions_y",
    source=source,
    color="black",
)

# write the country name next to the "_y" end of each segment
country_labels = LabelSet(
    x="year_y",
    y="emissions_y",
    text="country",
    source=source,
    text_font_size="11px",
    x_offset=8,
    y_offset=-7,
)
p.add_layout(country_labels)

# x-axis: keep only the labels (no tick marks, no axis line)
p.xaxis.axis_line_color = None
p.xaxis.major_tick_line_color = None
p.xaxis.major_tick_out = 0

# y-axis: fixed major ticks, no minor ticks, no inward tick marks
p.yaxis.ticker = [0, 20, 40, 60]
p.yaxis.minor_tick_out = 0
p.yaxis.major_tick_in = 0

# no grid lines and no plot outline
p.grid.grid_line_color = None
p.outline_line_color = None

show(p)

	BirdID	KnownSex	BillDepth	BillWidth	BillLength	Head	Mass	Skull	Sex	skull_size
0	0000-00000	M	8.26	9.21	25.92	56.58	73.30	30.66	1	12.561723
1	1142-05901	M	8.54	8.76	24.99	56.36	75.10	31.38	1	13.204543
2	1142-05905	M	8.39	8.78	26.07	57.32	70.25	31.25	1	13.086093
3	1142-05907	F	7.78	9.30	23.48	53.77	65.50	30.29	0	12.243656
4	1142-05909	M	8.71	9.84	25.47	57.32	74.90	31.85	1	13.641804

Visualizing associations among variables

Bubble chart

Data preparation

Plotting

All-against-all scatter plot matrix

Data preparation

Plotting

Correlogram

Data preparation

Plotting

Slope graph

Data preparation

Plotting

	oxide_1	oxide_2	correlation	dot_size
0	Mg	Ba	-0.492262	59.071454
1	Mg	Al	-0.481799	57.815821
2	Mg	Na	-0.273732	32.847835
3	Mg	K	0.005396	0.647480
4	Mg	Fe	0.083060	9.967143

	country	year_x	emissions_x	year_y	emissions_y
0	Bahrain	2000	29.209555	2010	19.178419
1	Equatorial Guinea	2000	0.873720	2010	6.679983
2	Kazakhstan	2000	8.541839	2010	15.518572
3	Kuwait	2000	28.429719	2010	34.233166
4	Netherlands Antilles	2000	31.227112	2010	23.550535